示例#1
0
    def findContentForBlockId(self, blockid, filename):
        """Return the concatenated text of one TextBlock in an ALTO file.

        Any ';' characters are stripped from ``blockid`` before it is used
        in the lookup.  The XML handler of the most recently used file is
        kept on ``self.xmlfilehandler`` (keyed by ``self.xmlfileinuse``) so
        repeated lookups in the same file reuse the same handler.

        Every ``String`` element below the block contributes its ``CONTENT``
        attribute followed by one space, except elements whose ``SUBS_TYPE``
        is ``"HypPart1"``: those get no trailing space, so they are glued
        directly to whatever follows.  The result is not stripped.

        Raises IndexError when no TextBlock carries the requested ID.
        """
        cleanId = blockid.replace(';', '')

        # Refresh the cached handler only when a different file is requested.
        if filename != self.xmlfileinuse:
            freshHandler = xmlHandler(inputXmlFile=filename,
                                      rootNodeName="alto")
            self.xmlfileinuse = filename
            self.xmlfilehandler = freshHandler
        handler = self.xmlfilehandler

        blockPath = "Layout/Page/PrintSpace/TextBlock[@ID='%s']" % cleanId
        textBlock = handler.findAllNodes(blockPath)[0]

        pieces = []
        for stringNode in handler.findInSub(textBlock, ".//String"):
            word = stringNode.attrib["CONTENT"]
            if stringNode.attrib.get("SUBS_TYPE") == "HypPart1":
                pieces.append(word)
            else:
                pieces.append(word + " ")
        return "".join(pieces)
示例#2
0
    def __init__(self):
        """Create the collaborators, read the local monitor port, start the server.

        The database, config and XML handler objects are stored on the
        instance; ``self.port`` is taken from the config's
        ``getLocalMonitorPort()``.  ``startServer()`` is invoked last, once
        every attribute is in place.
        """
        self.database = database()

        settings = config()
        self.config = settings
        self.port = settings.getLocalMonitorPort()

        self.xmlHandler = xmlHandler()

        self.startServer()
示例#3
0
 def FindAllTextBlocksInBook(self, filen, urn):
     """Collect every TextBlock ID in an ALTO file and hand them on.

     Opens ``filen`` with a new XML handler (stored on ``self.handler``),
     gathers the ``ID`` attribute of each ``Layout/Page/PrintSpace/TextBlock``
     node in the order the handler returns them, and passes ``urn`` together
     with that list to ``self.findBookTextInTextBlocks``.
     """
     self.handler = xmlHandler(inputXmlFile=filen, rootNodeName="alto")
     textBlocks = self.handler.findAllNodes(
         "Layout/Page/PrintSpace/TextBlock")
     blockIds = [textBlock.attrib['ID'] for textBlock in textBlocks]
     self.findBookTextInTextBlocks(urn, blockIds)
示例#4
0
    def processNewspaper(self, paperDir, outputDir):
        """Process one newspaper issue directory and write its output files.

        Steps, in order:

        1. Scan ``paperDir`` for ``*.xml``.  Paths containing ``"mets.xml"``
           are remembered as the METS file; every other file's bare name is
           appended to ``self.filelist``, which is then sorted.
        2. Take the first ``*mets.xml`` match as the METS file.  Its name is
           split on ``"_"``: field 0 is the paper name and field 3 starts
           with an 8-character date (characters 0-4 year, 4-6 month,
           6-8 day).
        3. Create ``outputDir/<year>/<month>/<day>/<papername>`` if it does
           not exist yet.
        4. Read ``CREATEDATE`` from the METS header and store its date part
           on ``self.scandate`` with the three '-'-separated fields in
           reverse order (presumably DDMMYYYY for an ISO timestamp).
        5. Call ``buildArticleReferences``, ``writeArticles`` and
           ``writeMasterMetafile``.

        Raises IndexError when the directory has no ``*mets.xml`` file, when
        the METS file name has fewer than four '_'-separated fields, or when
        no ``metsHdr`` node is found.
        """
        metsNamespace = 'http://www.loc.gov/METS/'
        self.paperDir = paperDir
        self.errorLogged = False

        # Sort the directory content into page files and the METS file.
        for xmlPath in glob.glob(self.paperDir + "/*.xml"):
            if "mets.xml" in xmlPath:
                self.altopapermetsfile = xmlPath
            else:
                self.filelist.append(xmlPath.split("/")[-1])
        self.filelist.sort()

        metsPath = glob.glob(paperDir + "/*mets.xml")[0]
        metsName = metsPath.split("/")[-1]

        # File name fields: [0] is the paper name, [3] begins with YYYYMMDD.
        nameFields = metsName.split("_")
        paperName = nameFields[0]
        dateStamp = nameFields[3]

        self.currentUrn = paperName + "_" + dateStamp
        self.altopapermetsfile = metsPath
        # Provisional value; replaced by the dated directory a few lines down.
        self.outputPath = outputDir

        self.year = dateStamp[0:4]
        self.month = dateStamp[4:6]
        self.day = dateStamp[6:8]
        self.papername = paperName

        self.outputPath = "/".join([
            outputDir,
            str(self.year),
            str(self.month),
            str(self.day),
            str(self.papername),
        ])
        if not os.path.exists(self.outputPath):
            os.makedirs(self.outputPath)

        self.metshandler = xmlHandler(inputXmlFile=metsPath,
                                      rootNodeName="alto")
        headerNodes = self.metshandler.findAllNodes(
            "{%s}metsHdr" % metsNamespace)
        createdDate = headerNodes[0].attrib["CREATEDATE"].split("T")[0]
        dateFields = createdDate.split("-")
        self.scandate = dateFields[2] + dateFields[1] + dateFields[0]

        # The root node is fetched as before; nothing here uses the result.
        self.metshandler.getRootNode()

        self.buildArticleReferences(metsPath)
        self.writeArticles()
        self.writeMasterMetafile()
示例#5
0
    def findWCForBlockId(self, blockid, filename):
        """Return word-confidence statistics for one TextBlock of an ALTO file.

        The XML handler of the most recently used file is cached on
        ``self.xmlfilehandler`` (keyed by ``self.xmlfileinuse``).  On the way,
        ``self.docworksVersion`` and ``self.abbyyVersion`` are filled in from
        the file's OCR processing description if they are still empty
        strings; a missing OCR step version is recorded as ``"none"``.

        Any ';' characters are stripped from ``blockid`` before the lookup.

        Returns a tuple ``(mean, share)``:

        * ``mean``  - average of the ``WC`` attribute over all ``String``
          elements in the block;
        * ``share`` - fraction of those elements whose own ``WC`` value is
          at least 0.98.

        A block without ``String`` elements yields ``(0, 0)``.

        Raises IndexError when no TextBlock carries the requested ID (or when
        the pre-processing software version node is missing), and KeyError
        when a ``String`` element has no ``WC`` attribute.
        """
        if filename == self.xmlfileinuse:
            handler = self.xmlfilehandler
        else:
            handler = xmlHandler(inputXmlFile=filename, rootNodeName="alto")
            self.xmlfileinuse = filename
            self.xmlfilehandler = handler

        if self.docworksVersion == "":
            docworksVersionNode = handler.findAllNodes(
                "Description/OCRProcessing/preProcessingStep/processingSoftware/softwareVersion"
            )
            self.docworksVersion = docworksVersionNode[0].text
        if self.abbyyVersion == "":
            abbyyVersionNode = handler.findAllNodes(
                "Description/OCRProcessing/ocrProcessingStep/processingSoftware/softwareVersion"
            )
            if abbyyVersionNode != []:
                self.abbyyVersion = abbyyVersionNode[0].text
            else:
                self.abbyyVersion = "none"

        blockid = blockid.replace(';', '')

        TBNodes = handler.findAllNodes(
            "Layout/Page/PrintSpace/TextBlock[@ID='%s']" % (blockid))
        contentnodes = handler.findInSub(TBNodes[0], ".//String")

        confidenceSum = 0
        cnt = 0
        cnt98 = 0
        for c in contentnodes:
            # Hyphenated parts and ordinary words are weighted alike.
            wordConfidence = float(c.attrib["WC"])
            confidenceSum += wordConfidence
            cnt += 1
            # Test the word's own confidence.  Testing the running sum (as
            # this code once did) counts nearly every word after the first.
            if wordConfidence >= 0.98:
                cnt98 += 1

        if cnt == 0:
            return 0, 0
        # float() keeps the share fractional even under integer division.
        return confidenceSum / cnt, float(cnt98) / cnt
    def write_explicitContent_from_result_xml(self, result, outputFilename):
        """Write the per-frame explicit-content verdicts of ``result`` as XML.

        Only ``result.annotation_results[0]`` is read (a single video was
        processed).  Under a ``Snusk`` root, every frame becomes a ``Bilde``
        element whose ``Bilde_tid`` attribute is the frame's time offset in
        seconds, rounded to two decimals, and whose text is the Norwegian
        label selected by ``frame.pornography_likelihood``.  The document is
        written with the handler's ``prettyPrint(outputFilename)``.
        """
        # Indexed by frame.pornography_likelihood.
        # NOTE(review): index 0 and index 2 carry the same label
        # ("Usannsynlig"); if index 0 stands for an unspecified/unknown
        # likelihood this is probably a slip - confirm before changing.
        likelihoodLabels = ("Usannsynlig", "Meget Usannsynlig", "Usannsynlig",
                            "Mulig", "Sannsynlig", "Høyst sannsynlig")

        document = xmlHandler(rootNodeName="Snusk")
        rootNode = document.getRootNode()

        videoResult = result.annotation_results[0]
        for frame in videoResult.explicit_annotation.frames:
            offset = frame.time_offset
            seconds = offset.seconds + offset.nanos / 1e9
            frameAttributes = {'Bilde_tid': str(round(seconds, 2))}
            label = likelihoodLabels[frame.pornography_likelihood]
            document.addSubElement(rootNode,
                                   "Bilde",
                                   attr=frameAttributes,
                                   text=label)

        document.prettyPrint(outputFilename)
    def write_shots_from_result_xml(self, result, outputFilename):
        """Write the shot boundaries of ``result`` as XML.

        Only ``result.annotation_results[0]`` is read (a single video was
        processed).  Under a ``Scener`` root, every shot becomes a ``Shot``
        element whose text is the shot's zero-based index and whose
        ``Start_tid`` / ``Stopp_tid`` attributes are the start and end
        offsets in seconds, rounded to two decimals.  The document is written
        with the handler's ``prettyPrint(outputFilename)``.
        """
        document = xmlHandler(rootNodeName="Scener")
        rootNode = document.getRootNode()

        shots = result.annotation_results[0].shot_annotations
        for shotIndex, shot in enumerate(shots):
            begin = shot.start_time_offset
            finish = shot.end_time_offset
            beginSeconds = begin.seconds + begin.nanos / 1e9
            finishSeconds = finish.seconds + finish.nanos / 1e9

            timing = {
                'Start_tid': str(round(beginSeconds, 2)),
                'Stopp_tid': str(round(finishSeconds, 2)),
            }
            document.addSubElement(rootNode,
                                   "Shot",
                                   attr=timing,
                                   text=str(shotIndex))

        document.prettyPrint(outputFilename)
示例#8
0
 def FindAllTextBlocksInBook(self, filen):
     """Run ``self.findTextInBlock`` on every TextBlock of an ALTO file.

     Opens ``filen`` with a new XML handler (stored on ``self.handler``).
     While ``self.docworksVersion`` / ``self.abbyyVersion`` are still empty
     strings they are filled in from the file's OCR processing description;
     a missing OCR step version is recorded as ``"none"``.  Then each
     ``TextBlock`` directly below every ``Layout/Page/PrintSpace`` node is
     passed to ``self.findTextInBlock``.

     Raises IndexError when the pre-processing software version is needed
     but the file has no such node.
     """
     self.handler = xmlHandler(inputXmlFile=filen, rootNodeName="alto")

     if self.docworksVersion == "":
         preProcessingVersions = self.handler.findAllNodes(
             "Description/OCRProcessing/preProcessingStep/processingSoftware/softwareVersion"
         )
         self.docworksVersion = preProcessingVersions[0].text

     if self.abbyyVersion == "":
         ocrVersions = self.handler.findAllNodes(
             "Description/OCRProcessing/ocrProcessingStep/processingSoftware/softwareVersion"
         )
         self.abbyyVersion = ("none" if ocrVersions == []
                              else ocrVersions[0].text)

     for printSpace in self.handler.findAllNodes("Layout/Page/PrintSpace"):
         for textBlock in self.handler.findInSub(printSpace, "TextBlock"):
             self.findTextInBlock(textBlock)
    def write_labels_from_result_xml(self, result, outputFilename):
        """Write the label annotations of ``result`` as one XML document.

        Only ``result.annotation_results[0]`` is read.  Under a ``Labels``
        root the document holds three groups, in this order:

        * one ``Label`` element per segment-level label, with ``Kategori``
          children for its categories and a ``Segment`` / ``Konfidens`` pair
          for each segment;
        * a ``shotAnnotations`` element with one ``shotLabel`` per shot-level
          label; each of its ``shotLabelCategory`` children holds a
          ``Segment`` / ``Konfidens`` pair for each segment of the label;
        * a ``frameAnnotations`` element with one ``frameLabel`` per
          frame-level label; each of its ``categoryLabel`` children holds the
          time offset and confidence of the label's first frame only.

        ``Segment`` text is ``"<index> <start>s to <end>s"`` with offsets in
        seconds.  The document is written with the handler's
        ``prettyPrint(outputFilename)``.

        NOTE(review): shot segments and frame details are emitted inside the
        category loop, so a label without categories contributes none, while
        segment-level labels emit theirs regardless of categories.  Confirm
        this asymmetry is intended.
        """

        def offsetSeconds(offset):
            # Whole seconds plus nanoseconds, as a float number of seconds.
            return offset.seconds + offset.nanos / 1e9

        def spanText(index, timed):
            # "<index> <start>s to <end>s" for an entry that has a .segment.
            span = timed.segment
            positions = '{}s to {}s'.format(
                offsetSeconds(span.start_time_offset),
                offsetSeconds(span.end_time_offset))
            return str(index) + " " + positions

        xmlHdl = xmlHandler(rootNodeName="Labels")
        # The root node is fetched as before; nothing here uses the result.
        xmlHdl.getRootNode()
        videoResult = result.annotation_results[0]

        # --- video/segment level labels ---
        for segmentLabel in videoResult.segment_label_annotations:
            labelNode = xmlHdl.makeElement(
                "Label", str(segmentLabel.entity.description))
            xmlHdl.addNode(labelNode)

            for category in segmentLabel.category_entities:
                categoryNode = xmlHdl.makeElement("Kategori",
                                                  str(category.description))
                xmlHdl.addSubNode(labelNode, categoryNode)

            for index, segment in enumerate(segmentLabel.segments):
                segmentNode = xmlHdl.makeElement("Segment",
                                                 spanText(index, segment))
                confidenceNode = xmlHdl.makeElement("Konfidens",
                                                    str(segment.confidence))
                xmlHdl.addSubNode(labelNode, segmentNode)
                xmlHdl.addSubNode(labelNode, confidenceNode)

        # --- shot level labels ---
        shotLabels = videoResult.shot_label_annotations
        shotGroup = xmlHdl.makeElement("shotAnnotations", "shotannotations")
        xmlHdl.addNode(shotGroup)
        for shotLabel in shotLabels:
            shotNode = xmlHdl.makeElement(
                "shotLabel", str(shotLabel.entity.description))
            xmlHdl.addSubNode(shotGroup, shotNode)

            for category in shotLabel.category_entities:
                shotCategoryNode = xmlHdl.makeElement(
                    "shotLabelCategory", str(category.description))
                xmlHdl.addSubNode(shotNode, shotCategoryNode)

                for index, shot in enumerate(shotLabel.segments):
                    segmentNode = xmlHdl.makeElement("Segment",
                                                     spanText(index, shot))
                    confidenceNode = xmlHdl.makeElement(
                        "Konfidens", str(shot.confidence))
                    xmlHdl.addSubNode(shotCategoryNode, segmentNode)
                    xmlHdl.addSubNode(shotCategoryNode, confidenceNode)

        # --- frame level labels ---
        frameLabels = videoResult.frame_label_annotations
        frameGroup = xmlHdl.makeElement("frameAnnotations",
                                        "frameannotations")
        xmlHdl.addNode(frameGroup)
        for frameLabel in frameLabels:
            frameNode = xmlHdl.makeElement(
                "frameLabel", str(frameLabel.entity.description))
            xmlHdl.addSubNode(frameGroup, frameNode)

            for category in frameLabel.category_entities:
                frameCategoryNode = xmlHdl.makeElement(
                    "categoryLabel", str(category.description))
                xmlHdl.addSubNode(frameNode, frameCategoryNode)

                # A label has many frames; only the first one is reported.
                firstFrame = frameLabel.frames[0]
                timeOffsetNode = xmlHdl.makeElement(
                    "FirstFrameTimeOffset",
                    str(offsetSeconds(firstFrame.time_offset)))
                frameConfidenceNode = xmlHdl.makeElement(
                    "FrameOffsetConfidence", str(firstFrame.confidence))
                xmlHdl.addSubNode(frameCategoryNode, timeOffsetNode)
                xmlHdl.addSubNode(frameCategoryNode, frameConfidenceNode)

        xmlHdl.prettyPrint(outputFilename)
示例#10
0
	def sync(self):
		"""Push local data containers and log entries to the remote monitor.

		Conversation over one TCP connection (each request is followed by
		reading up to 1024 bytes of reply, which is printed):

		1. ``auth <user> <password>`` and ``host <hostname>``.
		2. For every local data container: refresh its checksum and send
		   ``checkDataID <id> <checksum>``.  When the reply is exactly
		   ``"dataID unknown"`` the container is sent as
		   ``data <length> <xml>`` followed by ``commit``.
		3. ``getLastID <hostname>``; the reply is parsed as an integer and
		   the logs returned by ``getLogs(lastId)`` are sent the same way,
		   each XML payload right-padded with spaces to a multiple of 1024.
		4. ``exit``.

		The socket is closed on every exit path, including failures while
		reading the configuration or connecting.

		Written for Python 2 (text strings are passed to ``send``); ``print``
		is called with one parenthesised argument, which behaves identically
		there.
		"""
		s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
		try:
			# Everything after socket creation sits inside the try block, so
			# a failing config lookup or connect cannot leak the socket.
			c = config()
			s.connect((c.getRemoteMonitorServer(), int(c.getRemoteMonitorPort())))
			d = database()
			self.xml = xmlHandler()

			# authenticate the client
			s.send("auth " + c.getRemoteMonitorUser() + " " + c.getRemoteMonitorPassword())
			print(s.recv(1024))

			# announce our hostname
			s.send("host " + c.getHostname())
			print(s.recv(1024))

			# check if all data containers are enlisted at the server
			containerList = d.getDataContainer("")
			for container in containerList:
				# ask the server whether it already knows this container
				container.updateChecksum()
				request = "checkDataID " + str(container.dataID) + " " + container.checksum
				s.send(request)
				reply = str(s.recv(1024))
				print(reply)
				if reply == "dataID unknown":
					data = self.xml.dataContainerToXml(c.getHostname(), container)
					print("trying to send data: " + data)
					message = "data " + str(len(data)) + " " + data
					# sendall() returns None on success, so report the
					# length of what was handed to it.
					s.sendall(message)
					print("Send data, #bytes: " + str(len(message)))

					print("Answer to data:" + s.recv(1024))
					s.send("commit")
					print("Answer to commit: " + s.recv(1024))

			s.send("getLastID " + c.getHostname())
			lastId = int(s.recv(1024))
			print("received lastId" + str(lastId))
			logs = d.getLogs(lastId)

			for logEntry in logs:
				data = self.xml.logEntryToXml(c.getHostname(), logEntry)
				# Pad with spaces up to the next multiple of 1024; a payload
				# already on a boundary gains a full extra 1024 (unchanged).
				origSize = len(data)
				dataPad = 1024 - origSize % 1024
				data = data.ljust(origSize + dataPad, " ")

				print("Size of data: " + str(len(data)))
				message = "data " + str(len(data)) + " " + data
				s.sendall(message)
				print("Send data, #bytes: " + str(len(message)))
				print("Answer to data:" + s.recv(1024))
				s.send("commit")
				print("Answer to commit: " + s.recv(1024))

			s.send("exit")
			print(s.recv(1024))
		finally:
			s.close()