def findContentForBlockId(self, blockid, filename):
    """Return the concatenated text of one TextBlock in an ALTO file.

    Any ';' characters are removed from ``blockid`` before the lookup.
    The handler of the most recently used file is reused when
    ``filename`` matches it; otherwise a new handler is created and
    remembered on the instance.

    The CONTENT attribute of every String element below the block is
    appended in order. Each word is followed by a single space, except
    words whose SUBS_TYPE is "HypPart1", which are joined directly to
    the next word. The returned string is not stripped.
    """
    block_id = blockid.replace(';', '')

    # Only open the file when it differs from the one already cached.
    if filename == self.xmlfileinuse:
        alto = self.xmlfilehandler
    else:
        alto = xmlHandler(inputXmlFile=filename, rootNodeName="alto")
        self.xmlfileinuse = filename
        self.xmlfilehandler = alto

    matches = alto.findAllNodes(
        "Layout/Page/PrintSpace/TextBlock[@ID='%s']" % (block_id))

    pieces = []
    for word in alto.findInSub(matches[0], ".//String"):
        attrs = word.attrib
        pieces.append(attrs["CONTENT"])
        # A "HypPart1" fragment gets no separator after it.
        if attrs.get("SUBS_TYPE") != "HypPart1":
            pieces.append(" ")
    return "".join(pieces)
def __init__(self):
    """Create the helper objects, read the local port and call startServer().

    Sets ``database``, ``config``, ``port`` (taken from the config's
    ``getLocalMonitorPort()``) and ``xmlHandler`` on the instance before
    ``startServer()`` is invoked.
    """
    self.database = database()

    settings = config()
    self.config = settings
    self.port = settings.getLocalMonitorPort()

    self.xmlHandler = xmlHandler()
    self.startServer()
def FindAllTextBlocksInBook(self, filen, urn):
    """Collect every TextBlock ID in an ALTO file and pass them on.

    Opens ``filen`` with a new handler (kept on ``self.handler``),
    gathers the ``ID`` attribute of each ``Layout/Page/PrintSpace/TextBlock``
    node in document order and hands the list, together with ``urn``,
    to ``findBookTextInTextBlocks``.
    """
    self.handler = xmlHandler(inputXmlFile=filen, rootNodeName="alto")
    text_blocks = self.handler.findAllNodes(
        "Layout/Page/PrintSpace/TextBlock")
    block_ids = [node.attrib['ID'] for node in text_blocks]
    self.findBookTextInTextBlocks(urn, block_ids)
def processNewspaper(self, paperDir, outputDir):
    """Process one newspaper issue directory and write its articles.

    Steps, in order:

    1. Every ``*.xml`` file in ``paperDir`` whose path does not contain
       "mets.xml" has its base name appended to ``self.filelist``, which
       is then sorted.
    2. The first ``*mets.xml`` file found is taken as the METS file. Its
       base name is split on "_": field 0 becomes ``papername``, field 3
       supplies ``year`` / ``month`` / ``day`` (characters 0-4, 4-6, 6-8),
       and fields 0 and 3 joined by "_" become ``currentUrn``.
    3. ``outputPath`` is set to
       ``outputDir/year/month/day/papername`` and created if missing.
    4. The METS header's CREATEDATE is read; the three "-"-separated
       fields of its date part are stored in reverse order as
       ``scandate``.
    5. ``buildArticleReferences``, ``writeArticles`` and
       ``writeMasterMetafile`` are called.

    Raises IndexError when no ``*mets.xml`` file exists or the file name
    has fewer than four "_"-separated fields.
    """
    mets_namespace = 'http://www.loc.gov/METS/'
    self.paperDir = paperDir
    self.errorLogged = False

    # Split the directory listing into page files and the METS file.
    for xml_path in glob.glob(self.paperDir + "/*.xml"):
        if "mets.xml" in xml_path:
            self.altopapermetsfile = xml_path
        else:
            self.filelist.append(xml_path.split("/")[-1])
    self.filelist.sort()

    mets_path = glob.glob(paperDir + "/*mets.xml")[0]
    mets_name = mets_path.split("/")[-1]
    name_fields = mets_name.split("_")

    self.currentUrn = name_fields[0] + "_" + name_fields[3]
    self.altopapermetsfile = mets_path
    self.outputPath = outputDir

    date_stamp = name_fields[3]
    self.year = date_stamp[0:4]
    self.month = date_stamp[4:6]
    self.day = date_stamp[6:8]
    self.papername = name_fields[0]

    self.outputPath = "/".join([
        outputDir,
        str(self.year),
        str(self.month),
        str(self.day),
        str(self.papername),
    ])
    if not os.path.exists(self.outputPath):
        os.makedirs(self.outputPath)

    self.metshandler = xmlHandler(inputXmlFile=mets_path, rootNodeName="alto")
    header_nodes = self.metshandler.findAllNodes(
        "{%s}metsHdr" % (mets_namespace))
    created_date = header_nodes[0].attrib["CREATEDATE"].split("T")[0]
    date_fields = created_date.split("-")
    self.scandate = date_fields[2] + date_fields[1] + date_fields[0]

    # The root node is requested but its value is not used further here.
    self.metshandler.getRootNode()

    self.buildArticleReferences(mets_path)
    self.writeArticles()
    self.writeMasterMetafile()
def findWCForBlockId(self, blockid, filename):
    """Return word-confidence statistics for one TextBlock in an ALTO file.

    The handler of the most recently used file is reused when
    ``filename`` matches it; otherwise a new handler is created and
    remembered on the instance. While the instance's ``docworksVersion``
    / ``abbyyVersion`` are still empty strings they are filled in from
    the file's OCRProcessing description (``abbyyVersion`` becomes
    "none" when the file has no such node).

    Args:
        blockid: ID of the TextBlock; any ';' characters are removed.
        filename: Path of the ALTO file containing the block.

    Returns:
        A tuple ``(mean_wc, share_high)``: the mean of the WC attribute
        over all String elements in the block, and the fraction of those
        elements whose own WC is at least 0.98. ``(0, 0)`` is returned
        when the block contains no String elements.

    Raises:
        IndexError: if the block (or the docWORKS version node, when it
            is looked up) is not present in the file.
    """
    if filename == self.xmlfileinuse:
        handler = self.xmlfilehandler
    else:
        handler = xmlHandler(inputXmlFile=filename, rootNodeName="alto")
        self.xmlfileinuse = filename
        self.xmlfilehandler = handler

    if self.docworksVersion == "":
        docworksVersionNode = handler.findAllNodes(
            "Description/OCRProcessing/preProcessingStep/processingSoftware/softwareVersion"
        )
        self.docworksVersion = docworksVersionNode[0].text

    if self.abbyyVersion == "":
        abbyyVersionNode = handler.findAllNodes(
            "Description/OCRProcessing/ocrProcessingStep/processingSoftware/softwareVersion"
        )
        if abbyyVersionNode:
            self.abbyyVersion = abbyyVersionNode[0].text
        else:
            self.abbyyVersion = "none"

    blockid = blockid.replace(';', '')
    TBNodes = handler.findAllNodes(
        "Layout/Page/PrintSpace/TextBlock[@ID='%s']" % (blockid))
    contentnodes = handler.findInSub(TBNodes[0], ".//String")

    confidences = [float(c.attrib["WC"]) for c in contentnodes]
    if not confidences:
        return 0, 0

    cnt = len(confidences)
    # Compare each word's own confidence with the threshold. The running
    # total must not be used here: it exceeds 0.98 after a word or two and
    # would count almost every word as high-confidence.
    cnt98 = sum(1 for wc in confidences if wc >= 0.98)
    return sum(confidences) / cnt, float(cnt98) / cnt
def write_explicitContent_from_result_xml(self, result, outputFilename):
    """Write the per-frame explicit-content likelihoods of a video to XML.

    Only ``result.annotation_results[0]`` is read (a single video was
    processed). For each frame of its explicit annotation a "Bilde"
    element is added under a "Snusk" root, carrying the frame time in
    seconds (rounded to two decimals) as the ``Bilde_tid`` attribute and
    a Norwegian likelihood label as text. The document is then written
    to ``outputFilename`` via the handler's ``prettyPrint``.
    """
    # Indexed by frame.pornography_likelihood.
    # NOTE(review): index 0 repeats "Usannsynlig"; if index 0 stands for an
    # unspecified likelihood a distinct label may be intended - confirm.
    likelihood_labels = ("Usannsynlig", "Meget Usannsynlig", "Usannsynlig",
                         "Mulig", "Sannsynlig", "Høyst sannsynlig")

    writer = xmlHandler(rootNodeName="Snusk")
    root = writer.getRootNode()

    annotation = result.annotation_results[0].explicit_annotation
    for frame in annotation.frames:
        offset = frame.time_offset
        seconds = offset.seconds + offset.nanos / 1e9
        writer.addSubElement(
            root,
            "Bilde",
            attr={'Bilde_tid': str(round(seconds, 2))},
            text=likelihood_labels[frame.pornography_likelihood])

    writer.prettyPrint(outputFilename)
def write_shots_from_result_xml(self, result, outputFilename):
    """Write the detected shots of a video to an XML file.

    Only ``result.annotation_results[0]`` is read (a single video was
    processed). Each shot annotation becomes a "Shot" element under a
    "Scener" root: its text is the shot's zero-based index and its
    ``Start_tid`` / ``Stopp_tid`` attributes hold the start and end
    times in seconds, rounded to two decimals. The document is written
    to ``outputFilename`` via the handler's ``prettyPrint``.
    """
    writer = xmlHandler(rootNodeName="Scener")
    root = writer.getRootNode()

    shots = result.annotation_results[0].shot_annotations
    for index, shot in enumerate(shots):
        begin = shot.start_time_offset
        finish = shot.end_time_offset
        begin_seconds = begin.seconds + begin.nanos / 1e9
        finish_seconds = finish.seconds + finish.nanos / 1e9
        times = {
            'Start_tid': str(round(begin_seconds, 2)),
            'Stopp_tid': str(round(finish_seconds, 2)),
        }
        writer.addSubElement(root, "Shot", attr=times, text=str(index))

    writer.prettyPrint(outputFilename)
def FindAllTextBlocksInBook(self, filen):
    """Run ``findTextInBlock`` on every TextBlock of an ALTO file.

    Opens ``filen`` with a new handler (kept on ``self.handler``). While
    the instance's ``docworksVersion`` / ``abbyyVersion`` are still empty
    strings they are filled in from the file's OCRProcessing description;
    ``abbyyVersion`` becomes "none" when the file has no such node.
    Then each direct ``TextBlock`` match under every
    ``Layout/Page/PrintSpace`` node is passed to ``findTextInBlock``.
    """
    self.handler = xmlHandler(inputXmlFile=filen, rootNodeName="alto")

    if self.docworksVersion == "":
        preprocessing_versions = self.handler.findAllNodes(
            "Description/OCRProcessing/preProcessingStep/processingSoftware/softwareVersion"
        )
        self.docworksVersion = preprocessing_versions[0].text

    if self.abbyyVersion == "":
        ocr_versions = self.handler.findAllNodes(
            "Description/OCRProcessing/ocrProcessingStep/processingSoftware/softwareVersion"
        )
        self.abbyyVersion = (
            ocr_versions[0].text if ocr_versions != [] else "none")

    for print_space in self.handler.findAllNodes("Layout/Page/PrintSpace"):
        for text_block in self.handler.findInSub(print_space, "TextBlock"):
            self.findTextInBlock(text_block)
def write_labels_from_result_xml(self, result, outputFilename):
    """Write segment, shot and frame label annotations of a video to XML.

    Only ``result.annotation_results[0]`` is read. Under a "Labels" root
    the document gets:

    * one "Label" node per segment label, with "Kategori" children for
      its category entities and a "Segment" / "Konfidens" pair per
      segment;
    * a "shotAnnotations" node holding one "shotLabel" per shot label,
      each with "shotLabelCategory" children; the "Segment" /
      "Konfidens" pairs are attached to the label's last category node;
    * a "frameAnnotations" node holding one "frameLabel" per frame
      label, each with "categoryLabel" children; the time offset and
      confidence of the first frame only are attached to the label's
      last category node.

    When a shot or frame label has no category entities, its segment or
    frame information is attached to the label node itself, so it never
    ends up under a category that belongs to a different label. Frame
    labels without any frames contribute no frame information.

    The document is written to ``outputFilename`` via ``prettyPrint``.
    """

    def _seconds(offset):
        # Convert a seconds/nanos offset into float seconds.
        return offset.seconds + offset.nanos / 1e9

    xmlHdl = xmlHandler(rootNodeName="Labels")
    annotations = result.annotation_results[0]

    # Video/segment level label annotations.
    for segment_label in annotations.segment_label_annotations:
        labelNode = xmlHdl.makeElement(
            "Label", str(segment_label.entity.description))
        xmlHdl.addNode(labelNode)
        for category_entity in segment_label.category_entities:
            catNode = xmlHdl.makeElement(
                "Kategori", str(category_entity.description))
            xmlHdl.addSubNode(labelNode, catNode)
        for i, segment in enumerate(segment_label.segments):
            positions = '{}s to {}s'.format(
                _seconds(segment.segment.start_time_offset),
                _seconds(segment.segment.end_time_offset))
            segmentNode = xmlHdl.makeElement(
                "Segment", str(i) + " " + str(positions))
            confidenceNode = xmlHdl.makeElement(
                "Konfidens", str(segment.confidence))
            xmlHdl.addSubNode(labelNode, segmentNode)
            xmlHdl.addSubNode(labelNode, confidenceNode)

    # Shot level label annotations.
    ShotLabelAnnotNodes = xmlHdl.makeElement("shotAnnotations",
                                             "shotannotations")
    xmlHdl.addNode(ShotLabelAnnotNodes)
    for shot_label in annotations.shot_label_annotations:
        ShotLabelAnnotNode = xmlHdl.makeElement(
            "shotLabel", str(shot_label.entity.description))
        xmlHdl.addSubNode(ShotLabelAnnotNodes, ShotLabelAnnotNode)

        # Start from the label node so a label without categories still
        # has a valid parent that belongs to this label.
        segmentParent = ShotLabelAnnotNode
        for category_entity in shot_label.category_entities:
            segmentParent = xmlHdl.makeElement(
                "shotLabelCategory", str(category_entity.description))
            xmlHdl.addSubNode(ShotLabelAnnotNode, segmentParent)

        for i, shot in enumerate(shot_label.segments):
            positions = '{}s to {}s'.format(
                _seconds(shot.segment.start_time_offset),
                _seconds(shot.segment.end_time_offset))
            segmentNodeShot = xmlHdl.makeElement(
                "Segment", str(str(i) + " " + positions))
            confidenceNodeShot = xmlHdl.makeElement(
                "Konfidens", str(shot.confidence))
            xmlHdl.addSubNode(segmentParent, segmentNodeShot)
            xmlHdl.addSubNode(segmentParent, confidenceNodeShot)

    # Frame level label annotations.
    FrameLabelAnnotNodes = xmlHdl.makeElement("frameAnnotations",
                                              "frameannotations")
    xmlHdl.addNode(FrameLabelAnnotNodes)
    for frame_label in annotations.frame_label_annotations:
        FrameLabelAnnotNode = xmlHdl.makeElement(
            "frameLabel", str(frame_label.entity.description))
        xmlHdl.addSubNode(FrameLabelAnnotNodes, FrameLabelAnnotNode)

        # Same fallback as for shot labels.
        frameParent = FrameLabelAnnotNode
        for category_entity in frame_label.category_entities:
            frameParent = xmlHdl.makeElement(
                "categoryLabel", str(category_entity.description))
            xmlHdl.addSubNode(FrameLabelAnnotNode, frameParent)

        # A label may carry many frames; only the first one is recorded.
        if not frame_label.frames:
            continue
        frame = frame_label.frames[0]
        timeOffsetNode = xmlHdl.makeElement(
            "FirstFrameTimeOffset", str(_seconds(frame.time_offset)))
        frameOffsetNode = xmlHdl.makeElement(
            "FrameOffsetConfidence", str(frame.confidence))
        xmlHdl.addSubNode(frameParent, timeOffsetNode)
        xmlHdl.addSubNode(frameParent, frameOffsetNode)

    xmlHdl.prettyPrint(outputFilename)
def sync(self):
    """Push local data containers and log entries to the remote monitor.

    Protocol, as implemented here (each request is followed by reading
    one reply of up to 1024 bytes, which is printed):

    1. ``auth <user> <password>`` and ``host <hostname>``.
    2. For every data container from ``getDataContainer("")``: refresh
       its checksum and send ``checkDataID <dataID> <checksum>``. When
       the reply is exactly "dataID unknown" the container is
       serialized to XML and sent as ``data <length> <xml>``.
    3. ``commit``.
    4. ``getLastID <hostname>``; the reply is parsed as an integer and
       the log entries returned by ``getLogs(lastId)`` are each sent as
       ``data <length> <xml>``, with the XML right-padded with spaces
       to a multiple of 1024 characters.
    5. ``commit`` and ``exit``.

    The socket is always closed, including when connecting or any later
    step raises.

    NOTE(review): string payloads are passed straight to the socket and
    replies are concatenated to ``str``, so this presumably targets
    Python 2 - confirm before running under Python 3.
    """
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        c = config()
        s.connect((c.getRemoteMonitorServer(),
                   int(c.getRemoteMonitorPort())))
        d = database()
        self.xml = xmlHandler()

        # sendall() is used throughout: send() may transmit only part of
        # the message.

        # Authenticate the client.
        s.sendall("auth " + c.getRemoteMonitorUser() + " " +
                  c.getRemoteMonitorPassword())
        print(s.recv(1024))

        # Announce our hostname.
        s.sendall("host " + c.getHostname())
        print(s.recv(1024))

        # Make sure every data container is enlisted at the server.
        for container in d.getDataContainer(""):
            container.updateChecksum()
            request = ("checkDataID " + str(container.dataID) + " " +
                       container.checksum)
            s.sendall(request)
            reply = str(s.recv(1024))
            print(reply)
            if reply == "dataID unknown":
                data = self.xml.dataContainerToXml(c.getHostname(), container)
                print("trying to send data: " + data)
                payload = "data " + str(len(data)) + " " + data
                s.sendall(payload)
                # sendall() returns None, so report the payload size.
                print("Send data, #bytes: " + str(len(payload)))
                print("Answer to data:" + s.recv(1024))

        s.sendall("commit")
        print("Answer to commit: " + s.recv(1024))

        s.sendall("getLastID " + c.getHostname())
        lastId = int(s.recv(1024))
        print("received lastId" + str(lastId))

        for entry in d.getLogs(lastId):
            data = self.xml.logEntryToXml(c.getHostname(), entry)
            # Pad up to the next multiple of 1024; an already aligned
            # message gets a full extra 1024 spaces (kept as before).
            origSize = len(data)
            dataPad = 1024 - (origSize % 1024)
            data = data.ljust(origSize + dataPad, " ")
            print("Size of data: " + str(len(data)))
            payload = "data " + str(len(data)) + " " + data
            s.sendall(payload)
            print("Send data, #bytes: " + str(len(payload)))
            print("Answer to data:" + s.recv(1024))

        s.sendall("commit")
        print("Answer to commit: " + s.recv(1024))

        s.sendall("exit")
        print(s.recv(1024))
    finally:
        s.close()