def writeUTF8(rootElement, out):
    indent(rootElement)
    if isinstance(out, str):
        f = open(out, "wt")
        print >> f, '<?xml version="1.0" encoding="UTF-8"?>'
        ET.ElementTree(rootElement).write(f, "utf-8")
        f.close()
    else:
        print >> out, '<?xml version="1.0" encoding="UTF-8"?>'
        ET.ElementTree(rootElement).write(out, "utf-8")
def processLines(lines, setName, usedIds, directed=True, negatives="INCLUDE", tree=None, corpusId="SE10T8"):
    if tree == None:
        corpus = ET.Element("corpus", {"source": corpusId})
        tree = ET.ElementTree(corpus)
    else:
        corpus = tree.getroot()
    sentence = None
    for line in lines:
        line = line.strip()
        if sentence == None:
            assert line[0].isdigit(), line
            origId, line = line.split("\t")
            sentence = Sentence(origId, line.strip().strip("\""), corpusId, usedIds, setName)
        else:
            if line.startswith("Comment:"):
                sentence.comment = line.split(":", 1)[-1].strip()
            elif line != "":
                sentence.relation = line
            else:
                assert sentence != None
                corpus.append(sentence.process(directed=directed, negatives=negatives))
                sentence = None
    return tree
def initfromfile(self):
    filename = botslib.abspathdata(self.ta_info['filename'])
    self.ta_info['attributemarker'] = '__'
    parser = ET.XMLParser()
    etree = ET.ElementTree()  #ElementTree: lexes, parses, makes etree; etree is quite similar to bots-node trees but conversion is needed
    etreeroot = etree.parse(filename, parser)
    self.root = self._etree2botstree(etreeroot)  #convert etree to bots-nodes-tree
def _test():
    msfile = os.path.join(os.path.abspath(os.curdir), 'modulesets', moduleset)
    tree = ElementTree.ElementTree(file=msfile)
    elem = tree.getroot()
    handlers = Handlers(msfile)
    handlers.handle(elem, None)

    def filefunc(package):
        # return a relative path to the bitbake .bb which will be written
        src_uri = package.getVar('SRC_URI', 1)
        filename = package.getVar('PN', 1) + '.bb'
        if not src_uri:
            return filename
        else:
            substr = src_uri[src_uri.find('xorg/'):]
            subdirlist = substr.split('/')[:2]
            subdir = '-'.join(subdirlist)
            return os.path.join(subdir, filename)

    emitter = Emitter(filefunc)
    for package in handlers.packages:
        template = emitter.filefunc(package) + '.in'
        if os.path.exists(template):
            print("%s exists, emitting based on template" % template)
            emitter.write(package, template)
        else:
            print("%s does not exist, emitting non-templated" % template)
            emitter.write(package)
def serialize(elem, encoding=None):
    import StringIO
    file = StringIO.StringIO()
    tree = ElementTree.ElementTree(elem)
    if encoding:
        tree.write(file, encoding)
    else:
        tree.write(file)
    return file.getvalue()
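# A minimal usage sketch for serialize() above (not part of the original module);
# it assumes "from xml.etree import ElementTree" is the alias used by this code.
def _serialize_example():
    root = ElementTree.Element("greeting", {"lang": "en"})
    root.text = "hello"
    print serialize(root)                    # -> '<greeting lang="en">hello</greeting>'
    print serialize(root, encoding="utf-8")  # same markup, explicitly UTF-8-encoded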
def envelopewrite(self, node):
    ''' write envelope for XML messages'''
    self._initwrite()
    self.normalisetree(node)
    xmltree = ET.ElementTree(self._node2xml(node))
    root = xmltree.getroot()
    ETI.include(root)
    self._xmlcorewrite(xmltree, root)
    self._closewrite()
def saveKeys():
    if not _KEYFILE:
        raise Exception, 'No _KEYFILE'
    tree = ElementTree.ElementTree(ElementTree.Element('Keys'))
    for x in _KEYS:
        e = ElementTree.Element('key')
        e.text = x
        tree.getroot().append(e)
    tree.write(_KEYFILE)
def write(rootElement, filename):
    if isinstance(rootElement, ElementTree.ElementTree):
        rootElement = rootElement.getroot()
    indent(rootElement)
    if filename.endswith(".gz"):
        out = GzipFile(filename, "wt")
    else:
        out = open(filename, "wt")
    print >> out, '<?xml version="1.0" encoding="UTF-8"?>'
    ElementTree.ElementTree(rootElement).write(out, "utf-8")
    out.close()
def include(self, element, parent):
    href = element.attrib.get('href')
    fullhref = os.path.join(self.msdirname, href)
    tree = ElementTree.ElementTree(file=fullhref)
    elem = tree.getroot()
    # Append the children of the newly included root element to the parent
    # element, and manually handle() them, as the currently running
    # iteration isn't going to hit them.
    for child in elem:
        self.handle(child, elem)
    parent.append(elem)
def make_keyfiles():
    x = ElementTree.Element('Keys')
    y = ElementTree.Element('key')
    y.set("name", KEY[0])  # set the attribute via .set(); item assignment is not supported on Element objects
    y.text = KEY[1]
    x.append(y)
    y = ElementTree.Element('key')
    y.text = 'foo'
    x.append(y)
    b = ElementTree.ElementTree(x)
    b.write('tests/xml/keys')
    b.write('tests/xml/keys3')
def _write(self, node):
    ''' write normal XML messages (no envelope)'''
    xmltree = ET.ElementTree(self._node2xml(node))
    f = botslib.opendata(self.ta_info['filename'], "wb")
    if self.ta_info['indented']:
        indentstring = '\n'
    else:
        indentstring = ''
    #xml prolog: always use.*********************************
    #syntax parameter controls if stand-alone is used within prolog.
    #in ET 1.3.0: if standalone is to be used, the ET-generated prolog should be suppressed - explicit parameter
    #in ET 1.2.6: always generates prolog if encoding != utf-8/ascii. SO: cannot use standalone for encoding != utf-8, ascii
    if ET.VERSION not in ['1.2.6', '1.0.6'] or self.ta_info['charset'] in ['us-ascii', 'utf-8']:
        if self.ta_info['standalone']:
            standalonestring = 'standalone="%s" ' % (self.ta_info['standalone'])
        else:
            standalonestring = ''
        PI = ET.ProcessingInstruction('xml', 'version="%s" encoding="%s" %s' % (self.ta_info['version'], self.ta_info['charset'], standalonestring))
        #do not use encoding here: gives a double xml prolog, possibly because ET.ElementTree.write is used again by write()
        f.write(ET.tostring(PI) + indentstring)
    #doctype /DTD **************************************
    if self.ta_info['DOCTYPE']:
        f.write('<!DOCTYPE %s>' % (self.ta_info['DOCTYPE']) + indentstring)
    #processing instructions (other than prolog) ************
    if self.ta_info['processing_instructions']:
        for pi in self.ta_info['processing_instructions']:
            PI = ET.ProcessingInstruction(pi[0], pi[1])
            #do not use encoding here: gives a double xml prolog, possibly because ET.ElementTree.write is used again by write()
            f.write(ET.tostring(PI) + indentstring)
    #indent the xml elements
    if self.ta_info['indented']:
        root = xmltree.getroot()
        self.botsindent(root)
    if ET.VERSION <= '1.2.6':
        xmltree.write(f, encoding=self.ta_info['charset'])
    else:
        xmltree.write(f, encoding=self.ta_info['charset'], xml_declaration=False)
def init(label, fileoutput):
    conarypk = ConaryPk()
    cli = conarypk.cli
    cfg = conarypk.cfg
    log.info("Attempting to retrieve repository data for %s" % label)
    try:
        pkgs = getPackagesFromLabel(cfg, cli, label)
        troves = conarypk.repos.getTroves(pkgs, withFiles=False)
        nodes = generate_xml(troves, label)
        cElementTree.ElementTree(nodes).write(fileoutput)
        log.info("Successfully wrote XML data for label %s into file %s" % (label, fileoutput))
    except:
        log.error("Failed to gather data from the repository")
def write(rootElement, filename):
    if isinstance(rootElement, ElementTree.ElementTree):
        rootElement = rootElement.getroot()
    indent(rootElement)
    # Create intermediate paths if needed
    if os.path.dirname(filename) != "" and not os.path.exists(os.path.dirname(filename)):
        os.makedirs(os.path.dirname(filename))
    # Open the output file
    if filename.endswith(".gz"):
        out = GzipFile(filename, "wt")
    else:
        out = open(filename, "wt")
    print >> out, '<?xml version="1.0" encoding="UTF-8"?>'
    ElementTree.ElementTree(rootElement).write(out, "utf-8")
    out.close()
    # Fix newlines inside attributes
    encodeNewlines(filename)
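# Hedged usage sketch for write() above (not from the original source): it assumes
# "from xml.etree import ElementTree", "from gzip import GzipFile", and the
# indent()/encodeNewlines() helpers defined elsewhere in this module.
def _write_example():
    corpus = ElementTree.Element("corpus", {"source": "demo"})
    ElementTree.SubElement(corpus, "document", {"id": "demo.d0"})
    write(corpus, "output/demo-corpus.xml")     # plain file; the parent directory is created if missing
    write(corpus, "output/demo-corpus.xml.gz")  # gzip-compressed output, selected by the .gz suffix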
def loadKeys(fname=None, force_=False):
    global _KEYS, _KEYFILE
    if not fname:
        fname = os.path.expanduser('~/.isbndbkeys')
    if not os.path.exists(fname):
        a = ElementTree.Element('Keys')
        ElementTree.ElementTree(a).write(fname)
    _KEYFILE = fname
    _KEYS = dict()
    tree = ElementTree.parse(fname)
    for x in tree.findall('key'):
        try:
            _KEYS[x.get("name")] = Key(x.text, x.get("name"))
        except:
            if force_:
                _KEYS[x.get("name")] = Key(x.text, x.get("name"), force_=True)
            else:
                pass
def convert(srFiles, xmlFileName, outdir, corpusName, idByNorText=False):
    print >> sys.stderr, "Loading Static Relations"
    events = {}
    for srFile in srFiles:
        readEventsFromSR(srFile[0], srFile[1], events, idByNorText=idByNorText)

    if xmlFileName != None:
        xmlEvents = {}
        dataSets = {}
        srTexts = {}  # original, unnormalized sentence texts from the SR corpus
        eventsToXML(events, xmlEvents, dataSets, srTexts)

        print >> sys.stderr, "Loading XML"
        xml = ETUtils.ETFromObj(xmlFileName)
        print >> sys.stderr, "Inserting XML events"
        insertEvents(xmlEvents, dataSets, srTexts, xml, corpusName)
        ETUtils.write(xml, outdir + corpusName + "-srevents.xml")
        # update pre-existing parses
        print >> sys.stderr, "Head Detection"
        xml = FindHeads.findHeads(xml, "split-mccc-preparsed", tokenization=None, output=outdir + corpusName + "-heads.xml", removeExisting=True)
        print >> sys.stderr, "Dividing into sets"
        InteractionXML.DivideSets.processCorpus(xml, outDir, corpusName + "-", ".xml", [("devel", "train")])
        print >> sys.stderr, "Converting back"
        STConvert.toSTFormat(outdir + corpusName + "-devel.xml", outDir + corpusName + "-stformat-devel", outputTag="rel", task=2, debug=True, validate=False)
        STConvert.toSTFormat(outdir + corpusName + "-train.xml", outDir + corpusName + "-stformat-train", outputTag="rel", task=2, debug=True, validate=False)
    else:
        xml = eventsToNewXML(events)
        xmlTree = ET.ElementTree(xml)
        ETUtils.write(xml, outdir + corpusName + "-srevents.xml")
        xml = xmlTree
        # Parse
        bigfileName = outdir + corpusName
        print >> sys.stderr, "Parsing"
        Tools.CharniakJohnsonParser.parse(xml, bigfileName + "-parsed.xml", tokenizationName="PARSED_TEXT", parseName="McClosky", requireEntities=True, timeout=60)
        print >> sys.stderr, "Stanford Conversion"
        Tools.StanfordParser.convertXML("McClosky", xml, bigfileName + "-stanford.xml")
        print >> sys.stderr, "Protein Name Splitting"
        splitTarget = "McClosky"
        xml = ProteinNameSplitter.mainFunc(xml, None, splitTarget, splitTarget, "split-" + splitTarget, "split-" + splitTarget)
        print >> sys.stderr, "Head Detection"
        xml = FindHeads.findHeads(xml, "split-McClosky", tokenization=None, output=bigfileName + ".xml", removeExisting=True)
        print >> sys.stderr, "Dividing into sets"
        InteractionXML.DivideSets.processCorpus(xml, outDir, "SRNE-", ".xml")
def convertDDI(outDir, trainUnified=None, trainMTMX=None, testUnified=None, testMTMX=None, downloadDir=None, redownload=False, makeIntermediateFiles=True, debug=False):
    cwd = os.getcwd()
    os.chdir(outDir)
    logFileName = os.path.join(outDir, "DDI-conversion-log.txt")
    Stream.openLog(logFileName)
    print >> sys.stderr, "=======================", "Converting DDI'11 corpus", "======================="
    bigfileName = os.path.join(outDir, "DDI")
    #oldXML = ETUtils.ETFromObj(bigfileName+".xml")
    if trainUnified == None:
        trainUnified = Settings.URL["DDI_TRAIN_UNIFIED"]
    if trainMTMX == None:
        trainMTMX = Settings.URL["DDI_TRAIN_MTMX"]
    if testUnified == None:
        testUnified = Settings.URL["DDI_TEST_UNIFIED"]
    if testMTMX == None:
        testMTMX = Settings.URL["DDI_TEST_MTMX"]
    tempdir = tempfile.mkdtemp()
    print >> sys.stderr, "Temporary files directory at", tempdir
    if True:
        documents, docById, docCounts = loadDocs(trainUnified, outDir, tempdir)

        sortedDocCounts = sorted(docCounts.iteritems(), key=lambda (k, v): (v, k), reverse=True)
        datasetCounts = {"train": [0, 0], "devel": [0, 0], "test": [0, 0]}
        for i in range(0, len(sortedDocCounts) - 3, 4):
            for j in [0, 1]:
                docById[sortedDocCounts[i+j][0]].set("set", "train")
                datasetCounts["train"][0] += sortedDocCounts[i+j][1][0]
                datasetCounts["train"][1] += sortedDocCounts[i+j][1][1]
            docById[sortedDocCounts[i+2][0]].set("set", "train")  #docById[sortedDocCounts[i+2][0]].set("set", "devel")
            docById[sortedDocCounts[i+3][0]].set("set", "devel")  #docById[sortedDocCounts[i+3][0]].set("set", "test")
            datasetCounts["train"][0] += sortedDocCounts[i+2][1][0]  #datasetCounts["devel"][0] += sortedDocCounts[i+2][1][0]
            datasetCounts["train"][1] += sortedDocCounts[i+2][1][1]  #datasetCounts["devel"][1] += sortedDocCounts[i+2][1][1]
            datasetCounts["devel"][0] += sortedDocCounts[i+3][1][0]  #datasetCounts["test"][0] += sortedDocCounts[i+3][1][0]
            datasetCounts["devel"][1] += sortedDocCounts[i+3][1][1]  #datasetCounts["test"][1] += sortedDocCounts[i+3][1][1]
        for document in documents:  # leftover documents that don't divide evenly go to the train set
            if document.get("set") == None:
                document.set("set", "train")
        print datasetCounts
        for key in datasetCounts.keys():
            if datasetCounts[key][1] != 0:
                print key, datasetCounts[key][0] / float(datasetCounts[key][1])
            else:
                print key, datasetCounts[key][0], "/", float(datasetCounts[key][1])

        if testUnified != None:
            testDocuments, testDocById, testDocCounts = loadDocs(testUnified, tempdir)
            for document in testDocuments:
                document.set("set", "test")
            documents = documents + testDocuments

    xmlTree = ET.ElementTree(ET.Element("corpus"))
    root = xmlTree.getroot()
    root.set("source", "DrugDDI")
    for document in documents:
        root.append(document)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents-notfixed.xml")
    xml = xmlTree
    print >> sys.stderr, "Fixing DDI XML"
    fixEntities(xml)
    convertToInteractions(xml)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents.xml")
    #sys.exit()

    if False:
        print >> sys.stderr, "Parsing"
        Tools.CharniakJohnsonParser.parse(xml, bigfileName + "-parsed.xml", tokenizationName=None, parseName="McClosky", requireEntities=True, timeout=10)
        print >> sys.stderr, "Stanford Conversion"
        Tools.StanfordParser.convertXML("McClosky", xml, bigfileName + "-stanford.xml")
    #if True:
    #    xml = bigfileName + "-stanford.xml"

    print >> sys.stderr, "Protein Name Splitting"
    splitTarget = "McClosky"
    xml = ProteinNameSplitter.mainFunc(xml, None, splitTarget, splitTarget, "split-" + splitTarget, "split-" + splitTarget)
    print >> sys.stderr, "Head Detection"
    xml = FindHeads.findHeads(xml, "split-McClosky", tokenization=None, output=bigfileName + ".xml", removeExisting=True)
    print >> sys.stderr, "Dividing into sets"
    InteractionXML.DivideSets.processCorpus(xml, outDir, "DrugDDI-", ".xml", [("devel", "train", "test"), ("devel", "train")])
    #InteractionXML.DivideSets.processCorpus(oldXML, outDir, "DrugDDI-", ".xml", [("devel", "train", "test"), ("devel", "train")])
    #InteractionXML.DivideSets.processCorpus(bigfileName+".xml", outDir, "DrugDDI-", ".xml", [("devel", "train", "test"), ("devel", "train")])
    #if "devel" in [x[0] for x in datasets]:
    #    print >> sys.stderr, "Creating empty devel set"
    #    deletionRules = {"interaction":{}, "entity":{"isName":"False"}}
    #    InteractionXML.DeleteElements.processCorpus(corpusName + "-devel.xml", corpusName + "-devel-empty.xml", deletionRules)
    #return xml
    Stream.closeLog(logFileName)
    if not debug:
        print >> sys.stderr, "Removing temporary directory", tempdir
        shutil.rmtree(tempdir)
    os.chdir(cwd)
def _write(self, node):
    ''' write normal XML messages (no envelope)'''
    xmltree = ET.ElementTree(self._node2xml(node))
    root = xmltree.getroot()
    self._xmlcorewrite(xmltree, root)
        try:
            data = data.encode(encoding)
        except UnicodeDecodeError:
            pass
        # Guess from who we're getting this?
        data = data.replace('\x00', '')
        try:
            parser.feed(data)
        except Exception, error:
            if dump_invalid_data:
                print error, repr(data)
            parser.close()
            raise
        else:
            return ET.ElementTree(parser.close())

    def get_device_xml(self):
        """ Get the main device descriptor xml file. """
        xml = """<root>
        <specVersion>
            <major>1</major>
            <minor>0</minor>
        </specVersion>
        <device>
            <deviceType>urn:schemas-upnp-org:device:MediaRenderer:1</deviceType>
            <friendlyName>{friendly_name}</friendlyName>
            <manufacturer>{manufacturer}</manufacturer>
            <manufacturerURL>{manufacturer_url}</manufacturerURL>
def convertDDI(outDir, downloadDir=None, redownload=False, makeIntermediateFiles=True, debug=False):
    cwd = os.getcwd()
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    os.chdir(outDir)
    logFileName = os.path.join(outDir, "DDI11-conversion-log.txt")
    Stream.openLog(logFileName)
    print >> sys.stderr, "=======================", "Converting DDI'11 corpus", "======================="
    corpusDir = outDir + "/DDI11-original"
    Utils.Download.downloadAndExtract(Settings.URL["DDI11_CORPUS"], corpusDir, downloadDir)
    bigfileName = os.path.join(outDir, "DDI11")
    #oldXML = ETUtils.ETFromObj(bigfileName+".xml")
    trainUnified = corpusDir + "/train"
    trainMTMX = corpusDir + "/train_MTMX"
    testUnified = corpusDir + "/test"
    testMTMX = corpusDir + "/test_MTMX"

    # Load main documents
    tempdir = tempfile.mkdtemp()
    print >> sys.stderr, "Temporary files directory at", tempdir
    documents, docById, docCounts = loadDocs(trainUnified)

    # Divide training data into a train and devel set
    sortedDocCounts = sorted(docCounts.iteritems(), key=lambda (k, v): (v, k), reverse=True)
    datasetCounts = {"train": [0, 0], "devel": [0, 0], "test": [0, 0]}
    for i in range(0, len(sortedDocCounts) - 3, 4):
        for j in [0, 1]:
            docById[sortedDocCounts[i + j][0]].set("set", "train")
            datasetCounts["train"][0] += sortedDocCounts[i + j][1][0]
            datasetCounts["train"][1] += sortedDocCounts[i + j][1][1]
        docById[sortedDocCounts[i + 2][0]].set("set", "train")  #docById[sortedDocCounts[i+2][0]].set("set", "devel")
        docById[sortedDocCounts[i + 3][0]].set("set", "devel")  #docById[sortedDocCounts[i+3][0]].set("set", "test")
        datasetCounts["train"][0] += sortedDocCounts[i + 2][1][0]  #datasetCounts["devel"][0] += sortedDocCounts[i+2][1][0]
        datasetCounts["train"][1] += sortedDocCounts[i + 2][1][1]  #datasetCounts["devel"][1] += sortedDocCounts[i+2][1][1]
        datasetCounts["devel"][0] += sortedDocCounts[i + 3][1][0]  #datasetCounts["test"][0] += sortedDocCounts[i+3][1][0]
        datasetCounts["devel"][1] += sortedDocCounts[i + 3][1][1]  #datasetCounts["test"][1] += sortedDocCounts[i+3][1][1]
    for document in documents:  # leftover documents that don't divide evenly go to the train set
        if document.get("set") == None:
            document.set("set", "train")

    # Print division results
    print >> sys.stderr, datasetCounts
    for key in datasetCounts.keys():
        if datasetCounts[key][1] != 0:
            print key, datasetCounts[key][0] / float(datasetCounts[key][1])
        else:
            print key, datasetCounts[key][0], "/", float(datasetCounts[key][1])

    # Some of the train and test ids overlap. Let's change the train set ids, because test set ones are needed
    # for the final evaluation.
    changeIdCount = 1000
    for trainId in ['DrugDDI.d312', 'DrugDDI.d316', 'DrugDDI.d332', 'DrugDDI.d334', 'DrugDDI.d337',
                    'DrugDDI.d342', 'DrugDDI.d349', 'DrugDDI.d354', 'DrugDDI.d373', 'DrugDDI.d379',
                    'DrugDDI.d383', 'DrugDDI.d388', 'DrugDDI.d392', 'DrugDDI.d396', 'DrugDDI.d398',
                    'DrugDDI.d409', 'DrugDDI.d411', 'DrugDDI.d415', 'DrugDDI.d425', 'DrugDDI.d430',
                    'DrugDDI.d433', 'DrugDDI.d448', 'DrugDDI.d450', 'DrugDDI.d452', 'DrugDDI.d462',
                    'DrugDDI.d467', 'DrugDDI.d470', 'DrugDDI.d474', 'DrugDDI.d480', 'DrugDDI.d482',
                    'DrugDDI.d485', 'DrugDDI.d492', 'DrugDDI.d494', 'DrugDDI.d496', 'DrugDDI.d498',
                    'DrugDDI.d500', 'DrugDDI.d503', 'DrugDDI.d506', 'DrugDDI.d518', 'DrugDDI.d523',
                    'DrugDDI.d528', 'DrugDDI.d535', 'DrugDDI.d539', 'DrugDDI.d552', 'DrugDDI.d554',
                    'DrugDDI.d558', 'DrugDDI.d561', 'DrugDDI.d570', 'DrugDDI.d578']:
        newId = "DrugDDI.d" + str(changeIdCount)
        print >> sys.stderr, "Changing train/devel id", trainId, "to", newId
        for element in docById[trainId].getiterator():
            for attrName, attrValue in element.attrib.iteritems():
                if trainId in attrValue:
                    element.set(attrName, attrValue.replace(trainId, newId))
        docById[newId] = docById[trainId]
        del docById[trainId]
        changeIdCount += 1

    # If test set exists, load it, too
    if testUnified != None:
        testDocuments, testDocById, testDocCounts = loadDocs(testUnified)
        for document in testDocuments:
            document.set("set", "test")
        documents = documents + testDocuments
        overlappingIds = []
        for key in docById:
            if key in testDocById:
                overlappingIds.append(key)
        for key in docById:
            assert key not in testDocById, (key, docById[key].get("origId"), testDocById[key].get("origId"),
                                            sorted(docById.keys()), sorted(testDocById.keys()), sorted(overlappingIds))
        docById.update(testDocById)

    # Add all documents into one XML
    xmlTree = ET.ElementTree(ET.Element("corpus"))
    root = xmlTree.getroot()
    root.set("source", "DDI11")
    for document in documents:
        root.append(document)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents-notfixed.xml")
    xml = xmlTree
    print >> sys.stderr, "Fixing DDI XML"
    fixEntities(xml)
    convertToInteractions(xml)

    # Add MTMX
    if trainMTMX != None:
        inDir = Utils.Download.getTopDir(tempdir, Utils.Download.downloadAndExtract(trainMTMX, tempdir, outDir + "/DDI11-original"))
        DDITools.addMTMX(xml, inDir)
    if testMTMX != None:
        inDir = Utils.Download.getTopDir(tempdir, Utils.Download.downloadAndExtract(testMTMX, tempdir, outDir + "/DDI11-original"))
        DDITools.addMTMX(xml, inDir)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents.xml")

    print >> sys.stderr, "---------------", "Inserting TEES-generated analyses", "---------------"
    Utils.Download.downloadAndExtract(Settings.URL["TEES_PARSES"], os.path.join(Settings.DATAPATH, "TEES-parses"), downloadDir, redownload=redownload)
    extractedFilename = os.path.join(Settings.DATAPATH, "TEES-parses") + "/DDI11"
    print >> sys.stderr, "Making sentences"
    Tools.SentenceSplitter.makeSentences(xml, extractedFilename, None)
    print >> sys.stderr, "Inserting McCC parses"
    Tools.BLLIPParser.insertParses(xml, extractedFilename, None, extraAttributes={"source": "TEES-preparsed"})
    print >> sys.stderr, "Inserting Stanford conversions"
    Tools.StanfordParser.insertParses(xml, extractedFilename, None, extraAttributes={"stanfordSource": "TEES-preparsed"})
    print >> sys.stderr, "Protein Name Splitting"
    splitTarget = "McCC"
    #ProteinNameSplitter.mainFunc(xml, None, splitTarget, splitTarget, "split-"+splitTarget, "split-"+splitTarget)
    ProteinNameSplitter.mainFunc(xml, None, splitTarget, removeOld=True)
    print >> sys.stderr, "Head Detection"
    #xml = FindHeads.findHeads(xml, "split-"+splitTarget, tokenization=None, output=None, removeExisting=True)
    xml = FindHeads.findHeads(xml, splitTarget, tokenization=None, output=None, removeExisting=True)
    print >> sys.stderr, "Dividing into sets"
    Utils.InteractionXML.DivideSets.processCorpus(xml, outDir, "DDI11", ".xml")

    Stream.closeLog(logFileName)
    if not debug:
        print >> sys.stderr, "Removing temporary directory", tempdir
        shutil.rmtree(tempdir)
    os.chdir(cwd)
def convertChemProt(inDirs=None, setNames=None, outPath=None, goldTestSet=True, downloadDir=None, extractDir=None, redownload=False, debug=False):
    tempDir = None
    if inDirs == None:
        print >> sys.stderr, "---------------", "Downloading ChemProt files", "---------------"
        if extractDir == None:
            tempDir = tempfile.mkdtemp()
        inDirs = []
        for setName in ("TRAIN", "DEVEL", "TEST"):
            if goldTestSet and setName == "TEST":
                setName = "TEST_GOLD"
            if Settings.URL["CP17_" + setName] != None:
                currentExtractDir = extractDir if extractDir else tempDir
                currentExtractDir = os.path.join(currentExtractDir, setName.lower())
                inDirs.append(downloadFile(Settings.URL["CP17_" + setName], downloadDir, currentExtractDir, redownload))
    print >> sys.stderr, "Reading ChemProt corpus from input", inDirs, "using dataset mapping", setNames
    dataSets = OrderedDict()
    for inDir in inDirs:
        print >> sys.stderr, "Reading input directory", inDir
        filenames = os.listdir(inDir)
        filetypes = ["_abstracts", "_entities", "_relations"]
        # Collect the file paths for the data types
        dirDataSets = set()
        for filename in filenames:
            if not (filename.endswith(".tsv") and any([x in filename for x in filetypes])):
                continue
            dataSetId, dataType = filename.replace("_gs", "").rsplit("_", 1)
            if setNames != None:
                dataSetId = setNames.get(dataSetId, dataSetId)
            dirDataSets.add(dataSetId)
            dataType = dataType.split(".")[0]
            if dataSetId not in dataSets:
                dataSets[dataSetId] = {}
            assert dataType not in dataSets[dataSetId]
            dataSets[dataSetId][dataType] = os.path.join(inDir, filename)
        print >> sys.stderr, "Found ChemProt datasets", list(dirDataSets), "at", inDir
    print >> sys.stderr, "Read datasets:", dataSets.keys()

    # Build the Interaction XML
    print >> sys.stderr, "Converting to Interaction XML"
    corpusName = "CP17"
    corpus = ET.Element("corpus", {"source": corpusName})
    counts = defaultdict(int)
    docById = {}
    entityById = {}
    entitiesByDoc = {}
    docsWithErrors = set()
    for dataSetId in sorted(dataSets.keys()):
        prevCounts = copy.copy(counts)
        print >> sys.stderr, "---", "Building elements for dataset", dataSetId, "---"
        dataSet = dataSets[dataSetId]
        counts["sets"] += 1
        with open(dataSet["abstracts"], "rt") as f:
            print >> sys.stderr, "Adding document elements for dataset", dataSetId
            for row in UnicodeDictReader(f, delimiter="\t", fieldnames=["id", "title", "abstract"], quoting=csv.QUOTE_NONE):
                document = ET.Element("document", {"id": corpusName + ".d" + str(counts["documents"]), "origId": row["id"], "set": dataSetId})
                document.set("text", row["title"] + " " + row["abstract"])
                document.set("titleOffset", Range.tuplesToCharOffset((0, len(row["title"]))))
                if document.get("origId") in docById:
                    assert document.get("text") == docById[document.get("origId")].get("text")
                    assert document.get("titleOffset") == docById[document.get("origId")].get("titleOffset")
                    counts["duplicate-documents"] += 1
                else:
                    corpus.append(document)
                    docById[document.get("origId")] = document
                    counts["documents"] += 1
        with open(dataSet["entities"], "rt") as f:
            print >> sys.stderr, "Adding entity elements for dataset", dataSetId
            for row in UnicodeDictReader(f, delimiter="\t", fieldnames=["docId", "id", "type", "begin", "end", "text"], quoting=csv.QUOTE_NONE):
                document = docById[row["docId"]]
                assert row["type"] in ("CHEMICAL", "GENE-Y", "GENE-N")
                # Check for duplicate entities
                if row["docId"] not in entitiesByDoc:
                    entitiesByDoc[row["docId"]] = set()
                assert row["id"] not in entitiesByDoc[row["docId"]]
                entitiesByDoc[row["docId"]].add(row["id"])
                # Determine the offset
                offset = (int(row["begin"]), int(row["end"]))
                docSpan = document.get("text")[offset[0]:offset[1]]
                if docSpan == row["text"]:
                    entity = ET.SubElement(document, "entity", {"id": document.get("id") + ".e" + str(len([x for x in document.findall("entity")]))})
                    entity.set("given", "True")
                    entity.set("origId", row["id"])
                    entity.set("type", row["type"].split("-")[0])
                    entity.set("normalized", "True" if row["type"].endswith("-Y") else "False")
                    entity.set("charOffset", Range.tuplesToCharOffset((offset[0], offset[1])))
                    entity.set("text", row["text"])
                    if row["docId"] not in entityById:
                        entityById[row["docId"]] = {}
                    assert entity.get("origId") not in entityById[row["docId"]]
                    entityById[row["docId"]][entity.get("origId")] = entity
                    counts["entities"] += 1
                else:
                    print >> sys.stderr, "Alignment error in document", row["docId"], (offset, docSpan, row)
                    counts["entities-error"] += 1
                    docsWithErrors.add(row["docId"])
        if "relations" in dataSet:
            print >> sys.stderr, "Adding relation elements for dataset", dataSetId
            with open(dataSet["relations"], "rt") as f:
                for row in UnicodeDictReader(f, delimiter="\t", fieldnames=["docId", "group", "groupEval", "type", "arg1", "arg2"], quoting=csv.QUOTE_NONE):
                    for argId in ("1", "2"):
                        assert row["arg" + argId].startswith("Arg" + argId + ":")
                        row["arg" + argId] = row["arg" + argId][5:]
                    document = docById[row["docId"]]
                    e1 = entityById[row["docId"]].get(row["arg1"])
                    e2 = entityById[row["docId"]].get(row["arg2"])
                    if e1 != None and e2 != None:
                        interaction = ET.SubElement(document, "interaction", {"id": document.get("id") + ".i" + str(len([x for x in document.findall("interaction")]))})
                        interaction.set("directed", "True")
                        interaction.set("type", row["group"])
                        interaction.set("relType", row["type"])
                        row["groupEval"] = row["groupEval"].strip()
                        assert row["groupEval"] in ("Y", "N")
                        interaction.set("evaluated", "True" if row["groupEval"] == "Y" else "False")
                        interaction.set("e1", e1.get("id"))
                        interaction.set("e2", e2.get("id"))
                        counts["interactions"] += 1
                    else:
                        counts["interaction-error"] += 1
                        docsWithErrors.add(row["docId"])
        else:
            print >> sys.stderr, "No relations for dataset", dataSetId
        print >> sys.stderr, "dataset", dataSetId, {x: counts[x] - prevCounts.get(x, 0) for x in counts if counts[x] - prevCounts.get(x, 0) > 0}
    if len(docsWithErrors) > 0:
        counts["documents-with-errors"] = len(docsWithErrors)
    print >> sys.stderr, "---", "All Datasets Done", "---"
    print >> sys.stderr, "ChemProt conversion:", dict(counts)
    if tempDir != None and not debug:
        print >> sys.stderr, "Removing temporary directory", tempDir
        shutil.rmtree(tempDir)
    if outPath != None:
        ETUtils.write(corpus, outPath)
    return ET.ElementTree(corpus)
            elem.tail = i

def parse_xml(data, encoding="utf-8", dump_invalid_data=False):
    try:
        p = ET.XMLParser(encoding=encoding)
    except exceptions.TypeError:
        p = ET.XMLParser()
    # my version of twisted.web returns page_infos as a dictionary in
    # the second item of the data list
    if isinstance(data, (list, tuple)):
        data, _ = data
    try:
        data = data.encode(encoding)
    except UnicodeDecodeError:
        pass
    # Guess from who we're getting this?
    data = data.replace('\x00', '')
    try:
        p.feed(data)
    except Exception, error:
        if dump_invalid_data:
            print error, repr(data)
        p.close()
        raise Exception, error
    else:
        return ET.ElementTree(p.close())
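# Hedged usage sketch for parse_xml() above (not part of the original source); it assumes
# ET is xml.etree.ElementTree (or cElementTree) imported under that alias.
def _parse_xml_example():
    tree = parse_xml('<root><item id="1"/></root>')
    print tree.getroot().find("item").get("id")  # -> 1
    # A (data, page_infos) pair, as returned by the twisted.web code mentioned in the
    # comment above, is also accepted; only the first item is parsed.
    tree = parse_xml(('<root/>', {"status": "200"}))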
    print >> sys.stderr, 'Error: Module', module, 'or package', package, 'not found in CPAN.'
    sys.exit(1)
modules.sort()
if package_version:
    version = package_version
location = path
#print >>sys.stderr, 'We found package %s with version %s with modules:' % (package, version)
ppath = path.split('/')
mnemo = ppath[2]

### Find specific author in CPAN authors list
tree = ElementTree.ElementTree(file=os.path.join(tmppath, '00whois.xml'))
root = tree.getroot()
for elem in root.getiterator('{http://www.cpan.org/xmlns/whois}cpanid'):
    if mnemo == elem.find('{http://www.cpan.org/xmlns/whois}id').text:
        try:
            authorel = elem.find('{http://www.cpan.org/xmlns/whois}fullname')
            emailel = elem.find('{http://www.cpan.org/xmlns/whois}email')
            email = emailel.text.replace('@', '$').replace('.', ',').replace(' at ', '$').replace(' dot ', ',').replace(' [at] ', '$').replace('_dot_', ',')
            author = "%s <%s>" % (authorel.text, email)
            authors.append(author.encode('utf8', 'replace'))
        except:
            pass
        break