def convertDownloaded(outdir, corpus, files, intermediateFiles=True, evaluate=True):
    """
    Convert a downloaded Shared Task corpus into Interaction XML files.

    The devel, train and test sets are loaded and merged into one XML tree,
    so that every processing step is applied equally to all of them. The
    tree is then divided back into per-set files under 'outdir'.

    Parameters:
    outdir -- directory that receives all output files
    corpus -- corpus identifier; used in output file names and as the prefix
              of the keys looked up in 'files'
    files -- mapping with the keys '<corpus>_DEVEL', '<corpus>_TRAIN' and
             '<corpus>_TEST'; each value is passed to ST.loadSet as the source
    intermediateFiles -- if true, the combined corpus is also written out
                         after the document, sentence and final stages
    evaluate -- if true, the documents are validated and written to a fresh
                '<outdir>/conversion/<corpus>' work directory, and the devel
                set is converted back to Shared Task format and evaluated
    """
    # moveBI is a module-level name; it is only read here (BI set mixing)
    global moveBI
    if evaluate:
        # Always start from an empty work directory
        workdir = outdir + "/conversion/" + corpus
        if os.path.exists(workdir):
            shutil.rmtree(workdir)
        os.makedirs(workdir)

    print >> sys.stderr, "---------------", "Converting to XML", "---------------"
    # All datasets are processed as one XML, to ensure all the steps (parse modification etc.) are
    # applied equally
    datasets = ["devel", "train", "test"]
    bigfileName = os.path.join(outdir, corpus + "-" + "-and-".join(datasets))
    documents = []
    # Only the EPI corpus is loaded with sitesAreArguments enabled
    sitesAreArguments = (corpus == "EPI")
    for setName in datasets:
        sourceFile = files[corpus + "_" + setName.upper()]
        print >> sys.stderr, "Reading", setName, "set from", sourceFile
        docs = ST.loadSet(sourceFile, setName, "a2", sitesAreArguments=sitesAreArguments)
        print >> sys.stderr, "Read", len(docs), "documents"
        documents.extend(docs)

        # The license text is taken from the first document of the set. The
        # 'with' block guarantees the file is closed even if the write fails.
        if len(docs) > 0 and docs[0].license is not None:
            with open(os.path.join(outdir, corpus + "-LICENSE"), "wt") as licenseFile:
                licenseFile.write(docs[0].license)

    print >> sys.stderr, "Resolving equivalences"
    Utils.STFormat.Equiv.process(documents)

    if evaluate:
        print >> sys.stderr, "Checking data validity"
        for doc in documents:
            Utils.STFormat.Validate.validate(doc.events, simulation=True, verbose=True, docId=doc.id)
        print >> sys.stderr, "Writing all documents to geniaformat"
        ST.writeSet(documents, os.path.join(workdir, "all-geniaformat"), resultFileTag="a2", debug=False, task=2, validate=False)

    if intermediateFiles:
        print >> sys.stderr, "Converting to XML, writing combined corpus to", bigfileName+"-documents.xml"
        xml = STConvert.toInteractionXML(documents, corpus, bigfileName+"-documents.xml")
    else:
        print >> sys.stderr, "Converting to XML"
        xml = STConvert.toInteractionXML(documents, corpus, None)

    # Corpus-specific adjustments
    if corpus == "BI":
        Utils.InteractionXML.MixSets.mixSets(xml, None, set(moveBI), "train", "devel")
    if corpus == "REN":
        corpusRENtoASCII(xml)

    addAnalyses(xml, corpus, datasets, files, bigfileName)
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName+"-sentences.xml"
        ETUtils.write(xml, bigfileName+"-sentences.xml")
    processParses(xml)

    print >> sys.stderr, "---------------", "Writing corpora", "---------------"
    # Write out converted data
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName+".xml"
        ETUtils.write(xml, bigfileName+".xml")
    print >> sys.stderr, "Dividing into sets"
    Utils.InteractionXML.DivideSets.processCorpus(xml, outdir, corpus, ".xml")

    if evaluate and "devel" in datasets:
        print >> sys.stderr, "---------------", "Evaluating conversion", "---------------"
        develXML = os.path.join(outdir, corpus + "-devel.xml")
        roundtripStem = workdir + "/roundtrip/" + corpus + "-devel"
        if corpus != "REL": # Task 1 (removal of Entity-entities) cannot work for REL
            print >> sys.stderr, "Evaluating task 1 back-conversion"
            STConvert.toSTFormat(develXML, roundtripStem + "-task1", outputTag="a2", task=1)
            BioNLP11GeniaTools.evaluate(roundtripStem + "-task1", corpus + ".1")
        print >> sys.stderr, "Evaluating task 2 back-conversion"
        STConvert.toSTFormat(develXML, roundtripStem + "-task2", outputTag="a2", task=2)
        BioNLP11GeniaTools.evaluate(roundtripStem + "-task2", corpus + ".2")
        print >> sys.stderr, "Note! Evaluation of Task 2 back-conversion can be less than 100% due to site-argument mapping"
def convertDownloaded(outdir, corpus, files, intermediateFiles=True, evaluate=True, processEquiv=True, addAnalyses=True, packageSubPath=None):
    """
    Convert a downloaded Shared Task corpus into Interaction XML files.

    Every set (devel, train, test) that has an entry in 'files' is loaded.
    The sets are merged into one XML tree, so that every processing step is
    applied equally to all of them, and the tree is then divided back into
    per-set files under 'outdir'. A structure analysis of the resulting
    corpus is printed to stderr at the end.

    Parameters:
    outdir -- directory that receives all output files
    corpus -- corpus identifier; used in output file names and as the prefix
              of the keys looked up in 'files'
    files -- mapping that may contain the keys '<corpus>_DEVEL',
             '<corpus>_TRAIN' and '<corpus>_TEST'; each value is passed to
             ST.loadSet as the source
    intermediateFiles -- if true, the combined corpus is also written out
                         after the document, sentence and final stages
    evaluate -- if true, the documents are written to a fresh
                '<outdir>/conversion/<corpus>' work directory and, when a
                devel set was loaded, it is converted back to Shared Task
                format and evaluated
    processEquiv -- if true, Utils.STFormat.Equiv.process is run on the
                    loaded documents
    addAnalyses -- if true, insertAnalyses is called on the combined XML
    packageSubPath -- forwarded to ST.loadSet (as subPath) and insertAnalyses
    """
    # moveBI is a module-level name; it is only read here (BI11 set mixing)
    global moveBI
    if evaluate:
        # Always start from an empty work directory
        workdir = outdir + "/conversion/" + corpus
        if os.path.exists(workdir):
            shutil.rmtree(workdir)
        os.makedirs(workdir)

    print >> sys.stderr, "---------------", "Converting to XML", "---------------"
    # All datasets are processed as one XML, to ensure all the steps (parse modification etc.) are
    # applied equally
    #print corpus, files
    datasets = []
    for setName in ["devel", "train", "test"]:
        if corpus + "_" + setName.upper() in files:
            datasets.append(setName)
    bigfileName = os.path.join(outdir, corpus + "-" + "-and-".join(datasets))
    documents = []
    for setName in datasets:
        sourceFile = files[corpus + "_" + setName.upper()]
        print >> sys.stderr, "Reading", setName, "set from", sourceFile
        docs = ST.loadSet(sourceFile, setName, "a2", subPath=packageSubPath)
        print >> sys.stderr, "Read", len(docs), "documents"
        documents.extend(docs)

        # The license text is taken from the first document of the set. The
        # 'with' block guarantees the file is closed even if the write fails.
        if len(docs) > 0 and docs[0].license is not None:
            with open(os.path.join(outdir, corpus + "-LICENSE"), "wt") as licenseFile:
                licenseFile.write(docs[0].license)

    if processEquiv:
        print >> sys.stderr, "Resolving equivalences"
        Utils.STFormat.Equiv.process(documents)
    else:
        print >> sys.stderr, "Skipping resolving of equivalences"

    if evaluate:
        #print >> sys.stderr, "Checking data validity"
        #for doc in documents:
        #    Utils.STFormat.Validate.validate(doc.events, simulation=True, verbose=True, docId=doc.id)
        print >> sys.stderr, "Writing all documents to geniaformat"
        ST.writeSet(documents, os.path.join(workdir, "all-geniaformat"), resultFileTag="a2", debug=False)

    if intermediateFiles:
        print >> sys.stderr, "Converting to XML, writing combined corpus to", bigfileName+"-documents.xml"
        xml = STConvert.toInteractionXML(documents, corpus, bigfileName+"-documents.xml")
    else:
        print >> sys.stderr, "Converting to XML"
        xml = STConvert.toInteractionXML(documents, corpus, None)

    # Corpus-specific adjustments
    if corpus == "BI11":
        Utils.InteractionXML.MixSets.mixSets(xml, None, set(moveBI), "train", "devel")
    if corpus == "REN11":
        corpusRENtoASCII(xml)

    if addAnalyses:
        insertAnalyses(xml, corpus, datasets, files, bigfileName, packageSubPath=packageSubPath)
    else:
        print >> sys.stderr, "Skipping adding analyses"
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName+"-sentences.xml"
        ETUtils.write(xml, bigfileName+"-sentences.xml")
    processParses(xml)

    # A hack for GRN13 task that breaks the official BioNLP Shared Task convention of trigger and event having the same type.
    # Let's remove the unused triggers, so that there won't be an unusable node class. There is no clean way to fix this,
    # as the GRN13 task not following the official rules introduces yet another mechanism into the Shared Task format,
    # and supporting this would require rewriting everything.
    if corpus == "GRN13":
        Utils.InteractionXML.DeleteElements.processCorpus(xml, None, {"entity":{"type":["Action"]}})

    print >> sys.stderr, "---------------", "Writing corpora", "---------------"
    checkAttributes(xml)
    # Write out converted data
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName+".xml"
        ETUtils.write(xml, bigfileName+".xml")
    print >> sys.stderr, "Dividing into sets"
    Utils.InteractionXML.DivideSets.processCorpus(xml, outdir, corpus, ".xml")

    if evaluate and "devel" in datasets:
        print >> sys.stderr, "---------------", "Evaluating conversion", "---------------"
        develXML = os.path.join(outdir, corpus + "-devel.xml")
        roundtripStem = workdir + "/roundtrip/" + corpus + "-devel"
        if corpus != "REL11": # Task 1 (removal of Entity-entities) cannot work for REL
            print >> sys.stderr, "Evaluating task 1 back-conversion"
            # Task 1 output is produced by leaving out the Site arguments
            STConvert.toSTFormat(develXML, roundtripStem + "-task1", outputTag="a2", skipArgs=["Site"])
            BioNLP11GeniaTools.evaluate(roundtripStem + "-task1", corpus + ".1")
        print >> sys.stderr, "Evaluating task 2 back-conversion"
        STConvert.toSTFormat(develXML, roundtripStem + "-task2", outputTag="a2")
        BioNLP11GeniaTools.evaluate(roundtripStem + "-task2", corpus + ".2")
        print >> sys.stderr, "Note! Evaluation of Task 2 back-conversion can be less than 100% due to site-argument mapping"

    # Check what was produced by the conversion
    print >> sys.stderr, "---------------", "Corpus Structure Analysis", "---------------"
    analyzer = StructureAnalyzer()
    analyzer.analyze([xml])
    print >> sys.stderr, analyzer.toString()
def convertDownloaded(outdir, corpus, files, intermediateFiles=True, evaluate=True, processEquiv=True, analysisMode="INSERT", packageSubPath=None, debug=False):
    """
    Convert a downloaded Shared Task corpus into Interaction XML files.

    Every set (devel, train, test) that has an entry in 'files' is loaded.
    The sets are merged into one XML tree, so that every processing step is
    applied equally to all of them, and the tree is then divided back into
    per-set files under 'outdir'. A structure analysis of the resulting
    corpus is printed to stderr at the end.

    Parameters:
    outdir -- directory that receives all output files
    corpus -- corpus identifier; used in output file names and as the prefix
              of the keys looked up in 'files'
    files -- mapping that may contain the keys '<corpus>_DEVEL',
             '<corpus>_TRAIN' and '<corpus>_TEST'; each value is passed to
             ST.loadSet as the source
    intermediateFiles -- if true, the combined corpus is also written out
                         at intermediate stages
    evaluate -- if true, the documents are written to a fresh
                '<outdir>/conversion/<corpus>' work directory and, when a
                devel set was loaded, it is converted back to Shared Task
                format and evaluated
    processEquiv -- if true, Utils.STFormat.Equiv.process is run on the
                    loaded documents
    analysisMode -- "INSERT" calls insertAnalyses followed by processParses,
                    "BUILD" calls parseXML, any other value skips analyses
    packageSubPath -- forwarded to ST.loadSet (as subPath) and insertAnalyses
    debug -- forwarded to parseXML in "BUILD" mode
    """
    # moveBI is a module-level name; it is only read here (BI11 set mixing)
    global moveBI
    if evaluate:
        # Always start from an empty work directory
        workdir = outdir + "/conversion/" + corpus
        if os.path.exists(workdir):
            shutil.rmtree(workdir)
        os.makedirs(workdir)

    print >> sys.stderr, "---------------", "Converting to XML", "---------------"
    # All datasets are processed as one XML, to ensure all the steps (parse modification etc.) are
    # applied equally
    #print corpus, files
    datasets = []
    for setName in ["devel", "train", "test"]:
        if corpus + "_" + setName.upper() in files:
            datasets.append(setName)
    bigfileName = os.path.join(outdir, corpus + "-" + "-and-".join(datasets))
    documents = []
    for setName in datasets:
        sourceFile = files[corpus + "_" + setName.upper()]
        print >> sys.stderr, "Reading", setName, "set from", sourceFile
        docs = ST.loadSet(sourceFile, setName, "a2", subPath=packageSubPath)
        print >> sys.stderr, "Read", len(docs), "documents"
        documents.extend(docs)

        # The license text is taken from the first document of the set. The
        # 'with' block guarantees the file is closed even if the write fails.
        if len(docs) > 0 and docs[0].license is not None:
            with open(os.path.join(outdir, corpus + "-LICENSE"), "wt") as licenseFile:
                licenseFile.write(docs[0].license)

    if processEquiv:
        print >> sys.stderr, "Resolving equivalences"
        Utils.STFormat.Equiv.process(documents)
    else:
        print >> sys.stderr, "Skipping resolving of equivalences"

    if evaluate:
        #print >> sys.stderr, "Checking data validity"
        #for doc in documents:
        #    Utils.STFormat.Validate.validate(doc.events, simulation=True, verbose=True, docId=doc.id)
        print >> sys.stderr, "Writing all documents to geniaformat"
        ST.writeSet(documents, os.path.join(workdir, "all-geniaformat"), resultFileTag="a2", debug=False)

    if intermediateFiles:
        print >> sys.stderr, "Converting to XML, writing combined corpus to", bigfileName + "-documents.xml"
        xml = STConvert.toInteractionXML(documents, corpus, bigfileName + "-documents.xml")
    else:
        print >> sys.stderr, "Converting to XML"
        xml = STConvert.toInteractionXML(documents, corpus, None)

    # Corpus-specific adjustments
    if corpus == "BI11":
        Utils.InteractionXML.MixSets.mixSets(xml, None, set(moveBI), "train", "devel")
    if corpus == "REN11":
        corpusRENtoASCII(xml)

    if analysisMode == "INSERT":
        insertAnalyses(xml, corpus, datasets, files, bigfileName, packageSubPath=packageSubPath)
        if intermediateFiles:
            print >> sys.stderr, "Writing combined corpus", bigfileName + "-sentences.xml"
            ETUtils.write(xml, bigfileName + "-sentences.xml")
        processParses(xml)
    elif analysisMode == "BUILD":
        # bbResources is switched on for corpora whose name starts with "BB_"
        parseXML(xml, bigfileName, intermediateFiles, debug, bbResources=(corpus.startswith("BB_")))
    else:
        print >> sys.stderr, "Skipping analyses"

    # A hack for GRN13 task that breaks the official BioNLP Shared Task convention of trigger and event having the same type.
    # Let's remove the unused triggers, so that there won't be an unusable node class. There is no clean way to fix this,
    # as the GRN13 task not following the official rules introduces yet another mechanism into the Shared Task format,
    # and supporting this would require rewriting everything.
    if corpus == "GRN13":
        Utils.InteractionXML.DeleteElements.processCorpus(
            xml, None, {"entity": {"type": ["Action"]}})

    print >> sys.stderr, "---------------", "Writing corpora", "---------------"
    checkAttributes(xml)
    # Write out converted data
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName + ".xml"
        ETUtils.write(xml, bigfileName + ".xml")
    print >> sys.stderr, "Dividing into sets"
    Utils.InteractionXML.DivideSets.processCorpus(xml, outdir, corpus, ".xml")

    if evaluate and "devel" in datasets:
        print >> sys.stderr, "---------------", "Evaluating conversion", "---------------"
        develXML = os.path.join(outdir, corpus + "-devel.xml")
        roundtripStem = workdir + "/roundtrip/" + corpus + "-devel"
        if corpus != "REL11":  # Task 1 (removal of Entity-entities) cannot work for REL
            print >> sys.stderr, "Evaluating task 1 back-conversion"
            # Task 1 output is produced by leaving out the Site arguments
            STConvert.toSTFormat(develXML, roundtripStem + "-task1", outputTag="a2", skipArgs=["Site"])
            BioNLP11GeniaTools.evaluate(roundtripStem + "-task1", corpus + ".1")
        print >> sys.stderr, "Evaluating task 2 back-conversion"
        STConvert.toSTFormat(develXML, roundtripStem + "-task2", outputTag="a2")
        BioNLP11GeniaTools.evaluate(roundtripStem + "-task2", corpus + ".2")
        print >> sys.stderr, "Note! Evaluation of Task 2 back-conversion can be less than 100% due to site-argument mapping"

    # Check what was produced by the conversion
    print >> sys.stderr, "---------------", "Corpus Structure Analysis", "---------------"
    analyzer = StructureAnalyzer()
    analyzer.analyze([xml])
    print >> sys.stderr, analyzer.toString()
def convertDownloaded(outdir, corpus, files, intermediateFiles=True, evaluate=True, processEquiv=True, addAnalyses=True, packageSubPath=None):
    """
    Convert a downloaded Shared Task corpus into Interaction XML files.

    Every set (devel, train, test) that has an entry in 'files' is loaded.
    The sets are merged into one XML tree, so that every processing step is
    applied equally to all of them, and the tree is then divided back into
    per-set files under 'outdir'. A structure analysis of the resulting
    corpus is printed to stderr at the end.

    Parameters:
    outdir -- directory that receives all output files
    corpus -- corpus identifier; used in output file names and as the prefix
              of the keys looked up in 'files'
    files -- mapping that may contain the keys '<corpus>_DEVEL',
             '<corpus>_TRAIN' and '<corpus>_TEST'; each value is passed to
             ST.loadSet as the source
    intermediateFiles -- if true, the combined corpus is also written out
                         after the document, sentence and final stages
    evaluate -- if true, the documents are written to a fresh
                '<outdir>/conversion/<corpus>' work directory and, when a
                devel set was loaded, it is converted back to Shared Task
                format and evaluated
    processEquiv -- if true, Utils.STFormat.Equiv.process is run on the
                    loaded documents
    addAnalyses -- if true, insertAnalyses is called on the combined XML
    packageSubPath -- forwarded to ST.loadSet (as subPath) and insertAnalyses
    """
    # moveBI is a module-level name; it is only read here (BI11 set mixing)
    global moveBI
    if evaluate:
        # Always start from an empty work directory
        workdir = outdir + "/conversion/" + corpus
        if os.path.exists(workdir):
            shutil.rmtree(workdir)
        os.makedirs(workdir)

    print >> sys.stderr, "---------------", "Converting to XML", "---------------"
    # All datasets are processed as one XML, to ensure all the steps (parse modification etc.) are
    # applied equally
    #print corpus, files
    datasets = []
    for setName in ["devel", "train", "test"]:
        if corpus + "_" + setName.upper() in files:
            datasets.append(setName)
    bigfileName = os.path.join(outdir, corpus + "-" + "-and-".join(datasets))
    documents = []
    for setName in datasets:
        sourceFile = files[corpus + "_" + setName.upper()]
        print >> sys.stderr, "Reading", setName, "set from", sourceFile
        docs = ST.loadSet(sourceFile, setName, "a2", subPath=packageSubPath)
        print >> sys.stderr, "Read", len(docs), "documents"
        documents.extend(docs)

        # The license text is taken from the first document of the set. The
        # 'with' block guarantees the file is closed even if the write fails.
        if len(docs) > 0 and docs[0].license is not None:
            with open(os.path.join(outdir, corpus + "-LICENSE"), "wt") as licenseFile:
                licenseFile.write(docs[0].license)

    if processEquiv:
        print >> sys.stderr, "Resolving equivalences"
        Utils.STFormat.Equiv.process(documents)
    else:
        print >> sys.stderr, "Skipping resolving of equivalences"

    if evaluate:
        #print >> sys.stderr, "Checking data validity"
        #for doc in documents:
        #    Utils.STFormat.Validate.validate(doc.events, simulation=True, verbose=True, docId=doc.id)
        print >> sys.stderr, "Writing all documents to geniaformat"
        ST.writeSet(documents, os.path.join(workdir, "all-geniaformat"), resultFileTag="a2", debug=False)

    if intermediateFiles:
        print >> sys.stderr, "Converting to XML, writing combined corpus to", bigfileName + "-documents.xml"
        xml = STConvert.toInteractionXML(documents, corpus, bigfileName + "-documents.xml")
    else:
        print >> sys.stderr, "Converting to XML"
        xml = STConvert.toInteractionXML(documents, corpus, None)

    # Corpus-specific adjustments
    if corpus == "BI11":
        Utils.InteractionXML.MixSets.mixSets(xml, None, set(moveBI), "train", "devel")
    if corpus == "REN11":
        corpusRENtoASCII(xml)

    if addAnalyses:
        insertAnalyses(xml, corpus, datasets, files, bigfileName, packageSubPath=packageSubPath)
    else:
        print >> sys.stderr, "Skipping adding analyses"
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName + "-sentences.xml"
        ETUtils.write(xml, bigfileName + "-sentences.xml")
    processParses(xml)

    print >> sys.stderr, "---------------", "Writing corpora", "---------------"
    checkAttributes(xml)
    # Write out converted data
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName + ".xml"
        ETUtils.write(xml, bigfileName + ".xml")
    print >> sys.stderr, "Dividing into sets"
    Utils.InteractionXML.DivideSets.processCorpus(xml, outdir, corpus, ".xml")

    if evaluate and "devel" in datasets:
        print >> sys.stderr, "---------------", "Evaluating conversion", "---------------"
        develXML = os.path.join(outdir, corpus + "-devel.xml")
        roundtripStem = workdir + "/roundtrip/" + corpus + "-devel"
        if corpus != "REL11":  # Task 1 (removal of Entity-entities) cannot work for REL
            print >> sys.stderr, "Evaluating task 1 back-conversion"
            # Task 1 output is produced by leaving out the Site arguments
            STConvert.toSTFormat(develXML, roundtripStem + "-task1", outputTag="a2", skipArgs=["Site"])
            BioNLP11GeniaTools.evaluate(roundtripStem + "-task1", corpus + ".1")
        print >> sys.stderr, "Evaluating task 2 back-conversion"
        STConvert.toSTFormat(develXML, roundtripStem + "-task2", outputTag="a2")
        BioNLP11GeniaTools.evaluate(roundtripStem + "-task2", corpus + ".2")
        print >> sys.stderr, "Note! Evaluation of Task 2 back-conversion can be less than 100% due to site-argument mapping"

    # Check what was produced by the conversion
    print >> sys.stderr, "---------------", "Corpus Structure Analysis", "---------------"
    analyzer = StructureAnalyzer()
    analyzer.analyze([xml])
    print >> sys.stderr, analyzer.toString()