# NOTE: These functions assume the enclosing module imports sys, os, ETUtils
# and ProgressCounter, and defines insertParse(); only the function-local
# imports are shown here.

def insertParses(input, parsePath, output=None, parseName="McCC", extraAttributes={}):
    """
    Insert pre-built Stanford dependency parses into the sentence elements of
    a corpus. One parse file per document is read from parsePath (optionally
    from inside a .tar.gz archive) and attached to that document's sentences.
    """
    import tarfile
    from SentenceSplitter import openFile

    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    print >> sys.stderr, "Inserting parses from", parsePath
    # The parse path may point inside a .tar.gz archive ("x.tar.gz/inner/dir")
    if parsePath.find(".tar.gz") != -1:
        tarFilePath, parsePath = parsePath.split(".tar.gz")
        tarFilePath += ".tar.gz"
        tarFile = tarfile.open(tarFilePath)
        if parsePath[0] == "/":
            parsePath = parsePath[1:]
    else:
        tarFile = None

    docCount = 0
    failCount = 0
    sentenceCount = 0
    sourceElements = [x for x in corpusRoot.getiterator("document")] + [x for x in corpusRoot.getiterator("section")]
    counter = ProgressCounter(len(sourceElements), "McCC Parse Insertion")
    for document in sourceElements:
        docCount += 1
        docId = document.get("id")
        if docId == None:
            docId = "CORPUS.d" + str(docCount)
        f = openFile(os.path.join(parsePath, document.get("pmid") + ".sd"), tarFile)
        if f == None: # file with BioNLP'11 extension not found, try BioNLP'09 extension
            f = openFile(os.path.join(parsePath, document.get("pmid") + ".dep"), tarFile)
        if f != None:
            sentences = document.findall("sentence")
            # TODO: The following for-loop is the same as when used with a real
            # parser, and should be moved to its own function.
            for sentence in sentences:
                sentenceCount += 1
                counter.update(0, "Processing Documents (" + sentence.get("id") + "/" + document.get("pmid") + "): ")
                if not insertParse(sentence, f, parseName, extraAttributes=extraAttributes):
                    failCount += 1
            f.close()
        counter.update(1, "Processing Documents (" + document.get("id") + "/" + document.get("pmid") + "): ")

    if tarFile != None:
        tarFile.close()
    print >> sys.stderr, "Stanford conversion was inserted into", sentenceCount, "sentences,", failCount, "failed"

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
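# A minimal usage sketch for the variant above. The file names here
# ("corpus.xml", "parses.tar.gz/mccc/sd") are hypothetical; the corpus XML is
# assumed to already contain sentence elements that line up line-for-line
# with the pre-built .sd/.dep files.
parsedCorpus = insertParses("corpus.xml", "parses.tar.gz/mccc/sd",
                            output="corpus-parsed.xml", parseName="McCC",
                            extraAttributes={"source": "preparsed"})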
def insertParses(input, parsePath, output=None, parseName="McCC", tokenizationName=None, makePhraseElements=True, extraAttributes={}):
    """
    Insert pre-built Penn treebank parses into the sentence elements of a
    corpus. Each document's parse file must contain exactly one tree line
    per corpus sentence.
    """
    import tarfile
    from SentenceSplitter import openFile

    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    print >> sys.stderr, "Inserting parses from", parsePath
    if parsePath.find(".tar.gz") != -1:
        tarFilePath, parsePath = parsePath.split(".tar.gz")
        tarFilePath += ".tar.gz"
        assert os.path.exists(tarFilePath)
        tarFile = tarfile.open(tarFilePath)
        if parsePath[0] == "/":
            parsePath = parsePath[1:]
    else:
        assert os.path.exists(parsePath)
        tarFile = None

    docCount = 0
    failCount = 0
    numCorpusSentences = 0
    sourceElements = [x for x in corpusRoot.getiterator("document")] + [x for x in corpusRoot.getiterator("section")]
    counter = ProgressCounter(len(sourceElements), "McCC Parse Insertion")
    for document in sourceElements:
        docCount += 1
        # Resolve the external id used for naming the parse files
        origId = document.get("pmid")
        if origId == None:
            origId = document.get("origId")
        if origId == None:
            origId = document.get("id")
        origId = str(origId)
        counter.update(1, "Processing Documents (" + document.get("id") + "/" + origId + "): ")
        docId = document.get("id")
        if docId == None:
            docId = "CORPUS.d" + str(docCount)
        f = openFile(os.path.join(parsePath, origId + ".ptb"), tarFile)
        if f == None: # file with BioNLP'11 extension not found, try BioNLP'09 extension
            f = openFile(os.path.join(parsePath, origId + ".pstree"), tarFile)
        if f == None: # no parse found
            continue
        parseStrings = f.readlines()
        f.close()
        sentences = document.findall("sentence")
        numCorpusSentences += len(sentences)
        # One tree line per sentence is required
        assert len(sentences) == len(parseStrings)
        # TODO: The following for-loop is the same as when used with a real
        # parser, and should be moved to its own function.
        for sentence, treeLine in zip(sentences, parseStrings):
            if not insertParse(sentence, treeLine, makePhraseElements=makePhraseElements, extraAttributes=extraAttributes, docId=origId):
                failCount += 1

    if tarFile != None:
        tarFile.close()
    print >> sys.stderr, "Inserted parses for", numCorpusSentences, "sentences (" + str(failCount) + " failed)"
    if failCount == 0:
        print >> sys.stderr, "All sentences have a parse"
    else:
        print >> sys.stderr, "Warning, a failed parse exists for", failCount, "out of", numCorpusSentences, "sentences"
        print >> sys.stderr, "The \"pennstring\" attribute of these sentences is an empty string."

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
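# A sketch of the helper that the TODO above asks for: the per-sentence
# insertion loop factored out so this pre-parsed reader and a real parser
# wrapper could share it. The helper's name and its return value (the number
# of failed insertions) are assumptions, not part of the original module.
def insertParsesForSentences(sentences, parseStrings, makePhraseElements, extraAttributes, docId):
    failCount = 0
    for sentence, treeLine in zip(sentences, parseStrings):
        if not insertParse(sentence, treeLine, makePhraseElements=makePhraseElements,
                           extraAttributes=extraAttributes, docId=docId):
            failCount += 1
    return failCount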
def insertParses(input, parsePath, output=None, parseName="McCC", tokenizationName=None, makePhraseElements=True, extraAttributes={}):
    """
    Insert pre-built Penn treebank parses into the sentence elements of a
    corpus. Unlike the variant above, this one requires each document to have
    a "pmid" attribute, which is used to name its parse file.
    """
    import tarfile
    from SentenceSplitter import openFile

    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    print >> sys.stderr, "Inserting parses from", parsePath
    if parsePath.find(".tar.gz") != -1:
        tarFilePath, parsePath = parsePath.split(".tar.gz")
        tarFilePath += ".tar.gz"
        tarFile = tarfile.open(tarFilePath)
        if parsePath[0] == "/":
            parsePath = parsePath[1:]
    else:
        tarFile = None

    docCount = 0
    failCount = 0
    numCorpusSentences = 0
    sourceElements = [x for x in corpusRoot.getiterator("document")] + [x for x in corpusRoot.getiterator("section")]
    counter = ProgressCounter(len(sourceElements), "McCC Parse Insertion")
    for document in sourceElements:
        docCount += 1
        counter.update(1, "Processing Documents (" + document.get("id") + "/" + document.get("pmid") + "): ")
        docId = document.get("id")
        if docId == None:
            docId = "CORPUS.d" + str(docCount)
        f = openFile(os.path.join(parsePath, document.get("pmid") + ".ptb"), tarFile)
        if f == None: # file with BioNLP'11 extension not found, try BioNLP'09 extension
            f = openFile(os.path.join(parsePath, document.get("pmid") + ".pstree"), tarFile)
        if f == None: # no parse found
            continue
        parseStrings = f.readlines()
        f.close()
        sentences = document.findall("sentence")
        numCorpusSentences += len(sentences)
        # One tree line per sentence is required
        assert len(sentences) == len(parseStrings)
        # TODO: The following for-loop is the same as when used with a real
        # parser, and should be moved to its own function.
        for sentence, treeLine in zip(sentences, parseStrings):
            if not insertParse(sentence, treeLine, makePhraseElements=makePhraseElements, extraAttributes=extraAttributes):
                failCount += 1

    if tarFile != None:
        tarFile.close()
    print >> sys.stderr, "Inserted parses for", numCorpusSentences, "sentences (" + str(failCount) + " failed)"
    if failCount == 0:
        print >> sys.stderr, "All sentences have a parse"
    else:
        print >> sys.stderr, "Warning, a failed parse exists for", failCount, "out of", numCorpusSentences, "sentences"
        print >> sys.stderr, "The \"pennstring\" attribute of these sentences is an empty string."

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
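# A hypothetical call for the treebank variants: one .ptb (or .pstree) file
# per document under /data/parses/ptb, one tree line per corpus sentence,
# inserted as a "McCC" parse with phrase elements. The paths are assumptions.
parsedCorpus = insertParses("corpus.xml", "/data/parses/ptb",
                            output="corpus-parsed.xml", parseName="McCC",
                            makePhraseElements=True)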
def insertParses(input, parsePath, output=None, parseName="McCC", extraAttributes={}, skipExtra=0):
    """
    Insert pre-built Stanford dependency parses into the sentence elements of
    a corpus, trying the file name extensions of several BioNLP shared task
    releases in turn.
    """
    import tarfile
    from SentenceSplitter import openFile

    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    print >> sys.stderr, "Inserting parses from", parsePath
    if parsePath.find(".tar.gz") != -1:
        tarFilePath, parsePath = parsePath.split(".tar.gz")
        tarFilePath += ".tar.gz"
        assert os.path.exists(tarFilePath)
        tarFile = tarfile.open(tarFilePath)
        if parsePath[0] == "/":
            parsePath = parsePath[1:]
    else:
        assert os.path.exists(parsePath)
        tarFile = None

    docCount = 0
    failCount = 0
    sentenceCount = 0
    sourceElements = [x for x in corpusRoot.getiterator("document")] + [x for x in corpusRoot.getiterator("section")]
    counter = ProgressCounter(len(sourceElements), "McCC Parse Insertion")
    for document in sourceElements:
        docCount += 1
        docId = document.get("id")
        if docId == None:
            docId = "CORPUS.d" + str(docCount)
        # Resolve the external id used for naming the parse files
        origId = document.get("pmid")
        if origId == None:
            origId = document.get("origId")
        if origId == None:
            origId = document.get("id")
        origId = str(origId)
        f = openFile(os.path.join(parsePath, origId + ".sd"), tarFile)
        if f == None: # file with BioNLP'11 extension not found, try BioNLP'09 extension
            f = openFile(os.path.join(parsePath, origId + ".dep"), tarFile)
        if f == None: # file with BioNLP'09 extension not found, try BioNLP'13 extension
            f = openFile(os.path.join(parsePath, origId + ".sdepcc"), tarFile)
        if f == None: # try the other BioNLP'13 extension
            f = openFile(os.path.join(parsePath, origId + ".sdep"), tarFile)
        if f != None:
            sentences = document.findall("sentence")
            # TODO: The following for-loop is the same as when used with a real
            # parser, and should be moved to its own function.
            for sentence in sentences:
                sentenceCount += 1
                counter.update(0, "Processing Documents (" + sentence.get("id") + "/" + origId + "): ")
                if not insertParse(sentence, f, parseName, extraAttributes=extraAttributes, skipExtra=skipExtra):
                    failCount += 1
            f.close()
        counter.update(1, "Processing Documents (" + document.get("id") + "/" + origId + "): ")

    if tarFile != None:
        tarFile.close()
    print >> sys.stderr, "Stanford conversion was inserted into", sentenceCount, "sentences,", failCount, "failed"

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
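# The chain of fallback extensions above could be collapsed into a loop.
# A sketch, with the extension list taken verbatim from the function body;
# the helper name is hypothetical:
def openDependencyFile(parsePath, origId, tarFile):
    from SentenceSplitter import openFile
    # Try the known dependency-file extensions in order, newest convention first
    for ext in (".sd", ".dep", ".sdepcc", ".sdep"):
        f = openFile(os.path.join(parsePath, origId + ext), tarFile)
        if f != None:
            return f
    return None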