def __init__(self, nbest=10, overparsing=10, only_parse=False, stop_words=None):
    """Load the GENIA+PubMed BLLIP model and set up stemming / stop words.

    Args:
        nbest: forwarded to ``set_parser_options`` as ``nbest``.
        overparsing: forwarded to ``set_parser_options`` as ``overparsing``.
        only_parse: stored on ``self.only_parse``.
        stop_words: stop-word collection to use; when ``None`` the NLTK
            English stop-word list is used instead.

    Raises:
        ImportError: if ``bllipparser`` cannot be imported.
    """
    # WARNING if only_parse=False, BllipParser depends on
    # PyStanfordDependencies: pip install PyStanfordDependencies
    try:
        from bllipparser import RerankingParser
    except ImportError:
        raise ImportError(
            'BllipParser not installed, perhaps it is not supported on OS X yet'
        )

    # WARNING this can take a long while. Install manually:
    # `python -mbllipparser.ModelFetcher -i GENIA+PubMed`
    reranker = RerankingParser.fetch_and_load('GENIA+PubMed', verbose=True)

    #: a Reranking Parser created from BllipParser
    self.parser = reranker
    # set parser options
    reranker.set_parser_options(nbest=nbest, overparsing=overparsing)

    #: whether features should be used from the BllipParser
    self.only_parse = only_parse

    #: an instance of LancasterStemmer from NLTK
    self.stemmer = LancasterStemmer()

    if stop_words is None:
        stop_words = stopwords.words('english')
    self.stop_words = stop_words
def load_biomodel(self):
    """Parse every sentence of every document with the GENIA+PubMed model.

    For each sentence in ``self.documents`` the token texts are handed to the
    reranking parser. The top-ranked parse (as a string) is stored on
    ``sentence.bio_parse``; when the parser returns no parse, the sentence's
    existing ``parsetree`` is stored instead. Progress is printed to stdout.
    """
    biomodel = RerankingParser.fetch_and_load('GENIA+PubMed', verbose=True)
    for doc_id in self.documents:
        document = self.documents[doc_id]
        for sentence in document.sentences:
            words = [token.text for token in sentence.tokens]
            nbest = biomodel.parse(words)

            if len(nbest) == 0:
                # Nothing came back from the parser: fall back to the
                # parse tree the sentence already carries.
                print(words)
                print("no parse")
                sentence.bio_parse = sentence.parsetree
                print('')
                continue

            top_parse = nbest[0].ptb_parse
            print(top_parse)
            print(sentence.parsetree)
            print('')
            sentence.bio_parse = str(top_parse)
def parse_reports(data_path, sheet_name, file_path,
                  start_row=910, end_row=3852, column=6):
    """Dependency-parse the findings stored in a spreadsheet column.

    Every cell in ``column`` for rows ``start_row`` (inclusive) to ``end_row``
    (exclusive) is split into sentences; each sentence is parsed with the
    GENIA+PubMed BLLIP model and converted to Stanford dependencies. One line
    per dependency token is appended to ``file_path``, preceded by
    ``finding no.<row>`` / ``sentence no.<index>`` header lines.

    Args:
        data_path: path of the workbook opened with ``xlrd``.
        sheet_name: name of the sheet holding the findings.
        file_path: output text file; opened in append mode.
        start_row: first row index to process (default 910, the value that
            used to be hard-coded).
        end_row: row index to stop before (default 3852).
        column: column index of the finding text (default 6).

    A sentence that fails to parse does not stop the run: ``error!`` is
    printed and ``error!!!`` is written to the output file instead.
    """
    workbook = xlrd.open_workbook(data_path)
    sheet = workbook.sheet_by_name(sheet_name)
    rrp = RerankingParser.fetch_and_load('GENIA+PubMed', verbose=True)
    sd = StanfordDependencies.get_instance(backend='subprocess')

    # Open the output once instead of re-opening it for every written line.
    with open(file_path, mode='a') as out:
        for row in range(start_row, end_row):
            finding = sheet.cell(row, column).value
            out.write('finding no.' + str(row) + '\n')

            for index, sentence in enumerate(sent_tokenize(finding)):
                try:
                    out.write('sentence no.' + str(index) + '\n')
                    tree = rrp.simple_parse(sentence)
                    for token in sd.convert_tree(tree):
                        out.write(str(token) + '\n')
                except Exception:
                    # Best effort: record the failure and carry on. Unlike a
                    # bare ``except:`` this lets KeyboardInterrupt/SystemExit
                    # stop a long run.
                    print('error!')
                    out.write('error!!!\n')
                # Keep the file as up to date as the old per-write reopen did.
                out.flush()
def __init__(self, rawTextFileName=None, outputXMLFileName=None):
    """Set up the Preprocessor and parse the raw text straight away.

    This includes loading the model that is used by several preprocessing
    methods (the RerankingParser).

    Args:
        rawTextFileName (str): The name of the raw string narrative file.
            When it is ``None`` nothing is initialised and a message is
            printed instead.
        outputXMLFileName (str): The name of the BLANK file to contain the
            intermediate output XML
    """
    if rawTextFileName is None:
        print("Need a text file!")
        return

    self.filename = rawTextFileName
    self.textList = {}
    # Initialize the XML file (minimizes XML I/O)
    self.xmlname = outputXMLFileName
    self.rrp = RerankingParser.fetch_and_load('GENIA+PubMed')
    self.parseText()
def __init__(self, nbest=10, overparsing=10, only_parse=False, stop_words=None):
    """Create the BLLIP reranking parser plus the stemmer and stop-word list.

    Args:
        nbest: passed to ``set_parser_options`` as ``nbest``.
        overparsing: passed to ``set_parser_options`` as ``overparsing``.
        only_parse: stored on ``self.only_parse``.
        stop_words: stop words to use; the NLTK English list when ``None``.
    """
    parser_options = dict(nbest=nbest, overparsing=overparsing)

    #: a Reranking Parser created from BllipParser (GENIA+PubMed model)
    self.parser = RerankingParser.fetch_and_load('GENIA+PubMed', verbose=False)
    # set parser options
    self.parser.set_parser_options(**parser_options)

    #: whether features should be used from the BllipParser
    self.only_parse = only_parse

    #: an instance of LancasterStemmer from NLTK
    self.stemmer = LancasterStemmer()

    self.stop_words = (
        stopwords.words('english') if stop_words is None else stop_words
    )
class SentenceParser():
    """Static helpers around one shared BLLIP parser (WSJ-PTB3 model)."""

    # Loaded when the class body is executed and shared by every helper.
    rrp = RerankingParser.fetch_and_load('WSJ-PTB3', verbose=False)

    @staticmethod
    def all_parses(sentence):
        """Return the result of ``rrp.parse`` for ``sentence``."""
        return SentenceParser.rrp.parse(sentence)

    @staticmethod
    def tree_parse(sentence):
        """Return the ``ptb_parse`` of the first entry from ``all_parses``."""
        first = SentenceParser.all_parses(sentence)[0]
        return first.ptb_parse

    @staticmethod
    def parse(sentence):
        """Return a ``Tree`` built from the parser's ``simple_parse`` output."""
        bracketed = SentenceParser.rrp.simple_parse(sentence)
        return Tree.fromstring(bracketed)
def parse_question(cls, question: str):
    """
    Parses given question into NLP tree.

    The parsing model is loaded the first time it is needed and kept on
    ``cls.instance`` for later calls.

    :type question str
    :rtype: bllipparser.RerankingParser.Tree
    """
    parser = cls.instance
    if parser is None:
        log = logging.getLogger(cls.__name__)
        log.info('Loading a parsing model for NLP...')
        # https://pypi.org/project/bllipparser/
        parser = RerankingParser.fetch_and_load('WSJ-PTB3', verbose=False)
        cls.instance = parser
        log.info('Model loaded')

    best = parser.parse(question)[0]
    return best.ptb_parse
def __init__(self, tagger, model):
    """
    Performs all necessary preprocessing

    Makes sure the ``punkt`` tokenizer data is available (downloading it when
    the lookup fails), then creates the NER tagger and the reranking parser.

    :param tagger: Path to the Stanford NER Tagger
    :param model: Path to the model for the NER Tagger
    """
    # check if model for tokenizer exists
    try:
        nltk.data.find('punkt.zip')
    except LookupError:
        # nltk.data.find signals a missing resource with LookupError; only
        # that case should trigger a download (a bare ``except:`` would also
        # swallow KeyboardInterrupt and genuine bugs).
        nltk.download('punkt')

    # init NER parser
    self.nerParser = StanfordNERTagger(tagger, model)

    # init Charniak parser
    self.rerankingParser = RerankingParser.fetch_and_load('WSJ+Gigaword-v2')
# ``Pool`` is the class instantiated below; the lowercase ``pool`` module import
# is kept only because the original file had it.
from multiprocessing import Pool, pool
from bllipparser import RerankingParser
import string


def multi_phrase_parse(s, rrp):
    """Parse column 13 of ``./reverb_out<s>`` into ``./phrase_out<s>``.

    Each line of the input file is split on tabs; the field at index 12 is
    parsed with ``rrp.simple_parse`` and the result is written as one line of
    the output file.

    Args:
        s: file-name suffix shared by the input and output files.
        rrp: object providing ``simple_parse(sentence)``.
    """
    in_path = './reverb_out%s' % s
    out_path = './phrase_out%s' % s
    # The output must be opened for writing (it used to be opened read-only,
    # so the first write failed), and both files are closed on any exit path.
    with open(in_path) as file_in, open(out_path, 'w') as file_out:
        for line in file_in:
            sep_line = line.split('\t')
            sentence = sep_line[12]
            file_out.write(rrp.simple_parse(sentence) + '\n')


def multi_phrase(arg):
    """Unpack a ``(suffix, parser)`` tuple for use with ``Pool.map``."""
    return multi_phrase_parse(*arg)


if __name__ == '__main__':
    p = Pool()
    parse = RerankingParser.fetch_and_load('GENIA+PubMed', verbose=True)
    # Suffixes 'a' .. 'j'. ``ascii_lowercase`` exists on Python 2 and 3 and is
    # not locale dependent, unlike ``string.lowercase``.
    # NOTE(review): the parser object is sent to the workers by pickling;
    # confirm RerankingParser supports that.
    parameter = [(ele, parse) for ele in string.ascii_lowercase[0:10]]
    try:
        p.map(multi_phrase, parameter)
    finally:
        p.close()
        p.join()
def __enter__(self):
    """Load the model named by ``self.model_name`` and return ``self``.

    The loaded reranking parser is stored on ``self.bllip``.
    """
    loaded = RerankingParser.fetch_and_load(self.model_name, verbose=True)
    self.bllip = loaded
    return self
def __init__(self):
    """Initialise the base class, then load the parser into ``self.rrp``."""
    super(PatternLearner, self).__init__()
    model_name = 'GENIA+PubMed'
    self.rrp = RerankingParser.fetch_and_load(model_name, verbose=True)
from bllipparser import RerankingParser
from kasami import TreeScorer
from kasami.normalizers import bllip

# Loading WSJ-PTB3 treebank into bllip's RerankingParser
bllip_rrp = RerankingParser.fetch_and_load('WSJ-PTB3')


def bllip_parse(s):
    """Parse ``s`` and return the normalized tree of the first parse."""
    first = bllip_rrp.parse(s)[0]
    return bllip.normalize_tree(first.ptb_parse)


# Show one parsed sentence: the tree, a depth-limited rendering, and each
# production it yields.
tree = bllip_parse("I am a little teapot")
print(tree)
print(tree.format(depth=1))
for production in tree:
    print(str(production))

# Build a scorer from a small tree bank.
sentences = [
    "I am a little teapot",
    "Here is my handle",
    "Here is my spout",
    "When I get all steamed up I just shout tip me over and pour me out",
    "I am a very special pot",
    "It is true",
    "Here is an example of what I can do",
    "I can turn my handle into a spout",
    "Tip me over and pour me out",
]
teapot_grammar = TreeScorer.from_tree_bank(bllip_parse(s) for s in sentences)

# Score a few unseen sentences against it (results are not stored or printed).
for probe in (
    "Here is a little teapot",
    "It is my handle",
    "I am a spout",
    "Your teapot is gay",
    "Your mom's teapot is asldasnldansldal",
):
    teapot_grammar.score(bllip_parse(probe))
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.spatial.distance import pdist, squareform
import os  # needed by the directory walk below; it was used without being imported
import sys
# Python 2 only: force the process-wide default string encoding to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')
from bllipparser import RerankingParser
import StanfordDependencies

source_path = '/home/admin6019/Downloads/testsentence'

rrp = RerankingParser.fetch_and_load('WSJ-PTB3', verbose=True)
nbest_list = rrp.parse('Why does a zebra have stripes and a giraffe has square spots?')
#questionParsed=rrp.simple_parse('Why does a zebra have stripes and a giraffe has square spots?')

# Inspect the first entry of the n-best list.
best = nbest_list[0]
print(repr(best))
print(best.ptb_parse)  # parse tree
print(best.parser_score)  # parser score
print(best.reranker_score)  # reranker score

tokens = best.ptb_parse.sd_tokens()
for token in tokens:
    print(token)

# List every file below source_path.
for dirpath, dirs, files in os.walk(source_path):
    for name in files:  # was ``file``, which shadows the Python 2 builtin
        fname = os.path.join(dirpath, name)
        print("fname= %s" % fname)
from bllipparser import RerankingParser

# Fetch and load the 'GENIA+PubMed' model. The returned parser is discarded,
# so this is presumably run only for the side effect of making the model
# available locally -- TODO confirm.
RerankingParser.fetch_and_load('GENIA+PubMed')
def _subject_match_score(svo_inp, title, rrp, extractor, scorer):
    """Score ``title`` against the input SVO, or return None if it is unrelated.

    A title is only scored when one of the input's subjects (entries whose
    third field is non-zero) occurs in the lower-cased title tokens, either
    as-is or singularized. The score is the larger of the similarity computed
    with the title's subject/object as extracted and with the two swapped.
    """
    words = None
    for subj in svo_inp['subject']:
        if subj[2] == 0:
            continue
        if words is None:
            # Tokenize lazily: only once a usable subject exists.
            words = [x.lower() for x in sentence_tokenizer.tokenize(title)]
        if subj[0] in words or singularize(subj[0]) in words:
            tree_data = Tree(rrp.simple_parse(title))
            svo_data = extractor.getSVO(tree_data[0])
            straight = scorer.get_similarities(svo_inp, svo_data)
            # object and subject swapped to provide more possible comparisons
            svo_data['subject'], svo_data['object'] = (
                svo_data['object'], svo_data['subject'])
            swapped = scorer.get_similarities(svo_inp, svo_data)
            # The first matching subject decides; the rest are not tried.
            return straight if straight > swapped else swapped
    return None


def main(transcript):
    """Classify each sentence of ``transcript`` as satirical or reliable.

    Every sentence is compared with the titles in the ``satirical_news`` and
    ``reliable_news`` tables. The best similarity found is reported per
    sentence index: positive when the best match is a reliable title,
    negative when it is a satirical one, and "0" when nothing matched. A
    reliable title only wins when it scores strictly higher than the best
    satirical one. The result is printed as a JSON object.

    Args:
        transcript: text to classify; split into sentences with
            ``sent_tokenize``.
    """
    results = {}
    sentences = sent_tokenize(transcript)

    # Declaration of constants and functions
    CONS_SATIRIC = 0
    CONS_RELIABLE = 1
    rrp = RerankingParser.fetch_and_load('WSJ-PTB3', verbose=False)
    extractor = TripletExtraction()
    scorer = SemanticSimilarityAnalysis()

    # 2 database tables for comparison of input
    c.execute('SELECT title FROM reliable_news')
    reliable_news = [tup[0] for tup in c.fetchall()]
    c.execute('SELECT title FROM satirical_news')
    satirical_news = [tup[0] for tup in c.fetchall()]

    # Reliable titles are compared with the marker strings removed. This does
    # not depend on the input sentence, so do it once up front.
    cleaned_reliable = []
    for title in reliable_news:
        for sht in satiric_shits:
            title = title.replace(sht, "")
        cleaned_reliable.append(title)

    labelled_titles = (
        (CONS_SATIRIC, satirical_news),
        (CONS_RELIABLE, cleaned_reliable),
    )

    for i, inp in enumerate(sentences):
        max_similarity = 0
        classification = -1

        # generates the tree and gets the SVO of the input sentence
        tree_inp = Tree(rrp.simple_parse(inp))
        svo_inp = extractor.getSVO(tree_inp[0])

        # comparison for satirical and reliable news (in that order)
        for label, titles in labelled_titles:
            for title in titles:
                score = _subject_match_score(
                    svo_inp, title, rrp, extractor, scorer)
                if score is not None and score > max_similarity:
                    classification = label
                    max_similarity = score

        if classification == CONS_RELIABLE:
            results[str(i)] = str(round(max_similarity, 4))
        elif classification == CONS_SATIRIC:
            results[str(i)] = str(round(-max_similarity, 4))
        else:
            results[str(i)] = "0"

    print(json.dumps(results))
def __init__(self, model="WSJ-PTB3"):
    """Load the named BLLIP model and hand the parser to the base class.

    Args:
        model: model name given to ``RerankingParser.fetch_and_load``.
    """
    reranker = RerankingParser.fetch_and_load(model, verbose=True)
    super().__init__(reranker)
    return []


# ---- module-level setup -----------------------------------------------------
# (The ``return []`` above is the tail of a function whose start is not
# visible from here.)

# Silence all warnings for the rest of the run.
warnings.filterwarnings('ignore')

# Training data: a saved NumPy array plus a folder of raw .txt documents.
rel_summary_all_doc = np.load("/home/yld8809/all_rel/tp_all_train.npy")
raw_doc_folder = "/home/yld8809/all_rel/txt_all_train/"

# Test data, same layout as the training data.
rel_summary_all_doc_test = np.load("/home/yld8809/all_rel/tp_all_test.npy")
raw_doc_folder_test = "/home/yld8809/all_rel/txt_all_test/"

# Word vectors stored in word2vec text format (binary=False).
model = gensim.models.KeyedVectors.load_word2vec_format(
    "/home/yld8809/semrel/mimic3_pp300.txt", binary=False)
# presumably the dimensionality of the vectors loaded above -- TODO confirm
model_size = 300

rrp = RerankingParser.fetch_and_load('GENIA+PubMed', verbose=True)

# Sorted lists of the raw document paths, and how many there are.
raw_ind = glob.glob(raw_doc_folder + '/*.txt')
raw_ind.sort()
raw_ind_test = glob.glob(raw_doc_folder_test + '/*.txt')
raw_ind_test.sort()
num_doc = len(raw_ind)
num_doc_test = len(raw_ind_test)

# Empty accumulators. Rows of word_embedding_all have model_size + 7 columns;
# the meaning of the 7 extra columns is not visible here.
word_embedding_all = np.empty(shape=[0, model_size + 7])
dep_mat_all = np.empty(shape=[0, 0])
de_parse_last = []
last_sent = []
def set_assertions_for_yesno_questions(data):
    """Attach an ``assertion_pos`` to every yes/no question in ``data``.

    Each question of type ``'yesno'`` has ``q2s`` applied to its ``question``
    text (together with a GENIA+PubMed reranking parser); the result is
    stored on the question object. Progress is shown with ``tqdm``.
    """
    parser = RerankingParser.fetch_and_load('GENIA+PubMed', verbose=True)
    yesno_questions = data.get_questions_of_type('yesno')
    for item in tqdm(yesno_questions):
        item.assertion_pos = q2s(item.question, parser)
class Preprocessor(object):
    """Build and cache several preprocessed forms of one raw narrative file.

    IMPORTANT: ``textList`` stores multiple different forms of the text (keyed
    by the name of the method that produced them) to minimize the amount of
    computation. It is a *class* attribute: the cache is shared by all
    instances and is reset whenever an instance is created for a different
    file name.

    The intermediate XML file is re-read (``parseXML``) and re-written
    (``writeToXML``) by the methods that add information to it.

    NOTE: this class targets Python 2 (the raw file is read as a byte string
    and decoded with ``str.decode``).
    """

    textList = {}
    _firstInitialization = True
    filename = ''
    #rrp = BllipParser.from_unified_model_dir('/home/vsocrates/.local/share/bllipparser/GENIA+PubMed')
    # Loaded once, when the class body is executed; shared by all instances.
    rrp = RerankingParser.fetch_and_load('GENIA+PubMed')

    def __init__(self, rawTextFileName, intermediateXMLFileName):
        """Initializes the Preprocessor.

        When the shared cache already belongs to ``rawTextFileName`` the cache
        is reused; otherwise it is cleared and the file is parsed into the
        intermediate XML.

        Args:
            rawTextFileName (str): The name of the raw string narrative file
            intermediateXMLFileName (str): The name of the BLANK file to
                contain the intermediate output XML
        """
        cachedName = Preprocessor.textList.get('filename')
        if 'filename' in Preprocessor.textList and cachedName == rawTextFileName:
            self.filename = cachedName
            self.xmlname = intermediateXMLFileName
            return

        Preprocessor.textList = {}
        if rawTextFileName is None:
            print("Need a text file!")
            return

        self.filename = rawTextFileName
        self.xmlname = intermediateXMLFileName
        Preprocessor.textList['filename'] = self.filename
        self.parseText()

    def getList(self):
        """Return the shared cache dictionary."""
        return Preprocessor.textList

    def parseText(self):
        """Creates the XML object and parses the raw narrative into the
        ElementTree python object.

        This method parses paragraphs, sentences, and tokenizes the text. Any
        additional features that need to be added into the XML file must have
        their own methods.

        Args:
            None

        Returns:
            None
            It does write the parsed text to the file specified in the
            initializer
        """
        raw = self.rawText()
        # Paragraphs get '-' replaced by ' ' before tokenization (below). Do
        # the same, length-preserving, replacement on the text that offsets
        # are searched in, so sentences containing dashes are still found and
        # the offsets remain valid for the original text.
        rawOffsetIntermed = re.sub('-', ' ', raw)
        offsetIter = 0
        offsetIterSent = 0

        self.tree = ET.ElementTree(ET.Element('StartOutput'))
        self.root = self.tree.getroot()
        paraParent = ET.SubElement(self.root, 'Paragraphs')
        globalIDIndex = 0

        # Now we are breaking up by paragraph. Empty pieces are all '', so
        # they are filtered by direct comparison. (``==``/``!=``, not ``is``:
        # identity of equal strings is an implementation detail.)
        paraSplit = re.compile('\n').split(raw)
        paragraphs = [piece for piece in paraSplit if piece != '']
        paraParent.set('Count', str(len(paragraphs)))

        for paraIndex, paragraph in enumerate(paragraphs):
            tempParaElement = ET.Element(
                'Paragraph', attrib={'id': str(paraIndex)})
            # We aren't currently including the paragraph text in the
            # <Paragraph /> tag
            paraParent.append(tempParaElement)

            # Now we have to sentence tokenize the text.
            # Replace "-" with " " in the sentences, especially useful for
            # extracting age.
            paragraph = re.sub('-', ' ', paragraph)
            sentList = sent_tokenize(paragraph)
            sentParent = ET.Element('Sentences')
            sentParent.set('Count', str(len(sentList)))
            tempParaElement.append(sentParent)

            for sentIndex, sent in enumerate(sentList):
                offsetIndexSent = rawOffsetIntermed.find(sent, offsetIterSent)
                tempSentElement = ET.Element(
                    'Sentence',
                    attrib={
                        'id': str(sentIndex),
                        'offset': str(offsetIndexSent) + ':' +
                                  str(offsetIndexSent + len(sent))
                    })
                sentTextElem = ET.Element('Text')
                sentTextElem.text = sent
                tempSentElement.append(sentTextElem)
                sentParent.append(tempSentElement)
                # Only advance the cursor on a hit. Storing -1 would make
                # every later search start at the last character and fail.
                if offsetIndexSent != -1:
                    offsetIterSent = offsetIndexSent

                # Now we have to break it down by token
                tokensList = word_tokenize(sent)
                tokenParent = ET.Element('Tokens')
                tokenParent.set('Count', str(len(tokensList)))
                tempSentElement.append(tokenParent)

                for tokenIndex, word in enumerate(tokensList):
                    offsetIndex = rawOffsetIntermed.find(word, offsetIter)
                    tempWordElement = ET.Element(
                        'Token',
                        attrib={
                            'id': str(tokenIndex),
                            'globalID': str(globalIDIndex),
                            'offset': str(offsetIndex) + ':' +
                                      str(offsetIndex + len(word))
                        })
                    textElem = ET.Element('Text')
                    textElem.text = word
                    tempWordElement.append(textElem)
                    tokenParent.append(tempWordElement)
                    if offsetIndex != -1:
                        offsetIter = offsetIndex
                    globalIDIndex += 1

        self.writeToXML()

    def rawText(self):
        """Returns the raw string (usually only used for RegEx extractors that
        don't want any preprocessing/XML)

        The file is read once; the result is cached under ``'rawText'``.

        Args:
            None

        Returns
            The raw string from the text file (str)
        """
        if Preprocessor.textList.get('rawText') is None:
            with open(self.filename) as rawFile:
                raw = rawFile.read()
            # Python 2: the file content is a byte string at this point.
            rawUnicode = raw.decode('utf-8')
            Preprocessor.textList['rawText'] = self.unicodeToASCII(rawUnicode)
        return Preprocessor.textList.get('rawText')

    def timexTagText(self, altText=None):
        """Tags all the temporal expressions and surrounds them with <TIMEX2>
        XML tags in line with the text

        Args:
            altText (str) The text to be tagged, if it is not the same as the
                whole narrative the preprocessor was created with. This text
                won't be stored.

        Returns:
            tagged text (str)
        """
        # When altText is specified, the method assumes that some random text
        # is being sent to be tagged, so doesn't save in dictionary
        if altText is not None:
            return timex.tag(altText)

        # Otherwise, we first check if it exists in the textList dict, if not,
        # it is created and returned
        raw = self.rawText()
        if Preprocessor.textList.get('timexTagText') is None:
            Preprocessor.textList['timexTagText'] = timex.tag(raw)
        return Preprocessor.textList.get('timexTagText')

    def wordTokenizeText(self, altText=None):
        """Tokenizes all the words currently using the nltk TreebankTokenizer
        for words, and the Punkt sentence tokenizer.

        Args:
            altText (str) The text to be tagged, if it is not the same as the
                whole narrative the preprocessor was created with. This text
                won't be stored.

        Returns:
            tokenized text (nested list, by sentence):
            ex. [['This', 'is', 'a', 'sentence', '.'],['And', 'maybe', 'another']]
        """
        if altText is not None:
            return [word_tokenize(t) for t in sent_tokenize(altText)]

        raw = self.rawText()
        if Preprocessor.textList.get('wordTokenizeText') is None:
            Preprocessor.textList['wordTokenizeText'] = [
                word_tokenize(t) for t in sent_tokenize(raw)
            ]
        else:
            print("Didn't create one!!")
        return Preprocessor.textList.get('wordTokenizeText')

    def timexTagAndTokenizeText(self, altText=None):
        """Tags temporal expressions with nltk timex2, and tokenizes the
        resultant text.

        Args:
            altText (str) The text to be tagged, if it is not the same as the
                whole narrative the preprocessor was created with. This text
                won't be stored.

        Returns:
            tokenized text as ONE flat list of tokens, with the pieces of each
            TIMEX2 tag merged back into a single token.

        Currently, the output is a flattened list; we need to decide if we
        want to keep the sentence structure (making the output a list of
        lists). This throws off the AEExtractor and the SuspectExtractor,
        which need to then be fixed.
        """
        # consolidate all broken apart Timex2 tags into single "words"
        timexMerger = MWETokenizer(
            mwes=[('<', '/TIMEX2', '>'), ('<', 'TIMEX2', '>')], separator='')

        # Two steps are required, so if altText is specified, all steps are
        # done here, so incorrect dict entries aren't stored.
        if altText is not None:
            sentences = self.wordTokenizeText(self.timexTagText(altText))
            # The merger works on one token list at a time, so apply it per
            # sentence (handing it the nested list fails) and flatten like
            # the cached branch does.
            return [
                token for sentence in sentences
                for token in timexMerger.tokenize(sentence)
            ]

        if Preprocessor.textList.get('timexTagAndTokenizeText') is None:
            # Tag all temporal expressions with timex2 tags (the file is
            # opened inside timexTagText()), then word-tokenize the result.
            tagged = self.timexTagText()
            word_tagged = self.wordTokenizeText(tagged)
            nestedListOutput = [timexMerger.tokenize(x) for x in word_tagged]
            # We need to remove and change this line if we don't want
            # flattened (one dimensional list).
            Preprocessor.textList['timexTagAndTokenizeText'] = [
                item for sublist in nestedListOutput for item in sublist
            ]
        return Preprocessor.textList.get('timexTagAndTokenizeText')

    def posTaggedText(self, altText=None):
        """Tags the text with parts-of-speech (POS) using the Charniak-Johnson
        parser after nltk tokenizes the words using the Penn Treebank
        tokenizer.

        Without ``altText`` the tags are also written into the intermediate
        XML as a ``POSTag`` attribute on each token, and the result is cached.

        Args:
            altText (str) The text to be tagged, if it is not the same as the
                whole narrative the preprocessor was created with. This text
                won't be stored.

        Returns:
            the POS-tagged text (nested list)
            ex. [[('A', 'DT'), ('female', 'JJ'), ('patient', 'NN'), ('died', 'VBD'), ('while', 'IN'), ('receiving', 'VBG'), ('Taxol', 'NN'), ('therapy', 'NN'), ('.', '.')], [('She', 'PRP'), ('did', 'VBD'), ("n't", 'RB'), ('surive', 'VB'), ('.', '.')]]
        """
        self.parseXML()
        if altText is not None:
            return [
                Preprocessor.rrp.tag(sent)
                for sent in self.wordTokenizeText(altText)
            ]

        if Preprocessor.textList.get('posTaggedText') is None:
            posTaggedSents = []
            paragraphs = self.root.find('Paragraphs')
            for paragraph in paragraphs.findall('Paragraph'):
                sentences = paragraph.find('Sentences')
                for sentence in sentences.findall('Sentence'):
                    tokens = sentence.find('Tokens')
                    # wordTokenizeText outputs a nested list, even with only
                    # one sentence, so take the first element (if any).
                    tokenized = self.wordTokenizeText(
                        sentence.find('Text').text)
                    words = tokenized[0] if tokenized else []
                    # We have to check if words is empty or not, otherwise
                    # segfault
                    if words:
                        posTagList = Preprocessor.rrp.tag(words)
                        posTaggedSents.append(posTagList)
                        # zip stops at the shorter side, so a token-count
                        # mismatch cannot raise IndexError.
                        for token, tagged in zip(tokens.findall('Token'),
                                                 posTagList):
                            token.attrib['POSTag'] = tagged[1]
            Preprocessor.textList['posTaggedText'] = posTaggedSents
            self.writeToXML()
        # Return the cached value on the first call too (it used to fall
        # through and return None right after computing it).
        return Preprocessor.textList.get('posTaggedText')

    def getParseTree(self, altText=None):
        """Creates a parse tree using the Charniak-Johnson parser and stores
        it in the intermediate XML as a ``ParseTree`` element per sentence.

        In order to use the BLLIP parser (Charniak-Johnson parser) we must
        tokenize by sentence first. When using the alternate text option you
        have to only pass it individual sentences.

        Args:
            altText (str) The single sentence to be parsed, if it is not the
                whole narrative the preprocessor was created with. This text
                won't be stored.

        Returns:
            With ``altText``: the parse tree created (str).
            Otherwise: a list with one entry per sentence -- the parse tree
            string, or ``[]`` for a sentence that was not parsed.
        """
        self.parseXML()
        if altText is not None:
            tokenized = self.wordTokenizeText(altText)
            # NOTE(review): the nested list used to be handed to simple_parse
            # as-is; the parser is given one sentence's tokens everywhere
            # else, so the first sentence is used here as well.
            return Preprocessor.rrp.simple_parse(tokenized[0])

        # Since we are doing an I/O anyway to input the new XML tags, we don't
        # have to retokenize, and can use the information from the base XML
        # document
        if Preprocessor.textList.get('getParseTree') is None:
            parsedTreeList = []
            paragraphs = self.root.find('Paragraphs')
            for paragraph in paragraphs.findall('Paragraph'):
                sentences = paragraph.find('Sentences')
                for sentence in sentences.findall('Sentence'):
                    tempParseTreeElement = ET.Element('ParseTree')
                    text = sentence.find('Text').text
                    # Only going to create a parse tree if there is some
                    # alphanumeric character, otherwise parser crashes
                    if re.search(r'\w+\.?', text):
                        tempParseTreeElement.text = Preprocessor.rrp.simple_parse(
                            self.wordTokenizeText(text)[0])
                        parsedTreeList.append(tempParseTreeElement.text)
                    else:
                        parsedTreeList.append([])
                    # Currently, if the sentence doesn't have any alphanumeric
                    # characters, nothing will be entered in the text, but a
                    # ParseTree object will still be created and added.
                    sentence.append(tempParseTreeElement)
            Preprocessor.textList['getParseTree'] = parsedTreeList
            self.writeToXML()
        # Return the cached value on the first call too (it used to return
        # None right after computing it).
        return Preprocessor.textList.get('getParseTree')

    def getMetaMapConcepts(self, altText=None):
        """Returns the MetaMap concepts found using the 'pymetamap' python
        wrapper, and tags the matching tokens in the intermediate XML with a
        ``METAMAP`` element.

        Args:
            altText (str) Currently unused by this method.

        Returns:
            the MetaMap concepts, as described in the pymetamap documentation
            (list)
        """
        if Preprocessor.textList.get("getMetaMapConcepts") is None:
            self.parseXML()
            mm = MetaMap.get_instance('/work/tkakar/public_mm/bin/metamap14')
            rawText = self.rawText()
            concepts, error = mm.extract_concepts([rawText])
            pattern = re.compile(
                r'(\[(?:(orch|phsu|sosy|dsyn),?(orch|phsu|sosy|dsyn)?)\])')
            globalIDByConcept = {}
            returnedList = []

            for concept in concepts:
                if hasattr(concept, 'aa'):
                    continue
                #TODO, see if there is any information that we are missing due
                # to some combination not described by the Regex
                if not pattern.search(concept.semtypes):
                    continue

                returnedList.append(concept)
                posInfo = concept.pos_info
                triggerInfo = concept.trigger.split('-')
                # need to replace the quotes in the conceptName
                conceptName = triggerInfo[3].replace('"', '')

                # The old test ``if ';' or '^' in posInfo`` was always true,
                # so the delimiter-aware parse has always been the one used;
                # call it directly.
                posInfoList = self.offsetParse(posInfo, ';')
                # We need to change the format of the posInfos from
                # (offset,span) to (offsetStartIndex, offsetEndIndex) here:
                posInfoList = [(offset, span + offset)
                               for (offset, span) in posInfoList]

                for listIndex, (startIndex, endIndex) in enumerate(posInfoList):
                    lfNum = rawText.count('\n', 0, startIndex)
                    lastIdx = rawText.rfind(
                        conceptName, 0, startIndex + len(conceptName))
                    # this is the number of line feeds between the last
                    # instance of the concept name and where metamap thinks
                    # the word is.
                    lfNumSpecific = rawText.count('\n', lastIdx, startIndex)
                    #For some reason, we need to subract one at the end,
                    # TODO: Figure out why
                    shift = lfNumSpecific - (lfNum + 1) - 1
                    posInfoList[listIndex] = (startIndex + shift,
                                              endIndex + shift)

                # we have the fixed offsets for each mention of the semantic
                # type. we now need to find their location in the xml file.
                globalIDList = []
                for newStartIdx, newEndIdx in posInfoList:
                    globalIds = self.placeOffsetInXML(
                        conceptName, word_tokenize(conceptName), newStartIdx,
                        newEndIdx - newStartIdx)
                    globalIDList.append(globalIds)
                globalIDByConcept[concept] = globalIDList

            for key, value in globalIDByConcept.items():
                for gIDList in value:
                    for gID in gIDList:
                        conceptXMLTag = self.root.find(
                            ".//*[@globalID='" + str(gID) + "']")
                        tempMetaMapElem = ET.Element("METAMAP")
                        tempMetaMapElem.text = key.semtypes.replace("'", '')
                        conceptXMLTag.append(tempMetaMapElem)

            Preprocessor.textList['getMetaMapConcepts'] = returnedList
            self.writeToXML()
        return Preprocessor.textList.get('getMetaMapConcepts')

    def writeToXML(self):
        """Writes the tree to the output xml specified.

        Args:
            None

        Returns:
            None
        """
        self.tree.write(self.xmlname)  #, encoding='utf-8')

    def parseXML(self):
        """Parses the XML tree in the xml file specified. This method was
        created to minimize file I/Os.

        Replaces ``self.tree`` and ``self.root`` with the parsed content.

        Args:
            None

        Returns:
            None
        """
        self.tree = ET.parse(
            self.xmlname)  #, parser=XMLParser(encoding='utf-8'))
        self.root = self.tree.getroot()

    def getRoot(self):
        """Re-read the XML file and return its root element."""
        self.parseXML()
        return self.root

    def placeOffsetInXML(self, phrase, tokenizedText, offset, span):
        """Takes a word/phrase and finds the globalIDs of the tokens in the
        intermediate XML that this word/phrase corresponds to.

        Starting at the token whose start offset equals ``offset``, as many
        consecutive tokens are taken as ``tokenizedText`` has entries.

        Args:
            phrase (str) The string to be placed in XML (currently unused)
            tokenizedText (list) The tokenized text is used to ensure that the
                same tokenizer used on the rest of the document is kept
                consistent.
            offset (int) The offset, in relation to the original text file
            span (int) The length of the string (currently unused)

        Returns:
            List of globalIDs (for tokens) that match the phrase (list)
        """
        self.parseXML()
        tokenLength = len(tokenizedText)
        globalIDList = []
        foundOffsetFlag = False
        for token in self.root.findall(".//Token"):
            if len(globalIDList) >= tokenLength:
                break
            # In this case, we only ever get one offset at a time, so we don't
            # loop through them. Just take the first (and only) element.
            tokenStart = self.offsetParse(token.attrib['offset'])[0][0]
            if foundOffsetFlag or offset == tokenStart:
                foundOffsetFlag = True
                globalIDList.append(int(token.attrib['globalID']))
        return globalIDList

    def offsetParse(self, offsetStr, delimiter=None):
        """Finds the offset and returns starting and ending indices based on
        XML Format (0:34). Support multiple offsets, with delimiter specified.
        Returns in list format, even with only one element to keep consistency

        Args:
            offsetStr (str) ``start:end`` text, or several of them separated
                by ``delimiter``.
            delimiter (str) Separator between offsets, or None for a single
                offset.

        Returns:
            With a delimiter: a list of ``[start, end]`` lists; pieces without
            a colon are skipped. Without: a one-element list holding a
            ``(start, end)`` tuple.

        Raises:
            ValueError: if a number cannot be parsed, or (without delimiter)
                ``offsetStr`` contains no colon.
        """
        if delimiter is None:
            if ':' not in offsetStr:
                raise ValueError(
                    "offset %r is not in 'start:end' form" % (offsetStr,))
            start, _, end = offsetStr.partition(':')
            return [(int(start), int(end))]

        # For some reason, the case where offsetParse() is used in the MetaMap
        # preprocessing, sometimes the delimiter is replaced (randomly, it
        # seems) by a caret (^). The regex below is support for that.
        # re.escape makes the delimiter a literal on both Python 2 and 3.
        offsetIntList = []
        for piece in re.split(re.escape(delimiter) + r'|\^', offsetStr):
            if ':' in piece:
                start, _, end = piece.partition(':')
                offsetIntList.append([int(start), int(end)])
        return offsetIntList

    def unicodeToASCII(self, string):
        """We are going to work solely in ascii, as it's easier for certain
        methods (i.e. word tokenization).

        Only the characters listed below are replaced; everything else is
        returned unchanged.
        """
        replacements = (
            (u"\u2019", r"'"),
            (u"\u201C", r'"'),
            (u"\u201D", r'"'),
            (u"\u2013", r'-'),
            #degrees
            (u"\u00B0", r'^'),
        )
        for fancy, plain in replacements:
            string = string.replace(fancy, plain)
        return string
def __init__(self):
    """Load the WSJ-PTB3 reranking parser and a StanfordDependencies instance.

    The parser is stored on ``self.rrp``; the dependency converter (created
    with the ``subprocess`` backend) on ``self.sd``.
    """
    model_name = 'WSJ-PTB3'
    self.rrp = RerankingParser.fetch_and_load(model_name, verbose=True)

    converter = StanfordDependencies.get_instance(backend='subprocess')
    self.sd = converter