def getCorefChains(self, doc, dictSentCoref, a_tree_bank):
    # Assumes module-level imports: nltk, plus the local cUtil helper class.
    common = cUtil()
    sentsDict = common.getSentences(a_tree_bank)
    # dictSentCoref format: {docID: {sentNo: [(start_word_index, end_word_index, identifier, string, used_up), ...]}}
    for corefC in doc:
        docID = str(corefC.document_id)
        docID = docID.split('/')[3].split('@')[0]
        if not dictSentCoref.has_key(docID):
            dictSentCoref[docID] = {}
        corefChainID = int(corefC.identifier)
        for corefL in corefC:
            sentNo = corefL.sentence_index + 1
            corefStringWords = corefL.string.split(' ')
            start_word_index = corefL.start_word_index
            # Strip a leading determiner or title from the mention string
            if corefStringWords[0].lower() in ['the', 'mrs', 'ms', 'miss', 'mr', 'mrs.', 'ms.', 'mr.']:
                corefStringWords.pop(0)
                start_word_index += 1
            string = ' '.join(corefStringWords)
            # nschneid: modified, was
            '''
            deduct = 0
            for word in corefStringWords:
                if word.lower() not in ['the', 'mrs.', 'mr.']:
                    string += word + ' '
                else:
                    deduct += 1
            string = string.rstrip(' ')
            start_word_index = corefL.start_word_index + deduct
            '''
            end_word_index = corefL.end_word_index + 1
            #print "COREFL", corefL.start_word_index, corefL.end_word_index, corefL.start_token_index, corefL.end_token_index
            adjusted = common.adjustIndices(sentsDict[docID][sentNo][1], sentsDict[docID][sentNo][0],
                                            start_word_index, end_word_index)
            start_word_index = adjusted[0]
            end_word_index = adjusted[1]
            if corefL.type not in ['ATTRIB', 'HEAD', 'APPOS']:
                if not dictSentCoref[docID].has_key(sentNo):
                    # First time this sentence has been seen
                    dictSentCoref[docID][sentNo] = [(start_word_index, end_word_index, corefChainID, string, False)]
                else:
                    # Entries for this sentence are already in the dictionary
                    dictSentCoref[docID][sentNo].append((start_word_index, end_word_index, corefChainID, string, False))
    # Sort the list of tuples for each sentence in the dictionary so that they are ordered by start_word_index
    for docID in dictSentCoref:
        for sentNo in dictSentCoref[docID]:
            tempList = dictSentCoref[docID][sentNo]
            tempList.sort()
            fDist = {}
            listCorefMult = []
            listCorefChains = []
            listFinal = []
            # Swap any items so that, of two elements with the same start_word_index, the one with the larger span appears first
            if len(tempList) > 1:
                for i in range(0, len(tempList) - 1):
                    if tempList[i][0] == tempList[i+1][0] and tempList[i][1] < tempList[i+1][1]:
                        tempList[i+1], tempList[i] = tempList[i], tempList[i+1]
                    # As we are only interested in sentence-internal coreference, compile a list of corefChainIDs in the sentence
                    listCorefChains.append(int(tempList[i][2]))
                # Add the corefChainID for the last entry in the list
                listCorefChains.append(int(tempList[i+1][2]))
                # Count how many times each corefChainID appears in the sentence
                fDist = nltk.FreqDist(listCorefChains)
                for e in fDist:
                    if fDist[e] > 1:
                        listCorefMult.append(e)
                # Construct the list of sentence-internal coreference instances and write it back to the dictionary
                for element in tempList:
                    if int(element[2]) in listCorefMult:
                        listFinal.append(element)
                dictSentCoref[docID][sentNo] = listFinal
    return dictSentCoref
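# A minimal, self-contained sketch (with toy data, not real OntoNotes links) of the
# sentence-internal filtering step performed at the end of getCorefChains above. The
# tuple layout (start_word_index, end_word_index, chain_id, string, used_up) mirrors
# the entries written into dictSentCoref: nltk.FreqDist counts how often each chain ID
# occurs within the sentence, and only mentions whose chain appears at least twice
# (i.e. genuine sentence-internal coreference) survive. The helper name is hypothetical.
import nltk

def _filter_sentence_internal(mentions):
    mentions = sorted(mentions)
    # Where two mentions share a start index, put the wider span first
    for i in range(len(mentions) - 1):
        if mentions[i][0] == mentions[i+1][0] and mentions[i][1] < mentions[i+1][1]:
            mentions[i], mentions[i+1] = mentions[i+1], mentions[i]
    counts = nltk.FreqDist([m[2] for m in mentions])
    return [m for m in mentions if counts[m[2]] > 1]

# Example: chain 7 occurs twice in the sentence, chain 3 only once, so chain 3 is dropped.
#   _filter_sentence_internal([(0, 2, 7, 'the dog', False),
#                              (5, 6, 7, 'it', False),
#                              (8, 9, 3, 'home', False)])
#   -> [(0, 2, 7, 'the dog', False), (5, 6, 7, 'it', False)]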
def getCorefChains(self, corefDict, docID, sentsDict):
    #print "corefDict, docID, sentsDict", corefDict, docID, sentsDict
    common = cUtil()
    dictSentCoref = {}  # {docID: {sentNo: [(start_word_index, end_word_index, identifier, string, used_up), ...]}}
    lCorefChains = []
    lCorefChainTemp = []
    for sentenceNr, sentenceCoref in enumerate(corefDict):
        for lCorefChain in sentenceCoref:
            lCorefChainTemp = []
            for lCorefPair in lCorefChain:
                refExpr = lCorefPair[0]
                antecedent = lCorefPair[1]
                refExpr[1] = sentenceNr  # fix: these are always 0 because sentences are processed individually
                antecedent[1] = sentenceNr
                if antecedent not in lCorefChainTemp:
                    lCorefChainTemp.append(antecedent)
                if refExpr[0] not in antecedent[0]:
                    # Exclude referring expressions that are contained within, or overlap, the antecedent
                    lCorefChainTemp.append(refExpr)
            lCorefChains.append(lCorefChainTemp)
    identifier = 0  # The ID of the chain
    if not dictSentCoref.has_key(docID):
        dictSentCoref[docID] = {}
    for listChain in lCorefChains:
        for coref in listChain:
            sentNo = int(coref[1]) + 1
            start_word_index = coref[3]
            corefStringWords = coref[0].split(' ')
            # Strip a leading determiner from the mention string
            if corefStringWords[0].lower() in ['the']:
                corefStringWords.pop(0)
                start_word_index += 1
            string = ' '.join(corefStringWords)
            # nschneid: modified; was:
            '''
            string = coref[0]
            corefStringWords = string.split(' ')
            deduct = 0
            string = ''
            for word in corefStringWords:
                if word.lower() not in ['the']:
                    string += word + ' '
                else:
                    deduct += 1
            string = string.rstrip(' ')
            start_word_index = coref[3] + deduct
            '''
            end_word_index = coref[4]
            adjusted = common.adjustIndices(sentsDict[docID][sentNo][1], sentsDict[docID][sentNo][0],
                                            start_word_index, end_word_index)
            start_word_index = adjusted[0]
            end_word_index = adjusted[1]
            if not dictSentCoref[docID].has_key(sentNo):
                dictSentCoref[docID][sentNo] = [(start_word_index, end_word_index, identifier, string, False)]
            else:
                dictSentCoref[docID][sentNo].append((start_word_index, end_word_index, identifier, string, False))
        identifier += 1
    dictSentCoref = self.formatChains(dictSentCoref)
    return dictSentCoref
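# A hedged sketch of the chain-flattening logic in the second getCorefChains above,
# on toy data. The mention record layout is an assumption inferred from the indexing
# used there: [string, sentence_index, <unused here>, start_word_index, end_word_index].
# The substring test (refExpr[0] in antecedent[0]) is what drops referring expressions
# whose text is contained within their antecedent. The helper name is hypothetical.
def _flatten_chain(coref_pairs, sentence_nr):
    chain = []
    for ref_expr, antecedent in coref_pairs:
        ref_expr[1] = sentence_nr      # sentence index arrives as 0 in per-sentence input
        antecedent[1] = sentence_nr
        if antecedent not in chain:
            chain.append(antecedent)
        if ref_expr[0] not in antecedent[0]:  # keep only non-contained mentions
            chain.append(ref_expr)
    return chain

# Example (hypothetical records): 'it' is kept because it is not a substring of
# 'the company'; a referring expression 'company' would be dropped.
#   _flatten_chain([(['it', 0, None, 6, 6], ['the company', 0, None, 0, 1])], 3)
#   -> [['the company', 3, None, 0, 1], ['it', 3, None, 6, 6]]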
def getNamedEnts(self, docID, folderLocationNE, folderLocationDep, treebank):
    # Assumes module-level imports: itertools.chain and xml.dom.minidom.parseString.
    dictNameEnt = {}  # {docID: {sentNo: [(start_word_index, end_word_index, named_entity_string, type, sub_type, descriptor, consumed?), ...]}}
    # Pre-load the Stanford dependency parse files, as dependency information is used when constructing AMR fragments
    dictDepParse = self.buildDepDict(docID, folderLocationDep)
    #print 'dictDepParse:', dictDepParse
    common = cUtil()
    # Get "clean" and "tree" sentences - both are used in computing the word indices
    dictSents = common.getSentences(treebank)
    # Extract named entities from the BBN corpus files
    sentNo = 1  # was 0
    bRead = False
    prevStartIndices = []
    # Pick the named entity file according to the doc ID - there are 4 files per OntoNotes "folder"
    quadrant = ''
    docExt = int(docID[-2:])
    if docExt < 25:
        quadrant = 'a'
    elif 25 <= docExt < 50:
        quadrant = 'b'
    elif 50 <= docExt < 75:
        quadrant = 'c'
    else:
        quadrant = 'd'
    # Open and read in the named entity XML file
    fileName = 'wsj' + docID[4:6] + quadrant + '.qa'
    nameEntFile = open(folderLocationNE + '/' + fileName, 'r')
    for line in nameEntFile:
        if '<DOCNO>' in line:
            # Obtain the docID and reset the list of previous start indices
            prevStartIndices = []
            docName = line.split('> ')[1].split(' <')[0]
            docIDInFile = (docName[:3] + '_' + docName[3:]).lower()
            bRead = (docIDInFile == docID)
            continue
        if 'DOC>' in line or 'ROOT>' in line:
            continue
        if bRead:
            # This is the document we are looking for
            print 'looking at line', line
            if line.startswith("' "):
                line = line[1:]
            line = line.rstrip('\n').rstrip('\r').rstrip(' ').replace(' ', '')  # strip spaces; add_spaces re-inserts them below
            # Get the 'tree' and 'clean' string versions of the sentence (from which start and end indices can be extracted)
            treeString = dictSents[docID][sentNo][0]
            cleanString = dictSents[docID][sentNo][1]
            line = self.add_spaces(cleanString, line)
            # Wrappers so that each sentence can be read as a separate XML string
            line = '<SENTENCE>' + line + '</SENTENCE>'
            print line
            parsedLine = parseString(line)
            # Retrieve every entity tag (<tag TYPE="...">data</tag>) that the parser finds:
            for element in chain(parsedLine.getElementsByTagName('ENAMEX'),
                                 parsedLine.getElementsByTagName('NUMEX'),
                                 parsedLine.getElementsByTagName('TIMEX')):
                dependents = []
                xmlTag = element.toxml()
                xmlTagName = xmlTag.split()[0][1:]
                xmlTagContent = xmlTag.split('>')[1].split('<')[0]
                xmlTagAttr = xmlTag.split('="')[1].split('">')[0]
                listTagAttr = xmlTagAttr.split(':')
                entType = listTagAttr[0]
                if len(listTagAttr) > 1:
                    entSubType = listTagAttr[1]
                else:
                    entSubType = ''
                # Find the start and end word index from the 'clean' string version of the line
                indices = self.getWordIndices(xmlTagContent, cleanString.split(' '), prevStartIndices)
                # Get dependents of the words in xmlTagContent - for PERSON, ORGANIZATION, GPE, PRODUCT
                # and FAC types (which can take a descriptor)
                if '<ENAMEX' in xmlTag:
                    if entType in ['PERSON', 'ORGANIZATION', 'GPE', 'PRODUCT', 'FAC']:
                        taggedWordList = xmlTagContent.split(' ')
                        cleanWordList = cleanString.split(' ')
                        for taggedIndex in range(indices[0], indices[1]):
                            if dictDepParse[docID][sentNo].has_key(taggedIndex):
                                for depIndex in dictDepParse[docID][sentNo][taggedIndex]:
                                    dependent = cleanWordList[depIndex]
                                    if dependent not in taggedWordList:
                                        dependents.append(dependent)
                prevStartIndices.append(indices[0])
                # Adjust indices to use 'tree' string indices, not clean string indices
                indices = common.adjustIndices(cleanString, treeString, indices[0], indices[1])
                if not dictNameEnt.has_key(docID):
                    dictNameEnt[docID] = {}
                if not dictNameEnt[docID].has_key(sentNo):
                    dictNameEnt[docID][sentNo] = []
                dictNameEnt[docID][sentNo].append((indices[0], indices[1], xmlTagContent, entType,
                                                   entSubType, dependents, xmlTag))
            prevStartIndices = []
            sentNo += 1
    nameEntFile.close()
    # Merge PERSON, ORGANIZATION, GPE, PRODUCT and FAC entities with their entity description NEs (X_DESC)
    dictNameEnt = self.mergeEntDesc(dictNameEnt)
    print dictNameEnt
    return dictNameEnt
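# A small sketch of the tag-attribute parsing performed in getNamedEnts above, assuming
# BBN-style inline markup such as <ENAMEX TYPE="ORGANIZATION:CORPORATION">Chrysler</ENAMEX>.
# Using minidom's own accessors is a more robust alternative to splitting the raw toxml()
# string, and yields the same (entType, entSubType, content) triple. The helper name is
# hypothetical; it assumes the element contains only text.
from xml.dom.minidom import parseString

def _parse_entity(element):
    typeAttr = element.getAttribute('TYPE')    # e.g. 'ORGANIZATION:CORPORATION'
    parts = typeAttr.split(':')
    entType = parts[0]
    entSubType = parts[1] if len(parts) > 1 else ''
    content = element.firstChild.data          # the text inside the tag
    return entType, entSubType, content

# Example:
#   doc = parseString('<SENTENCE><ENAMEX TYPE="ORGANIZATION:CORPORATION">Chrysler</ENAMEX> rose.</SENTENCE>')
#   _parse_entity(doc.getElementsByTagName('ENAMEX')[0])
#   -> ('ORGANIZATION', 'CORPORATION', u'Chrysler')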