def deleteRecord(entName):
    # dbObj = mdb.mongodbDatabase('doc_collection')
    # col = dbObj.docCollection
    # col.delete_many({"primaryEnt":entName})
    # dbObj_tmp = mdb.mongodbDatabase('tmp_collection')
    # col_tmp = dbObj_tmp.docCollection
    # col_tmp.delete_many({"primaryEnt":entName})
    dbObj_final = mdb.mongodbDatabase('final_triples')
    col_final_triples = dbObj_final.docCollection
    col_final_triples.delete_many({'primaryEnt': entName})
    dbObj_cluster = mdb.mongodbDatabase('cluster_info')
    col_cluster = dbObj_cluster.docCollection
    col_cluster.delete_many({'primaryEnt': entName})
    dbObj_all_ext = mdb.mongodbDatabase('all_ext_collection_new')
    col_all_ext = dbObj_all_ext.docCollection
    col_all_ext.delete_many({'primaryEnt': entName})
    # triples_collection is cleared one document at a time
    dbObj_triples = mdb.mongodbDatabase('triples_collection')
    col_triples = dbObj_triples.docCollection
    triples = col_triples.find({'primaryEnt': entName})
    ids = set()
    for triple in triples:
        ids.add(triple.get('_id'))
    for del_id in ids:
        col_triples.delete_one({'_id': del_id})
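
# A minimal sketch of the mdb.mongodbDatabase wrapper that every function in
# this file relies on. The real class lives in the (not shown) mdb module; the
# default database name and host below are assumptions, and the class is kept
# under a different name so it cannot shadow the real import.
from pymongo import MongoClient

class mongodbDatabaseSketch(object):
    def __init__(self, collectionName, dbName='extractionDB'):  # dbName is a guess
        self.client = MongoClient()  # assumes a mongod on localhost:27017
        self.docCollection = self.client[dbName][collectionName]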
def posNewRelations():
    cluster_obj = mdb.mongodbDatabase('cluster_info')
    cluster_col = cluster_obj.docCollection
    fe_db = mdb.mongodbDatabase('final_triples')
    final_col = fe_db.docCollection
    flag = 0
    entNumber = 0
    keyList = nearEntityMapInCanopy.keys()
    outputEntityList = []
    outputLine = set()
    for i in keyList:
        ndlist = nearEntityMapInCanopy.get(i)
        for ndset in ndlist:
            # print "ndlist size", len(ndlist)
            if len(ndset) == 1:
                entNumber = ndset.pop()
                ndset.add(entNumber)
                curOutputList, outputLine = someRandomFunction(entNumber, outputLine, i)
                if curOutputList != None and len(curOutputList) != 0:
                    outputEntityList.append(curOutputList)
                    clusterone = cluster_col.find_one({'primaryEnt': entSearch, 'url': urlIdList[entNumber], 'key': i})
                    tmpdoc = {'primaryEnt': entSearch, 'url': urlIdList[entNumber], 'similar_facts': [curOutputList], 'key': i}
                    if clusterone == None:
                        cluster_col.insert_one(tmpdoc)
                    else:
                        cluster_col.replace_one({'primaryEnt': entSearch, 'url': urlIdList[entNumber], 'key': i}, tmpdoc, upsert=True)
                # outputEntityList.append([newEnt1,mid1,relList[entNumber],newEnt2,mid2,isPrimaryEnt])
            else:
                clusterList = clusterRelation(ndset)
                for subSets in clusterList:
                    if len(subSets) >= 1:
                        entNumber = SelectEntity(subSets)
                        curOutputList, outputLine = someRandomFunction(entNumber, outputLine, i)
                        if curOutputList != None and len(curOutputList) != 0:
                            outputEntityList.append(curOutputList)
                            allOutputList = []
                            # print "len of set", len(subSets)
                            for eno in subSets:
                                allOutputList.append([ent1List[eno], relList[eno], ent2List[eno], probList[eno], urlIdList[eno]])
                            clusterone = cluster_col.find_one({'primaryEnt': entSearch, 'url': urlIdList[entNumber], 'key': i})
                            if clusterone == None:
                                cluster_col.insert_one({'primaryEnt': entSearch, 'url': urlIdList[entNumber], 'similar_facts': allOutputList, 'key': i})
                            else:
                                cluster_col.replace_one({'primaryEnt': entSearch, 'url': urlIdList[entNumber], 'key': i},
                                                        {'primaryEnt': entSearch, 'url': urlIdList[entNumber], 'similar_facts': allOutputList, 'key': i},
                                                        upsert=True)
    # fw = open('extractions/' + entSearch + '/data/output/' + entSearch + 'outputEnt.csv', 'w')
    # fileWriter = csv.writer(fw)
    # fileWriter.writerows(outputEntityList)
    # fw.close()
    oldVal = final_col.find_one({'primaryEnt': entSearch})
    if oldVal == None:
        final_col.insert_one({'primaryEnt': entSearch, 'final-triples': outputEntityList})
    else:
        d = {'primaryEnt': entSearch, 'final-triples': outputEntityList}
        final_col.replace_one({'primaryEnt': entSearch}, d, upsert=True)
    cluster_obj.client.close()
    fe_db.client.close()
def getRelation(relPhrase, type1, type2):
    dbObj = mdb.mongodbDatabase('nell_collection')
    col = dbObj.docCollection
    typeDbObj = mdb.mongodbDatabase('ontology_collection')
    typeCol = typeDbObj.docCollection
    words = relPhrase.split(' ')
    minRelCount = 1000000  # sentinel, larger than any relation list we expect
    reqWord = None
    finalNellRelations = set()
    # pick the word of the phrase with the fewest candidate relations,
    # e.g. "want" rather than "to" in the relation "want to"
    for w in words:
        val = col.find_one({'word': w})
        if val != None:
            relList = val.get('list')
            if len(relList) < minRelCount:
                reqWord = w
                minRelCount = len(relList)
    nellRelDict = {}
    if reqWord != None:
        # print "req word for ", relPhrase, " is ", reqWord
        # get the list of phrases containing reqWord
        val = col.find_one({'word': reqWord})
        if val != None:
            relList = val.get('list')
            for rel in relList:
                isPresent = searchWord(relPhrase, rel[0])
                if isPresent:
                    finalNellRelations.add(rel[1])
                    # print "nell relation for ", relPhrase, " is ", rel[1]
                    d = typeCol.find_one({'rel': rel[1].lower()})  # get the type of the nell relation
                    if d != None:
                        nellType1 = d.get('domain')
                        nellType2 = d.get('range')
                        # if relPhrase == "moved to":
                        #     print nellType1, " ", nellType2, " ", rel[1], " ", relPhrase
                        if type1 == None:
                            type1 = []
                        if type2 == None:
                            type2 = []
                        if (nellType1 in type1) and (nellType2 in type2):
                            freq = nellRelDict.get(rel[1])
                            if freq == None:
                                freq = 1
                            else:
                                freq = freq + 1
                            nellRelDict.update({rel[1]: freq})
    dbObj.client.close()
    typeDbObj.client.close()
    return finalNellRelations, nellRelDict
def collectEntities(primaryEnt, url):
    print "inside getent"
    global dbObj
    dbObj = mdb.mongodbDatabase("triples_collection")
    allExt = mdb.mongodbDatabase("all_ext_collection")
    allExtCol = allExt.docCollection
    extObj = allExtCol.find_one({"primaryEnt": primaryEnt, "url": url})
    if extObj == None:
        print "No extractions", primaryEnt
        return None
    data = extObj.get("extList")
    ent1List = []
    ent2List = []
    relList = []
    probList = []
    for line in data:
        line = line.encode("utf-8", "ignore")
        if len(line) > 1:  # if the line has some string
            result = getRelationAndEntity(line)
            if result != None:
                ereList = result.split("_")
                if len(ereList[2].split(" ")) < 7 and len(ereList[3].split(" ")) < 8:
                    e2 = ereList[3].strip()
                    r = ereList[2].strip()
                    try:
                        words1 = word_tokenize(e2)
                        postag1 = nltk.pos_tag(words1)
                        if len(postag1) > 0:
                            w1 = postag1[0]
                            if w1[1] == "IN" or w1[1] == "PREP" or w1[1] == "TO":
                                # ent2 starts with a preposition: move it onto the relation
                                tmp = e2.split(" ")
                                e2 = " ".join(tmp[1:])
                                r = r + " " + str(tmp[0])
                                ent1List.append(ereList[1])  # store ent1, rel and ent2
                                ent2List.append(e2)
                                relList.append(r)
                                probList.append(ereList[0])
                                # print ereList[1], " --> ", r
                            else:
                                probList.append(ereList[0])
                                ent1List.append(ereList[1])  # store ent1, rel and ent2
                                ent2List.append(ereList[3])
                                relList.append(ereList[2])
                    except Exception, e:
                        print "error ", e
def collectEntities(primaryEnt):
    global dbObj
    dbObj = mdb.mongodbDatabase('triples_collection')
    allExt = mdb.mongodbDatabase('all_ext_collection')
    allExtCol = allExt.docCollection
    extObj = allExtCol.find_one({'primaryEnt': primaryEnt})
    if extObj == None:
        print "No extractions", primaryEnt
        return None
    # p = file('input/final/scientists.txt', 'a')
    # p.write(primaryEnt)
    # p.close()
    data = extObj.get('extList')
    # data = open(filename).readlines()
    ent1List = []
    ent2List = []
    relList = []
    probList = []
    for line in data:
        if len(line) > 1:  # if the line has some string
            result = getRelationAndEntity(line)
            if result != None:
                ereList = result.split("_")
                if len(ereList[2].split(' ')) < 7 and len(ereList[3].split(' ')) < 8:
                    e2 = ereList[3].strip()
                    r = ereList[2].strip()
                    try:
                        words1 = word_tokenize(e2)
                        postag1 = nltk.pos_tag(words1)
                        if len(postag1) > 0:
                            w1 = postag1[0]
                            if w1[1] == "IN" or w1[1] == "PREP" or w1[1] == "TO":
                                tmp = e2.split(' ')
                                e2 = ' '.join(tmp[1:])
                                r = r + " " + str(tmp[0])
                                ent1List.append(ereList[1])  # store ent1, rel and ent2
                                ent2List.append(e2)
                                relList.append(r)
                                probList.append(ereList[0])
                                # print ereList[1], " --> ", r
                            else:
                                probList.append(ereList[0])
                                ent1List.append(ereList[1])  # store ent1, rel and ent2
                                ent2List.append(ereList[3])
                                relList.append(ereList[2])
                    except Exception, e:
                        print "error ", e
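
# collectEntities assumes that getRelationAndEntity (defined elsewhere)
# flattens an OLLIE extraction such as '0.87: (Einstein; moved to; Princeton)'
# into the underscore-joined string 'prob_ent1_rel_ent2'. A hedged, regex-based
# stand-in that only illustrates that contract; it is not the project's parser:
import re

def getRelationAndEntitySketch(line):
    # returns None when the line is not a well-formed extraction
    m = re.match(r'([\d.]+):\s*\((.*?);\s*(.*?);\s*(.*?)\)', line)
    if m is None:
        return None
    return '_'.join(part.strip() for part in m.groups())

# getRelationAndEntitySketch('0.87: (Einstein; moved to; Princeton)')
#   -> '0.87_Einstein_moved to_Princeton'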
def entityClusterAndNormalise(ent):
    global entSearch
    global goalEntity
    global entity1ToFreebaseId
    global entity2ToFreebaseId
    global ent1List
    global ent2List
    global relList
    global newRelation
    global nearEntityMapInCanopy
    global wordCountDict
    global wordToCanopyNo
    global canopySetDict
    global entityToCanopyMap
    global clusterSetInCanopy
    global newEntityList
    global dbObj
    dbObj = mdb.mongodbDatabase('triples_collection')
    ent1List = []
    ent2List = []
    relList = []
    newRelation = {}
    nearEntityMapInCanopy = {}
    entity1ToFreebaseId = {}
    entity2ToFreebaseId = {}
    wordCountDict = {}
    wordToCanopyNo = {}
    canopySetDict = {}
    entityToCanopyMap = {}
    clusterSetInCanopy = {}
    newEntityList = []
    goalEntity = []
    entSearch = ent
    # keep the content words (nouns, adjectives, numbers) of the entity name
    words = word_tokenize(entSearch)
    postag = pos_tag(words)
    for w1 in postag:
        if (w1[1].startswith("NN") or w1[1] == "JJ" or w1[1] == "CD") and len(w1[0]) > 1:
            goalEntity.append(w1[0].lower())
    if len(goalEntity) == 0:
        print "no key word in goal entity", entSearch
        return
    InitialSetup()
    entity1ToFreebaseId = searchClueweb(ent1List, entity1ToFreebaseId)
    entity2ToFreebaseId = searchClueweb(ent2List, entity2ToFreebaseId)
    clusterInCanopy()
    MergeClusters()
    posNewRelations()
    printTheGraph()
    dbObj.client.close()
def getTypeHierarchy(enttype):
    # getType may return None for unknown entities; treat that as no known types
    if enttype == None:
        return []
    dbObj = mdb.mongodbDatabase('type_hierarchy_collection')
    col = dbObj.docCollection
    allTypeList = []
    for ent in enttype:
        allTypeList.append(ent)
        val = col.find_one({'ent': ent})
        if val != None:
            tl = val.get('typelist')
            for t in tl:
                if t not in allTypeList:
                    allTypeList.append(t)
    dbObj.client.close()
    return allTypeList
def inference_test(entSearch):
    global finalList
    print "inside linking stage"
    dbObj = mdb.mongodbDatabase('final_triples')
    col = dbObj.docCollection
    entList = []
    vals = col.find_one({'primaryEnt': entSearch})
    if vals == None:
        print "No extractions"
        count = 0
        dbObj.client.close()
        return False
    else:
        data = vals.get('final-triples')
        if len(data) > 0:
            getNellRelations(data, entSearch)
        nellMapObj = mdb.mongodbDatabase('nell_mapped_triples_collection')
        nellMapCol = nellMapObj.docCollection
        oldTriples = nellMapCol.find_one({'primaryEnt': entSearch})
        if oldTriples == None:
            nellMapCol.insert_one({'primaryEnt': entSearch, 'mapped-triples': finalList})
        else:
            nellMapCol.replace_one({'primaryEnt': entSearch},
                                   {'primaryEnt': entSearch, 'mapped-triples': finalList},
                                   upsert=True)
        # outputFileName = 'output/' + entSearch.replace(' ', '_') + '.csv'
        # fw = open(outputFileName, 'w')
        # fileWriter = csv.writer(fw)
        # fileWriter.writerows(finalList)
        # finalList = []
        # fw.close()
        finalList = []
        nellMapObj.client.close()
        dbObj.client.close()
        return True
def extractDataFromLink(queue, urls, filename, fileCount):
    dbObj = mdb.mongodbDatabase('doc_collection')
    docs = dbObj.docCollection
    down_doc = docs.find_one({'url': urls, 'primaryEnt': filename})
    if down_doc == None or down_doc['documents'] == None or len(down_doc['documents']) == 0:
        try:
            # print "down load docs for ", urls
            cleanText = ''
            if urls.endswith('.pdf'):
                print "############# found pdf #############"
                proxy_support = urllib2.ProxyHandler({"http": "proxy.iisc.ernet.in:3128"})
                opener = urllib2.build_opener(proxy_support)
                urllib2.install_opener(opener)
                # download the pdf to a scratch file (literally named 'filename')
                with open('filename', 'wb') as f:
                    f.write(urllib2.urlopen(urls).read())
                content = convert('filename')
                cleanText = content.encode('utf-8', 'ignore')
            else:
                extractor = Extractor(extractor='ArticleExtractor', url=urls)
                extracted_text = extractor.getText()
                cleanText = cleanTheExtraction(extracted_text)
            sentenceList = tokenizer.tokenize(cleanText)  # get sentences
            if len(sentenceList) > minLen:
                # keep the page only if it yields more than the minimum number of sentences
                curFile = filename + str(fileCount) + '.txt'
                senList = []
                for l in sentenceList:
                    newl = l.encode('utf-8', 'ignore')
                    senList.append(newl)
                document = {'url': urls, 'documents': senList, 'primaryEnt': filename}
                if down_doc == None:
                    post_id = docs.insert_one(document)  # .inserted_id
                else:
                    docs.replace_one({'url': urls, 'primaryEnt': filename}, document, upsert=True)
                sentenceString = ' '.join(sentenceList)
                getTripleList(sentenceString, urls, filename)  # corenlp -> sentence construction -> ollie
        except Exception, e:
            print "error in boilerpipe code: ", e, " url: ", urls
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            print(exc_type, fname, exc_tb.tb_lineno)
def inference_test(entSearch):
    global finalList
    dbObj = mdb.mongodbDatabase('final_triples')
    col = dbObj.docCollection
    entList = []
    vals = col.find_one({'primaryEnt': entSearch})
    if vals == None:
        print "No extractions"
        count = 0
    else:
        data = vals.get('final-triples')
        if len(data) > 0:
            getNellRelations(data, entSearch)
        outputFileName = entSearch.replace(' ', '_') + '.csv'
        fw = open(outputFileName, 'w')
        fileWriter = csv.writer(fw)
        fileWriter.writerows(finalList)
        finalList = []
        fw.close()
    dbObj.client.close()
def getTripleList(sentenceList, url, priEnt):
    # NB: despite the name, sentenceList is the full document text as one string
    global currentUrl
    global primaryEnt
    global dbObj
    dbObj = mdb.mongodbDatabase('tmp_collection')
    col = dbObj.docCollection
    currentUrl = url
    primaryEnt = priEnt
    if col.find_one({'url': url, 'primaryEnt': priEnt}) == None:
        print "calling coref resolution for ", url
        corefResolution(sentenceList)
    else:
        oldval = col.find_one({'url': url, 'primaryEnt': priEnt})
        openieobj = oldval.get('openie')
        if len(openieobj) == 0 or openieobj == '':
            print "calling openie for", primaryEnt, "url ", url
            tmpobj = col.find_one({'url': url, 'primaryEnt': priEnt})
            corefdata = tmpobj.get('corenlp')
            SentenceConstructionFromXML(corefdata)
    dbObj.client.close()
def getType(ent):
    dbObj = mdb.mongodbDatabase('ent_type_collection')
    col = dbObj.docCollection
    val = col.find_one({'ent': ent})
    if val != None:
        dbObj.client.close()
        return val.get('type')
    # back off word by word from the left until some suffix of the name is known,
    # e.g. 'the young albert einstein' -> 'young albert einstein' -> 'albert einstein'
    suffix = ent
    t = None
    while t == None and len(suffix) > 0:
        try:
            words = suffix.split(' ')
            suffix = ' '.join(words[1:])
            if len(suffix) == 0:
                break  # ran out of words
            t = col.find_one({'ent': suffix})
            if t != None:
                dbObj.client.close()
                return t.get('type')
        except Exception, e:
            dbObj.client.close()
            print "GetType Error", e
            return None
    dbObj.client.close()
    return None
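
# A self-contained illustration of the suffix walk getType performs, run
# against a plain dict instead of the Mongo collection (the data is made up):
def suffixLookupSketch(ent, table):
    suffix = ent
    while len(suffix) > 0:
        if suffix in table:
            return table[suffix]
        suffix = ' '.join(suffix.split(' ')[1:])  # drop the leftmost word
    return None

# suffixLookupSketch('theoretical physicist richard feynman',
#                    {'richard feynman': ['person', 'scientist']})
#   -> ['person', 'scientist']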
def extractDataFromLink(queue, urls, filename, fileCount):
    dbObj = mdb.mongodbDatabase('doc_collection')
    docs = dbObj.docCollection
    down_doc = docs.find_one({'url': urls, 'primaryEnt': filename})
    if down_doc == None or down_doc['documents'] == None or len(down_doc['documents']) == 0:
        try:
            extractor = Extractor(extractor='ArticleExtractor', url=urls)
            extracted_text = extractor.getText()
            cleanText = cleanTheExtraction(extracted_text)
            sentenceList = tokenizer.tokenize(cleanText)  # get sentences
            if len(sentenceList) > minLen:
                # keep the page only if it yields more than the minimum number of sentences
                curFile = filename + str(fileCount) + '.txt'
                # p = file('/tmp/extractions/' + curFile, 'w')
                for s in list(sentenceList):  # iterate over a copy; the list is mutated below
                    try:
                        # drop sentences whose first character falls outside
                        # the printable ASCII letter/digit range
                        if ord(s[0]) < 48 or ord(s[0]) > 122:
                            sentenceList.remove(s)
                        else:
                            print "@@@@@", s
                            # p.write(s)
                            # p.write(" ")
                    except:
                        sentenceList.remove(s)
                # p.close()
                document = {'url': urls, 'documents': sentenceList, 'primaryEnt': filename}
                if down_doc == None:
                    post_id = docs.insert_one(document)  # .inserted_id
                else:
                    docs.replace_one({'url': urls, 'primaryEnt': filename}, document, upsert=True)
                sentenceString = ' '.join(sentenceList)
                getTripleList(sentenceString, urls, filename)  # corenlp -> sentence construction -> ollie
        except Exception, e:
            # print "whats the error ", e
            # print urls
            pass
def printToFile(sentenceToExtractionMap, sentencewiseCorefResultDict, xmlFN):
    global allExt
    allExt = mdb.mongodbDatabase('all_ext_collection')
    allExtCol = allExt.docCollection
    extObj = allExtCol.find_one({'primaryEnt': primaryEnt})
    if extObj == None:
        finalList = []
    else:
        finalList = extObj.get('extList')
    keySet = sentenceToExtractionMap.keys()
    for elemnt in keySet:
        extlist = sentenceToExtractionMap.get(elemnt)
        for e in range(0, len(extlist), 1):
            if len(extlist[e]) > 0 and extlist[e] != '\n':
                extractionLine = extlist[e].strip('\n')
                finalList.append(extractionLine)
    if extObj == None:
        allExtCol.insert_one({'primaryEnt': primaryEnt, 'extList': finalList})
    else:
        d = {'primaryEnt': primaryEnt, 'extList': finalList}
        allExtCol.replace_one({'primaryEnt': primaryEnt}, d, upsert=True)
    allExt.client.close()
def mapEtractionsToNell(q, line, entSearch):
    dbObj = mdb.mongodbDatabase('ent_type_collection')
    col = dbObj.docCollection
    nellExt = mdb.mongodbDatabase('map_collection')
    mapcol = nellExt.docCollection
    # print "line", line
    if len(line) >= 6:
        outputEntityList = []
        ent1 = line[0].strip()
        ent2 = line[2].strip()
        rel = line[1].strip()
        url = line[4]
        clusterID = line[5]
        ent1type = getType(ent1.lower())
        ent2type = getType(ent2.lower())
        # print ent1, ent1type
        # print ent2, ent2type
        ent1type_hier = getTypeHierarchy(ent1type)
        ent2type_hier = getTypeHierarchy(ent2type)
        nellRelSet, freqDict = getRelation(rel, ent1type_hier, ent2type_hier)
        setDictList = [nellRelSet, freqDict]
        entType = 0
        relType = 0
        if ent1.lower() in entSearch or entSearch in ent1.lower():
            val = col.find_one({'ent': ent2.lower()})
            if val != None:
                entType = 1
            else:
                entType = 2
        else:
            val = col.find_one({'ent': ent1.lower()})
            if val != None:
                entType = 1
            else:
                entType = 2
        # print line
        # print ent1.lower(), "-->", ent1type_hier
        # print ent2.lower(), "-->", ent2type_hier
        if len(nellRelSet) == 0:
            relType = 2
        else:
            relType = 1
        newFact = 1
        isnew = mapcol.find({'ent1': ent1})
        if isnew != None:
            for facts in isnew:
                nelRel = facts.get('rel')
                nelEnt2 = facts.get('ent2')
                if nelRel == rel and nelEnt2 == ent2:
                    newFact = 0
        fact = ent1 + " " + rel
        outputEntityList.append(ent1)
        outputEntityList.append(rel)
        # for nr in nellRelSet:
        #     print rel, " --type-- ", nr
        #     outputEntityList.append(nr)
        # pick the NELL relation with the highest frequency
        mx = 0
        nellRel = ''
        predUrl = ''
        for nr in freqDict.keys():
            count = freqDict.get(nr)
            if count > mx:
                mx = count
                nellRel = nr
        if nellRel != '':
            outputEntityList.append(nellRel)
            predUrl = "http://rtw.ml.cmu.edu/rtw/kbbrowser/pred:" + nellRel
        else:
            outputEntityList.append('---')
        # outputEntityList.append(ent2)
        fact += " " + ent2
        if relType == 1 and entType == 1:
            extType = 'KR-KE'
        elif relType == 1 and entType == 2:
            extType = 'KR-NE'
        elif relType == 2 and entType == 1:
            extType = 'NR-KE'
        elif relType == 2 and entType == 2:
            extType = 'NR-NE'
        nellurl = ''
        typeForurl = ''
        if extType == 'NR-KE' or extType == 'KR-KE':
            for t in (ent2type or []):  # ent2type can be None when the entity is unknown
                if 'thing' not in t:
                    typeForurl = t
                    break
            if len(typeForurl) > 0:
                ent2 = ent2.encode('utf-8', 'ignore').lower()
                nellurl = ("http://rtw.ml.cmu.edu/rtw/kbbrowser/" +
                           typeForurl.encode('utf-8', 'ignore').lower() + ":" + '_'.join(ent2.split(' ')))
                # print "nellurl", nellurl
        outputEntityList.append(extType)
        outputEntityList.append(url)
        outputEntityList.append(clusterID)
        outputEntityList.append(nellurl)
        outputEntityList.append(predUrl)
        # queued value: ent1, rel, nellrelation, exttype, url of data ext,
        # clusterid, nellurl, predurl (ent2's append is commented out above)
        q.put({rel: outputEntityList})
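
# mapEtractionsToNell is written to run as a worker that reports through its
# Queue argument. A hedged sketch of one possible consumer side; the driver
# below is an assumption, not code from this project, and it presumes every
# line has the six fields the worker expects (so each worker puts exactly one
# item). Call it from under an if __name__ == '__main__' guard on platforms
# that spawn rather than fork.
from multiprocessing import Process, Queue

def collectMappingsSketch(lines, entSearch):
    q = Queue()
    workers = [Process(target=mapEtractionsToNell, args=(q, line, entSearch))
               for line in lines]
    for w in workers:
        w.start()
    results = []
    for _ in workers:
        results.append(q.get())  # one {rel: outputEntityList} dict per worker
    for w in workers:
        w.join()  # drain the queue before joining to avoid feeder deadlock
    return results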
def ReplaceCorefPointers(primaryEntity):
    global primaryEnt
    global extractionList
    global replaceList
    global dbObj
    replaceList = []
    print "deref for ", primaryEntity
    primaryEnt = primaryEntity
    primaryEntDict = {}
    setForReplacement = set()
    filewiseInfoDict = {}
    primaryEntSet = set()
    primaryEntSet.add(primaryEntity)
    ollieOutput = "openieOutputFolder"
    corefOutput = "corefOutputFolder"
    dbObj = mdb.mongodbDatabase('tmp_collection')
    tempCol = dbObj.docCollection
    colList = tempCol.find({'primaryEnt': primaryEntity})
    key = 0
    for tmp in colList:
        ollieDataList = tmp['openie']
        corenlpData = tmp['corenlp']
        # initialise dictionaries
        sentenceToExtractionMap = {}
        sentencewiseCorefResultDict = {}
        # initialise lists
        extractionList = []
        perSentenceData = []  # holds sentence + all extractions of a sentence from ollie output
        for ollie in ollieDataList:
            lines = ollie.split('\n')
            extractionList.append(lines)
        for extractionNo, elist in enumerate(extractionList):
            if len(elist) > 1 and elist[1] != "No extractions found.\n":
                # print elist
                sentenceToExtractionMap.update({extractionNo: elist})
        corefOutputList = xmlParseCorefResult(corenlpData)  # call xml parser
        if len(corefOutputList) != 0:
            corefPointerList = []
            listLen = len(corefOutputList)
            for i in range(listLen):
                l = corefOutputList[i]
                for j in range(len(l)):
                    nounprolist = []
                    c = l[j]
                    pro = c[0][0]
                    noun = c[1][0]
                    start = c[0][3]
                    end = c[0][4]
                    sentence = c[0][1]
                    # replace pro in sentence at start-end
                    if primaryEntity in noun:
                        nounprolist.append(primaryEntity)
                    else:
                        nounprolist.append(noun)
                    nounprolist.append(pro)
                    nounprolist.append(start)
                    nounprolist.append(end)
                    corefPointerList = sentencewiseCorefResultDict.get(sentence)
                    if corefPointerList == None:
                        corefPointerList = []
                    corefPointerList.append(nounprolist)
                    sentencewiseCorefResultDict.update({sentence: corefPointerList})
        if len(corefOutputList) == 0:
            print "No coreference found for "
        for sentNo in sentencewiseCorefResultDict.keys():
            corefPointerList = sentencewiseCorefResultDict.get(sentNo)
            for npl in corefPointerList:
                if len(npl) == 4:
                    if npl[1].lower().strip() in primaryEntSet:
                        # print npl[1]
                        l = primaryEntDict.get(npl[1])
                        if l == None:
                            l = set()
                            l.add(npl[0])
                            primaryEntDict.update({npl[1]: l})
                        else:
                            l.add(npl[0])
                            primaryEntDict.update({npl[1]: l})
        # print "ped " + str(primaryEntDict)
        for pi in primaryEntDict.keys():
            l = primaryEntDict.get(pi)
            if primaryEnt in l:
                for ent in l:
                    setForReplacement.add(ent)
                setForReplacement.add(pi)
        dictlist = []
        dictlist.append(sentencewiseCorefResultDict)
        dictlist.append(sentenceToExtractionMap)
        filewiseInfoDict.update({key: dictlist})
        key = key + 1
    nounAfterDict = {}
    for dicts in filewiseInfoDict.keys():
        dictlist = filewiseInfoDict.get(dicts)
        sentencewiseCorefResultDict = dictlist[0]
        sentenceToExtractionMap = dictlist[1]
        for sentNo in sentencewiseCorefResultDict.keys():
            corefPointerListFull = sentencewiseCorefResultDict.get(sentNo)
            multiPointerList, corefPointerList = multiplePronoun(corefPointerListFull)
            for nounprolist in corefPointerList:
                if len(nounprolist) != 0:
                    noun = nounprolist[0].strip()
                    pronoun = nounprolist[1].strip()
                    # print "noun ", noun, " pronoun ", pronoun
                    extList = sentenceToExtractionMap.get(sentNo)
                    if extList != None:
                        # print "extList len ", len(extList)
                        for i in range(1, len(extList), 1):
                            # all the extractions of a sentence; replace in all of them
                            line_i = extList[i]
                            score, ereList = getRelationAndEntity(line_i)
                            # print ereList
                            if ereList != None:
                                if len(word_tokenize(noun)) > 5:
                                    # print "noun before ", noun
                                    nounafter = nounAfterDict.get(noun)
                                    if nounafter == None:
                                        try:
                                            nounafter = getNounDependency(noun)  # get strings connected to root word
                                            nounafter = nounafter.strip()
                                            nounAfterDict.update({noun: nounafter})
                                            noun = nounafter
                                            isReplace, l_index, w_index = ReplacingRules(ereList, noun, pronoun)
                                        except Exception, e:
                                            nounafter = ''
                                            isReplace = False
                                            l_index = 0
                                            w_index = 0
                                            # nounAfterDict.update({noun: nounafter})
                                            noun = nounafter
                                            print "len loop error", e
                                    else:
                                        noun = nounafter
                                        isReplace, l_index, w_index = ReplacingRules(ereList, noun, pronoun)
                                    # print "noun after", noun
                                else:
                                    isReplace, l_index, w_index = ReplacingRules(ereList, noun, pronoun)
                                if isReplace == True and w_index >= 0:
                                    derefString = ereList[l_index]
                                    stringToken = word_tokenize(derefString)
                                    if noun in setForReplacement:
                                        stringToken[w_index] = primaryEnt
                                        replaceList.append([pronoun, primaryEnt])
                                    else:
                                        stringToken[w_index] = noun
                                        replaceList.append([pronoun, noun])
                                    ereList[l_index] = ' '.join(stringToken)
                                    newline_i = score + ': (' + ereList[0] + ';' + ereList[1] + ';' + ereList[2] + ')'
                                    extList[i] = newline_i
                                elif isReplace == True and w_index == -1:
                                    derefString = ereList[l_index]
                                    if noun in setForReplacement:
                                        replaceList.append([pronoun, primaryEnt])
                                        derefString = derefString.replace(pronoun, primaryEnt)
                                    else:
                                        replaceList.append([pronoun, noun])
                                        derefString = derefString.replace(pronoun, noun)
                                    ereList[l_index] = derefString
                                    newline_i = score + ': (' + ereList[0] + ';' + ereList[1] + ';' + ereList[2] + ')'
                                    extList[i] = newline_i
                        sentenceToExtractionMap.update({sentNo: extList})
            ## This loop is for coref output of the type: same pronoun --> multiple nouns in one sentence
            for mlist in multiPointerList:
                pronoun = mlist[0]
                noun1 = mlist[1]
                noun2 = mlist[2]
                start1 = mlist[3]
                start2 = mlist[4]
                extList = sentenceToExtractionMap.get(sentNo)
                if extList != None:
                    for i in range(1, len(extList), 1):
                        # all the extractions of a sentence; replace in all of them
                        line_i = extList[i]
                        score, ereList = getRelationAndEntity(line_i)
                        if ereList != None:
                            noun = getProperNoun(pronoun, noun1, noun2, ereList, sentNo, corenlpData, start1, start2)
                            score, ereList = getRelationAndEntity(line_i)
                            if ereList != None:
                                if noun == None:
                                    noun = noun1
                                isReplace, l_index, w_index = ReplacingRules(ereList, noun, pronoun)
                                if isReplace == True and w_index >= 0:
                                    derefString = ereList[l_index]
                                    stringToken = word_tokenize(derefString)
                                    if noun in setForReplacement:
                                        stringToken[w_index] = primaryEnt
                                        replaceList.append([pronoun, primaryEnt])
                                    else:
                                        stringToken[w_index] = noun
                                        replaceList.append([pronoun, noun])
                                    ereList[l_index] = ' '.join(stringToken)
                                    newline_i = score + ': (' + ereList[0] + ';' + ereList[1] + ';' + ereList[2] + ')'
                                    extList[i] = newline_i
                                elif isReplace == True and w_index == -1:
                                    derefString = ereList[l_index]
                                    if noun in setForReplacement:
                                        derefString = derefString.replace(pronoun, primaryEnt)
                                        replaceList.append([pronoun, primaryEnt])
                                    else:
                                        derefString = derefString.replace(pronoun, noun)
                                        replaceList.append([pronoun, noun])
                                    ereList[l_index] = derefString
                                    newline_i = score + ': (' + ereList[0] + ';' + ereList[1] + ';' + ereList[2] + ')'
                                    extList[i] = newline_i
                    sentenceToExtractionMap.update({sentNo: extList})
        #######################################################
        ##           Write the output to the files           ##
        #######################################################
        xmlfileName = str(dicts) + '.txt'
        printToFile(sentenceToExtractionMap, sentencewiseCorefResultDict, xmlfileName)
def webScraping(entToSearch, queryStrings, extractTriples):
    global ent_search_wiki_url
    global reject
    entTypeList = []
    tempSet = set()
    valuesToSearch = set()
    valuesToSearch.add(entToSearch)
    for q in queryStrings:
        valuesToSearch.add(q)
    # get the wikipedia url of the primary entity;
    # reject other wikipedia urls
    wikiPageTitleObj = mdb.mongodbDatabase('wikiPageTitle', 'wikiPageDB')
    wikiPageTitle_col = wikiPageTitleObj.docCollection
    link_str = entToSearch + ' ' + ' '.join(queryStrings)
    wiki_url = 'http://en.wikipedia.org/wiki/'
    link_ent_dict = link_entities(link_str.lower(), wikiPageTitle_col)
    ent_title = link_ent_dict[entToSearch.lower()]
    wikiPageTitleObj.client.close()
    if len(ent_title) != 0:
        ent_search_wiki_url = wiki_url + ent_title
        print "wiki url of primary entity ******", ent_search_wiki_url, "*********"
    else:
        print "linked dict", link_ent_dict, "*******"
    reject = False
    # if len(valuesToSearch) == 0:
    #     valuesToSearch.add(entToSearch)
    processList = []
    q = Queue()
    fileCount = 1
    link_set = set()
    for qstr in valuesToSearch:
        if qstr == entToSearch:
            searchString = "\"" + entToSearch + "\""
        else:
            qstr = qstr.strip('\n')
            searchString = entToSearch + " " + qstr
        start_time = time.time()
        linksList_api = getLinks_api_search(searchString, 2)
        print("--- %s api ---" % (time.time() - start_time))
        start_time = time.time()
        linksList_cstm = getLinks_custom_search(searchString)  # last int to control the number of links
        print("--- %s custom ---" % (time.time() - start_time))
        start_time = time.time()
        linksList_cmu = getLinks_cmu_search(searchString)
        print("--- %s cmu ---" % (time.time() - start_time))
        # print "reminder--cmu search disabled"
        if linksList_api != None:
            for l in linksList_api:
                l = l.strip(' ')
                l = l.strip('\n')
                link_set.add(l)
        if linksList_cstm != None:
            print "kg ", len(linksList_cstm)
            for l in linksList_cstm:
                l = l.strip(' ')
                l = l.strip('\n')
                link_set.add(l)
        if linksList_cmu != None:
            print "cmu ", len(linksList_cmu)
            for l in linksList_cmu:
                l = l.strip(' ')
                l = l.strip('\n')
                link_set.add(l)
    print "link count :", len(link_set)
    # print link_set
    if link_set != None:
        for link in link_set:
            if validLink(link):
                print "^^^ added", link
                if extractTriples:
                    # call a function to do corenlp -> sentence construction -> ollie
                    newProc = Process(target=extractDataFromLink, args=[q, link, entToSearch, fileCount, extractTriples])
                else:
                    newProc = Process(target=extractSentencesFromLink, args=[q, link, entToSearch, fileCount, extractTriples])
                fileCount += 1
                processList.append(newProc)
                newProc.start()
    start = time.time()
    while time.time() - start <= TIMEOUT:
        if any(p.is_alive() for p in processList):
            time.sleep(1)  # just to avoid hogging the CPU
        else:
            # all the processes are done, break now
            break
    else:
        # we only enter this if we didn't 'break' above
        print("timed out, killing all processes")
        for p in processList:
            p.terminate()
def posNewRelations():
    fe_db = mdb.mongodbDatabase('final_triples')
    final_col = fe_db.docCollection
    flag = 0
    entNumber = 0
    keyList = nearEntityMapInCanopy.keys()
    outputEntityList = []
    outputLine = set()
    for i in keyList:
        ndlist = nearEntityMapInCanopy.get(i)
        for ndset in ndlist:
            if len(ndset) == 1:
                entNumber = ndset.pop()
                ndset.add(entNumber)
                mid1 = entity1ToFreebaseId.get(entNumber)
                mid2 = entity2ToFreebaseId.get(entNumber)
                if mid1 == None:
                    mid1 = ''
                if mid2 == None:
                    mid2 = ''
                ent_flag = oneOrTwo(ent1List[entNumber])
                if ent_flag:
                    fb1, filterEnt1 = JKFilterForNoun(ent1List[entNumber], 0)
                    fb2, filterEnt2 = JKFilterForNoun(ent2List[entNumber], 1)
                else:
                    fb1, filterEnt1 = JKFilterForNoun(ent1List[entNumber], 1)
                    fb2, filterEnt2 = JKFilterForNoun(ent2List[entNumber], 0)
                filterEnt1 = filterEnt1.strip()
                filterEnt2 = filterEnt2.strip()
                if (filterEnt1.lower()).count(entSearch.lower()) > 1:
                    filterEnt1 = entSearch
                if (filterEnt2.lower()).count(entSearch.lower()) > 1:
                    filterEnt2 = entSearch
                isPrimaryEnt1 = searchPrimaryEntity(filterEnt1)
                isPrimaryEnt2 = searchPrimaryEntity(filterEnt2)
                # print "is primary 1: ", isPrimaryEnt1
                # print "is primary 2: ", isPrimaryEnt2
                # if fb1:
                #     fb1 = checkValidNoun(filterEnt1)
                # if fb2:
                #     fb2 = checkValidNoun(filterEnt2)
                if fb1 and fb2 and len(filterEnt1) > 0 and len(filterEnt2) > 0 and (isPrimaryEnt1 or isPrimaryEnt2):
                    if not (filterEnt1 + " " + relList[entNumber] + " " + filterEnt2 in outputLine):
                        outputEntityList.append([filterEnt1, relList[entNumber], filterEnt2, probList[entNumber]])
                        outputLine.add(filterEnt1 + " " + relList[entNumber] + " " + filterEnt2)
                # outputEntityList.append([newEnt1,mid1,relList[entNumber],newEnt2,mid2,isPrimaryEnt])
            else:
                clusterList = clusterRelation(ndset)
                for subSets in clusterList:
                    if len(subSets) >= 1:
                        entNumber = SelectEntity(subSets)
                        mid1 = entity1ToFreebaseId.get(entNumber)
                        mid2 = entity2ToFreebaseId.get(entNumber)
                        if mid1 == None:
                            mid1 = ''
                        if mid2 == None:
                            mid2 = ''
                        ent_flag = oneOrTwo(ent1List[entNumber])
                        if ent_flag:
                            fb1, filterEnt1 = JKFilterForNoun(ent1List[entNumber], 0)
                            fb2, filterEnt2 = JKFilterForNoun(ent2List[entNumber], 1)
                        else:
                            fb1, filterEnt1 = JKFilterForNoun(ent1List[entNumber], 1)
                            fb2, filterEnt2 = JKFilterForNoun(ent2List[entNumber], 0)
                        # print "sent ", filterEnt1
                        # print "sent ", filterEnt2
                        filterEnt1 = filterEnt1.strip()
                        filterEnt2 = filterEnt2.strip()
                        if (filterEnt1.lower()).count(entSearch.lower()) > 1:
                            filterEnt1 = entSearch
                        if (filterEnt2.lower()).count(entSearch.lower()) > 1:
                            filterEnt2 = entSearch
                        isPrimaryEnt1 = searchPrimaryEntity(filterEnt1)
                        isPrimaryEnt2 = searchPrimaryEntity(filterEnt2)
                        # if fb1:
                        #     fb1 = checkValidNoun(filterEnt1)
                        # if fb2:
                        #     fb2 = checkValidNoun(filterEnt2)
                        if fb1 and fb2 and len(filterEnt1) > 0 and len(filterEnt2) > 0 and (isPrimaryEnt1 or isPrimaryEnt2):
                            if not (filterEnt1 + " " + relList[entNumber] + " " + filterEnt2 in outputLine):
                                outputEntityList.append([filterEnt1, relList[entNumber], filterEnt2, probList[entNumber]])
                                outputLine.add(filterEnt1 + " " + relList[entNumber] + " " + filterEnt2)
    # fw = open('extractions/' + entSearch + '/data/output/' + entSearch + 'outputEnt.csv', 'w')
    # fileWriter = csv.writer(fw)
    # fileWriter.writerows(outputEntityList)
    # fw.close()
    oldVal = final_col.find_one({'primaryEnt': entSearch})
    if oldVal == None:
        final_col.insert_one({'primaryEnt': entSearch, 'final-triples': outputEntityList})
    else:
        d = {'primaryEnt': entSearch, 'final-triples': outputEntityList}
        final_col.replace_one({'primaryEnt': entSearch}, d, upsert=True)
    fe_db.client.close()
def mapEtractionsToNell(line, entSearch):
    dbObj = mdb.mongodbDatabase('ent_type_collection')
    col = dbObj.docCollection
    nellExt = mdb.mongodbDatabase('map_collection')
    mapcol = nellExt.docCollection
    outputEntityList = []
    ent1 = line[0].strip()
    ent2 = line[2].strip()
    rel = line[1].strip()
    ent1type = getType(ent1.lower())
    ent2type = getType(ent2.lower())
    # print ent1, ent1type
    # print ent2, ent2type
    ent1type_hier = getTypeHierarchy(ent1type)
    ent2type_hier = getTypeHierarchy(ent2type)
    nellRelSet, freqDict = getRelation(rel, ent1type_hier, ent2type_hier)
    setDictList = [nellRelSet, freqDict]
    entType = 0
    relType = 0
    if ent1.lower() in entSearch or entSearch in ent1.lower():
        val = col.find_one({'ent': ent2.lower()})
        if val != None:
            entType = 1
        else:
            entType = 2
    else:
        val = col.find_one({'ent': ent1.lower()})
        if val != None:
            entType = 1
        else:
            entType = 2
    # print line
    # print ent1.lower(), "-->", ent1type_hier
    # print ent2.lower(), "-->", ent2type_hier
    if len(nellRelSet) == 0:
        relType = 2
    else:
        relType = 1
    newFact = 1
    isnew = mapcol.find({'ent1': ent1})
    if isnew != None:
        for facts in isnew:
            nelRel = facts.get('rel')
            nelEnt2 = facts.get('ent2')
            if nelRel == rel and nelEnt2 == ent2:
                newFact = 0
    fact = ent1 + " " + rel
    outputEntityList.append(ent1)
    outputEntityList.append(rel)
    # for nr in nellRelSet:
    #     print rel, " --type-- ", nr
    #     outputEntityList.append(nr)
    # pick the NELL relation with the highest frequency
    mx = 0
    nellRel = ''
    for nr in freqDict.keys():
        count = freqDict.get(nr)
        if count > mx:
            mx = count
            nellRel = nr
    if nellRel != '':
        outputEntityList.append(nellRel)
    # outputEntityList.append(ent2)
    fact += " " + ent2
    if relType == 1 and entType == 1:
        extType = 1
    elif relType == 1 and entType == 2:
        extType = 2
    elif relType == 2 and entType == 1:
        extType = 3
    elif relType == 2 and entType == 2:
        extType = 4
    outputEntityList.append(extType)
    # if newFact == 1:
    #     outputEntityList.append('new')
    return outputEntityList