Example #1
def deleteRecord(entName):
    # dbObj = mdb.mongodbDatabase('doc_collection')
    # col = dbObj.docCollection
    # col.delete_many({"primaryEnt":entName})

    # dbObj_tmp = mdb.mongodbDatabase('tmp_collection')
    # col_tmp = dbObj_tmp.docCollection
    # col_tmp.delete_many({"primaryEnt":entName})

    dbObj_final_triples = mdb.mongodbDatabase('final_triples')
    col_final_triples = dbObj_final_triples.docCollection
    col_final_triples.delete_many({'primaryEnt':entName})

    dbObj_cluster_info = mdb.mongodbDatabase('cluster_info')
    col_cluster_info = dbObj_cluster_info.docCollection
    col_cluster_info.delete_many({'primaryEnt':entName})


    dbObj_all_ext_collection_new = mdb.mongodbDatabase('all_ext_collection_new')
    col_all_ext_collection_new = dbObj_all_ext_collection_new.docCollection
    col_all_ext_collection_new.delete_many({'primaryEnt':entName})

    dbObj_triples_collection = mdb.mongodbDatabase('triples_collection')
    col_triples_collection = dbObj_triples_collection.docCollection
    triples = col_triples_collection.find({'primaryEnt':entName})
    ids = set()
    for triple in triples:
        ids.add(triple.get('_id'))
    for del_id in ids:
        col_triples_collection.delete_one({'_id':del_id})
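# All of the examples on this page go through an "mdb.mongodbDatabase" wrapper that is
# not shown here; each instance exposes a "docCollection" handle plus the underlying
# "client". A minimal sketch of what such a wrapper might look like follows -- the module
# name, default database name and connection URI are assumptions, not taken from the
# original project.
from pymongo import MongoClient

class mongodbDatabase(object):
    def __init__(self, collection_name, db_name='extraction_db'):
        # assumed local MongoDB instance and default database name
        self.client = MongoClient('mongodb://localhost:27017/')
        self.docCollection = self.client[db_name][collection_name]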
def posNewRelations():
    cluster_obj = mdb.mongodbDatabase('cluster_info')
    cluster_col = cluster_obj.docCollection

    fe_db = mdb.mongodbDatabase('final_triples')
    final_col = fe_db.docCollection
    
    flag=0
    entNumber = 0
    keyList = nearEntityMapInCanopy.keys()
    outputEntityList = []
    outputLine = set()
    for i in keyList:
        ndlist = nearEntityMapInCanopy.get(i)
        for ndset in ndlist:
            # print "ndlist size",len(ndlist)
            if(len(ndset)==1):
                entNumber = ndset.pop()
                ndset.add(entNumber)
                curOutputList,outputLine = someRandomFunction(entNumber,outputLine,i)
                if curOutputList != None and len(curOutputList) != 0:
                    outputEntityList.append(curOutputList)
                    clusterone = cluster_col.find_one({'primaryEnt':entSearch,'url':urlIdList[entNumber],'key':i})
                    tmpdoc = {'primaryEnt':entSearch,'url':urlIdList[entNumber],'similar_facts':[curOutputList],'key':i}
                    if clusterone == None:
                        cluster_col.insert_one(tmpdoc)
                    else:
                        cluster_col.replace_one({'primaryEnt':entSearch,'url':urlIdList[entNumber],'key':i},tmpdoc,True)
            #outputEntityList.append([newEnt1,mid1,relList[entNumber],newEnt2,mid2,isPrimaryEnt])
            else:
                clusterList = clusterRelation(ndset)
                for subSets in clusterList:
                    if(len(subSets)>=1):
                        entNumber = SelectEntity(subSets)
                        curOutputList,outputLine = someRandomFunction(entNumber,outputLine,i)
                        if curOutputList != None and len(curOutputList) != 0:
                            outputEntityList.append(curOutputList)
                            allOutputList = []
                            # print "len of set",len(subSets)
                            for eno in subSets:
                                allOutputList.append([ent1List[eno],relList[eno],ent2List[eno],probList[eno],urlIdList[eno]])

                            clusterone = cluster_col.find_one({'primaryEnt':entSearch,'url':urlIdList[entNumber],'key':i})
                            if clusterone == None:
                                cluster_col.insert_one({'primaryEnt':entSearch,'url':urlIdList[entNumber],'similar_facts':allOutputList,'key':i})
                            else:
                                cluster_col.replace_one({'primaryEnt':entSearch,'url':urlIdList[entNumber],'key':i},{'primaryEnt':entSearch,'url':urlIdList[entNumber],'similar_facts':allOutputList,'key':i},True)
    #fw = open('extractions/'+ entSearch+'/data/output/'+entSearch+'outputEnt.csv', 'w')
    #fileWriter = csv.writer(fw)
    #fileWriter.writerows(outputEntityList)
    #fw.close()
    oldVal = final_col.find_one({'primaryEnt':entSearch})
    if oldVal == None:
        final_col.insert_one({'primaryEnt':entSearch,'final-triples':outputEntityList})
    else:
        d = {'primaryEnt':entSearch,'final-triples':outputEntityList}
        final_col.replace_one({'primaryEnt':entSearch},d,True)
    cluster_obj.client.close()
    fe_db.client.close()
def getRelation(relPhrase,type1, type2):
    dbObj = mdb.mongodbDatabase('nell_collection')
    col = dbObj.docCollection

    typeDbObj = mdb.mongodbDatabase('ontology_collection')
    typeCol = typeDbObj.docCollection

    words = relPhrase.split(' ')
    min_count = 1000000   # some big number. I am too optimistic here by selecting such a big number
    reqWord = None
    finalNellRelations = set()
    # pick the word of the phrase with the fewest candidate relations,
    # e.g. between "want" and "to" in the relation phrase "want to"
    for w in words:
        val = col.find_one({'word':w})
        if val != None:
            relList = val.get('list')
            if len(relList) < min_count:
                reqWord = w
                min_count = len(relList)
    nellRelDict = {}
    if reqWord != None:
        #print "req word for ", relPhrase , " is ", reqWord 
        # get list of phrases containing 'reqword' 
        val = col.find_one({'word':reqWord})
        if val != None:
            relList = val.get('list')
            for rel in relList:
                isPresent = searchWord(relPhrase, rel[0])
                if isPresent:
                    finalNellRelations.add(rel[1])
                    #print "nell relation for ", relPhrase, " is ", rel[1]
                    d = typeCol.find_one({'rel':rel[1].lower()}) #get type of the nell relation
                    if d != None:
                        nellType1 = d.get('domain')
                        nellType2 = d.get('range')
##                        if relPhrase == "moved to":
##                            print nellType1, " ", nellType2, " ", rel[1]," ",relPhrase
                        if type1 == None:
                            type1 = []
                        if type2 == None:
                            type2 = []
                        if (nellType1 in type1) and (nellType2 in type2):
                            freq = nellRelDict.get(rel[1])
                            if freq == None:
                                freq = 1
                            else:
                                freq = freq + 1
                            nellRelDict.update({rel[1]:freq})
                            

    return finalNellRelations, nellRelDict
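# getRelation above depends on a searchWord helper that is not part of this snippet.
# A minimal sketch, assuming it simply checks that every token of the NELL surface
# phrase occurs in the extracted relation phrase (the exact matching rule used by the
# original code is an assumption):
def searchWord(relPhrase, nellPhrase):
    relTokens = set(relPhrase.lower().split(' '))
    for token in nellPhrase.lower().split(' '):
        if token not in relTokens:
            return False
    return True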
def collectEntities(primaryEnt, url):
    print "inside getent"
    global dbObj
    dbObj = mdb.mongodbDatabase("triples_collection")

    allExt = mdb.mongodbDatabase("all_ext_collection")
    allExtCol = allExt.docCollection
    extObj = allExtCol.find_one({"primaryEnt": primaryEnt, "url": url})
    if extObj == None:
        print "No extractions", primaryEnt
        return None

    data = extObj.get("extList")
    ent1List = []
    ent2List = []
    relList = []
    probList = []

    for line in data:
        line = line.encode("utf-8", "ignore")
        if len(line) > 1:  # if the line has some string
            result = getRelationAndEntity(line)
            if result != None:
                ereList = result.split("_")
                if len(ereList[2].split(" ")) < 7 and len(ereList[3].split(" ")) < 8:
                    e2 = ereList[3].strip()
                    r = ereList[2].strip()
                    try:
                        words1 = word_tokenize(e2)
                        postag1 = nltk.pos_tag(words1)
                        if len(postag1) > 0:
                            w1 = postag1[0]
                            if w1[1] == "IN" or w1[1] == "PREP" or w1[1] == "TO":
                                tmp = e2.split(" ")
                                e2 = " ".join(tmp[1:])
                                r = r + " " + str(tmp[0])
                                ent1List.append(ereList[1])
                                # store ent1, rel and ent2
                                ent2List.append(e2)
                                relList.append(r)
                                probList.append(ereList[0])
                                # print ereList[1], " --> ", r
                            else:
                                probList.append(ereList[0])
                                ent1List.append(ereList[1])
                                # store ent1, rel and ent2
                                ent2List.append(ereList[3])
                                relList.append(ereList[2])
                    except Exception, e:
                        print "error ", e
def collectEntities(primaryEnt):
    global dbObj
    dbObj = mdb.mongodbDatabase('triples_collection')
    allExt = mdb.mongodbDatabase('all_ext_collection')
    allExtCol = allExt.docCollection
    extObj = allExtCol.find_one({'primaryEnt':primaryEnt})
    if extObj == None:
        print "No extractions", primaryEnt
        return None
    
##    p = file('input/final/scientists.txt', 'a')
##    p.write(primaryEnt)
##    p.close()

    data = extObj.get('extList')
    #data = open(filename).readlines()
    ent1List = []
    ent2List = []
    relList = []
    probList = []
    for line in data:
        if len(line) > 1:                                 # if the line has some string
            result = getRelationAndEntity(line)
            if result != None:
                ereList = result.split("_")
                if len(ereList[2].split(' ')) < 7 and len(ereList[3].split(' ')) < 8:
                    e2 = ereList[3].strip()
                    r = ereList[2].strip()
                    try:
                        words1 = word_tokenize(e2)
                        postag1 = nltk.pos_tag(words1)
                        if len(postag1) > 0:
                            w1 = postag1[0]
                            if w1[1] == "IN" or w1[1] == "PREP" or w1[1] == "TO":
                                tmp = e2.split(' ')
                                e2 = ' '.join(tmp[1:])
                                r = r + " " + str(tmp[0])
                                ent1List.append(ereList[1])       # store ent1, rel and ent2
                                ent2List.append(e2)
                                relList.append(r)
                                probList.append(ereList[0])
                                #print ereList[1], " --> ", r
                            else:
                                probList.append(ereList[0])
                                ent1List.append(ereList[1])       # store ent1, rel and ent2
                                ent2List.append(ereList[3])
                                relList.append(ereList[2])
                    except Exception,e:
                        print "error ",e
def entityClusterAndNormalise(ent):
    global entSearch
    global goalEntity	
    global entity1ToFreebaseId
    global entity2ToFreebaseId
    global ent1List
    global ent2List
    global relList
    global newRelation
    global nearEntityMapInCanopy
    global wordCountDict
    global wordToCanopyNo
    global canopySetDict
    global entityToCanopyMap
    global clusterSetInCanopy
    global newEntityList
    global dbObj
    
    dbObj =  mdb.mongodbDatabase('triples_collection')
    ent1List = []
    ent2List = []
    relList = []
    newRelation = {}
    nearEntityMapInCanopy = {}
    entity1ToFreebaseId = {}
    entity2ToFreebaseId = {}
    wordCountDict={}
    wordToCanopyNo = {}
    canopySetDict = {}
    entityToCanopyMap = {}
    clusterSetInCanopy = {}
    newEntityList = []
    goalEntity = []

    entSearch  = ent
    words = word_tokenize(entSearch)
    postag = pos_tag(words)
    for w1 in postag:
        if((w1[1].startswith("NN") or w1[1] == "JJ" or w1[1]=="CD") and len(w1[0]) > 1):
            goalEntity.append(w1[0].lower())
    if len(goalEntity)==0:
        print "no key word in goal entity",entSearch
        return
    InitialSetup()
    entity1ToFreebaseId = searchClueweb(ent1List,entity1ToFreebaseId)
    entity2ToFreebaseId = searchClueweb(ent2List,entity2ToFreebaseId)
    clusterInCanopy()
    MergeClusters()
    posNewRelations()
    printTheGraph()
    dbObj.client.close()
def getTypeHierarchy(enttype):
    dbObj = mdb.mongodbDatabase('type_hierarchy_collection')
    col = dbObj.docCollection
    allTypeList = []
    for ent in enttype:
        allTypeList.append(ent)

        val = col.find_one({'ent':ent})
        if val != None:
            tl = val.get('typelist')
            for t in tl:
                if t not in allTypeList:
                    allTypeList.append(t)
    return allTypeList
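# Hypothetical usage of getTypeHierarchy, assuming 'type_hierarchy_collection' holds
# documents shaped like {'ent': 'politician', 'typelist': ['person', 'agent']}:
#
#   getTypeHierarchy(['politician'])   # -> ['politician', 'person', 'agent']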
def inference_test(entSearch):
    global finalList
    print "inside linking stage"
    dbObj = mdb.mongodbDatabase('final_triples')
    col = dbObj.docCollection
    entList = []
    vals = col.find_one({'primaryEnt':entSearch})
    if vals == None:
        print "No extractions"
        count = 0
        dbObj.client.close()
        return False
    else:
        data = vals.get('final-triples')
        if len(data) > 0 :
            getNellRelations(data,entSearch)

            nellMapObj = mdb.mongodbDatabase('nell_mapped_triples_collection')
            nellMapCol = nellMapObj.docCollection

            oldTriples = nellMapCol.find_one({'primaryEnt':entSearch})
            if oldTriples == None:
                nellMapCol.insert_one({'primaryEnt':entSearch, 'mapped-triples':finalList})
            else:
                nellMapCol.replace_one({'primaryEnt':entSearch},{'primaryEnt':entSearch, 'mapped-triples':finalList},True)

            # outputFileName = 'output/'+entSearch.replace(' ','_') +'.csv'
            # fw = open(outputFileName, 'w')
            # fileWriter = csv.writer(fw)
            # fileWriter.writerows(finalList)
            # finalList = []
            # fw.close()
            finalList = []
            nellMapObj.client.close()
            dbObj.client.close()
            return True
        # no final triples stored for this entity: close the connection and report failure
        dbObj.client.close()
        return False
def extractDataFromLink(queue, urls, filename, fileCount):
    dbObj = mdb.mongodbDatabase('doc_collection')
    docs = dbObj.docCollection
    down_doc = docs.find_one({'url':urls,'primaryEnt':filename})

    if(down_doc == None or (down_doc['documents'] == None) or len(down_doc['documents'])==0):
        try:
        #print "down load docs for ",urls
            cleanText = ''
            if(urls.endswith('.pdf')):
                print "############# found pdf #############"
                proxy_support = urllib2.ProxyHandler({"http":"proxy.iisc.ernet.in:3128"})
                opener = urllib2.build_opener(proxy_support)
                urllib2.install_opener(opener)
                # download the pdf to a temporary file (literally named 'filename') and convert it to text
                with open('filename','wb') as f:
                    f.write(urllib2.urlopen(urls).read())
                content = convert('filename')
                cleanText = content.encode('utf-8','ignore')
            else:
                extractor = Extractor(extractor='ArticleExtractor', url=urls)
                extracted_text = extractor.getText()
                cleanText = cleanTheExtraction(extracted_text)

            sentenceList = tokenizer.tokenize(cleanText)    #get sentences

            if(len(sentenceList) > minLen):           # write to a file if the extraction size is greater than min no. of sentences
                curFile = filename+str(fileCount)+'.txt'
                senList = []
                for l in sentenceList:
                    newl = l.encode('utf-8','ignore')
                    senList.append(newl)

                document = {'url': urls, 'documents':senList, 'primaryEnt':filename}
                if down_doc == None:
                    post_id = docs.insert_one(document) #.inserted_id
                else:
                    docs.replace_one({'url': urls, 'primaryEnt':filename},document,True)

                sentenceString = ' '.join(sentenceList)
                getTripleList(sentenceString,urls,filename)# call a function to do corenlp->sentcreate->ollie
        except Exception, e:
            print "error in boilerpipe code: ",e," url: ", urls
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            print(exc_type, fname, exc_tb.tb_lineno)
def inference_test(entSearch):
    global finalList
    dbObj = mdb.mongodbDatabase('final_triples')
    col = dbObj.docCollection
    entList = []
    vals = col.find_one({'primaryEnt':entSearch})
    if vals == None:
        print "No extractions"
        count = 0
    else:
        data = vals.get('final-triples')
        if len(data) > 0 :
            getNellRelations(data,entSearch)
            outputFileName = entSearch.replace(' ','_') +'.csv'
            fw = open(outputFileName, 'w')
            fileWriter = csv.writer(fw)
            fileWriter.writerows(finalList)
            finalList = []
            fw.close()
    dbObj.client.close()
def getTripleList(sentenceList,url,priEnt):
    global currentUrl
    global primaryEnt
    global dbObj
    dbObj = mdb.mongodbDatabase('tmp_collection')
    col = dbObj.docCollection 
    
    currentUrl = url
    primaryEnt = priEnt
    if col.find_one({'url':url,'primaryEnt':priEnt}) == None:
        print "calling coref resolution for ",url
        corefResolution(sentenceList)
    else:
        oldval = col.find_one({'url':url,'primaryEnt':priEnt})
        openieobj = oldval.get('openie')
        if len(openieobj) == 0 or openieobj == '':
            print "calling openie for", primaryEnt, "url ", url
            tmpobj = col.find_one({'url':url,'primaryEnt':priEnt})
            corefdata = tmpobj.get('corenlp')
            SentenceConstructionFromXML(corefdata)
    dbObj.client.close()
def getType(ent):
    dbObj = mdb.mongodbDatabase('ent_type_collection')
    col = dbObj.docCollection
    val = col.find_one({'ent':ent})
    if val != None:
        return val.get('type')
    else:
        suffix = ent
        t = None
        while t == None:
            try:
                words = suffix.split(' ')
                suffix = ' '.join(words[1:])
                if len(suffix) == 0:            # ran out of suffixes; no known type
                    dbObj.client.close()
                    return None
                t = col.find_one({'ent':suffix})
                if t != None:
                    dbObj.client.close()
                    return t.get('type')
            except Exception,e:
                dbObj.client.close()
                print "GetType Error",e
                return None
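# Hypothetical usage of getType, assuming 'ent_type_collection' maps lower-cased entity
# strings to NELL-style type lists. When the full string is unknown, the loop above
# retries progressively shorter suffixes ("barack obama", then "obama") until one matches:
#
#   getType('president barack obama')   # -> e.g. ['politician', 'person']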
def extractDataFromLink(queue, urls, filename, fileCount):
    dbObj = mdb.mongodbDatabase('doc_collection')
    docs = dbObj.docCollection
    down_doc = docs.find_one({'url':urls,'primaryEnt':filename})
    
    if(down_doc == None or (down_doc['documents'] == None) or len(down_doc['documents'])==0):
        try:
            extractor = Extractor(extractor='ArticleExtractor', url=urls)
            extracted_text = extractor.getText()
            cleanText = cleanTheExtraction(extracted_text)
            sentenceList = tokenizer.tokenize(cleanText)    #get sentences
            
            if(len(sentenceList) > minLen):           # write to a file if the extraction size is greater than min no. of sentences
                curFile = filename+str(fileCount)+'.txt'
                # p = file('/tmp/extractions/'+curFile, 'w')
                filteredList = []
                for s in sentenceList:
                    try:
                        # keep only sentences whose first character is a printable letter or digit
                        if ord(s[0]) < 48 or ord(s[0]) > 122:
                            continue
                        print "@@@@@",s
                        filteredList.append(s)
                #         p.write(s)
                #         p.write(" ")
                    except:
                        continue
                # p.close()
                sentenceList = filteredList

                document = {'url': urls, 'documents':sentenceList, 'primaryEnt':filename}
                if down_doc == None:
                    post_id = docs.insert_one(document) #.inserted_id
                else:
                    docs.replace_one({'url': urls, 'primaryEnt':filename},document,True)
                
                sentenceString = ' '.join(sentenceList)
                getTripleList(sentenceString,urls,filename)# call a function to do corenlp->sentcreate->ollie   		
        except Exception, e:
            # print "whats the error ",e
            # print urls
            pass
def printToFile(sentenceToExtractionMap,sentencewiseCorefResultDict,xmlFN):
    global allExt
    allExt = mdb.mongodbDatabase('all_ext_collection')
    allExtCol = allExt.docCollection
    
    extObj = allExtCol.find_one({'primaryEnt':primaryEnt})
    if extObj == None:
        finalList = []
    else:
        finalList = extObj.get('extList')
    keySet = sentenceToExtractionMap.keys()
    for elemnt in keySet:
        extlist = sentenceToExtractionMap.get(elemnt)
        for e in range(0,len(extlist),1):
            if len(extlist[e]) > 0 and extlist[e] != '\n':
                extractionLine = extlist[e].strip('\n')
                finalList.append(extractionLine)
    if extObj == None:
        allExtCol.insert_one({'primaryEnt':primaryEnt, 'extList':finalList})
    else:
        d = {'primaryEnt':primaryEnt, 'extList':finalList}
        allExtCol.replace_one({'primaryEnt':primaryEnt},d,True)
    allExt.client.close()
def mapEtractionsToNell(q,line, entSearch):
    dbObj = mdb.mongodbDatabase('ent_type_collection')
    col = dbObj.docCollection
    nellExt = mdb.mongodbDatabase('map_collection')
    mapcol = nellExt.docCollection
    # print "line",line
    if len(line) >= 6:
        outputEntityList = []
        ent1 = line[0].strip()
        ent2 = line[2].strip()
        rel = line[1].strip()
        url = line[4]
        clusterID = line[5]

        ent1type = getType(ent1.lower())
        ent2type = getType(ent2.lower())
        # print ent1, ent1type
        # print ent2, ent2type
        
                
        ent1type_hier = getTypeHierarchy(ent1type)
        ent2type_hier = getTypeHierarchy(ent2type)

        nellRelSet, freqDict = getRelation(rel, ent1type_hier, ent2type_hier)
        setDictList = [nellRelSet,freqDict]

        entType = 0
        relType = 0
        if ent1.lower() in entSearch or entSearch in ent1.lower():
            val = col.find_one({'ent':ent2.lower()})
            if val != None:
                entType = 1
            else:
                entType = 2
        else:
            val = col.find_one({'ent':ent1.lower()})
            if val != None:
                entType = 1
            else:
                entType = 2

    ##    print line
    ##    print ent1.lower(), "-->", ent1type_hier
    ##    print ent2.lower(), "-->", ent2type_hier
        if len(nellRelSet) == 0:
            relType = 2
        else:
            relType = 1

        newFact = 1
        isnew = mapcol.find({'ent1':ent1})
        if isnew != None:
            for facts in isnew:
                nelRel = facts.get('rel')
                nelEnt2 = facts.get('ent2')
                if nelRel == rel and nelEnt2==ent2:
                    newFact = 0
        fact = ent1 + " " + rel
        outputEntityList.append(ent1)
        outputEntityList.append(rel)

    ##    for nr in nellRelSet:
    ##        #print rel, " --type-- ", nr
    ##        outputEntityList.append(nr)

        mx = 0
        nellRel = ''
        predUrl = ''
        for nr in freqDict.keys():
            count = freqDict.get(nr)
            if count > mx:
                mx = count
                nellRel = nr
        if nellRel != '':
            outputEntityList.append(nellRel)
            predUrl = "http://rtw.ml.cmu.edu/rtw/kbbrowser/pred:"+nellRel
        else:
            outputEntityList.append('---')
    ##
        outputEntityList.append(ent2)
        fact += " "+ent2
        if relType == 1 and entType == 1:
            extType = 'KR-KE'
        elif relType == 1 and entType == 2:
            extType = 'KR-NE'
        elif relType == 2 and entType == 1:
            extType = 'NR-KE'
        elif relType == 2 and entType == 2:
            extType = 'NR-NE'

        nellurl = ''
        typeForurl = ''
        if extType == 'NR-KE' or extType == 'KR-KE':
            for t in ent2type:
                if 'thing' not in t:
                    typeForurl = t
                    break
            if len(typeForurl) > 0:
                ent2 = ent2.encode('utf-8','ignore').lower()
                nellurl = "http://rtw.ml.cmu.edu/rtw/kbbrowser/" + typeForurl.encode('utf-8','ignore').lower()+ ":" + '_'.join(ent2.split(' '))
                # print "nellurl",nellurl

        outputEntityList.append(extType)
        outputEntityList.append(url)
        outputEntityList.append(clusterID)
        outputEntityList.append(nellurl)
        outputEntityList.append(predUrl)
        # 0     1     2             3     4        5                6          7        8
        # ent1, rel, nellrelation, ent2, exttype, url of data ext, clusterid, nellurl, predurl
        q.put({rel:outputEntityList})
def ReplaceCorefPointers(primaryEntity):
    global primaryEnt
    global extractionList
    global replaceList
    global dbObj
    replaceList = []
    print "deref for ",primaryEntity
    primaryEnt = primaryEntity
    primaryEntDict = {}
    setForReplacement = set()
    
    filewiseInfoDict = {}

    primaryEntSet = set()
    primaryEntSet.add(primaryEntity)
    
    ollieOutput = "openieOutputFolder"
    corefOutput = "corefOutputFolder"
    
    dbObj = mdb.mongodbDatabase('tmp_collection')
    tempCol = dbObj.docCollection
    colList = tempCol.find({'primaryEnt':primaryEntity})
    key = 0
    for tmp in colList:
        ollieDataList = tmp['openie']
        corenlpData = tmp['corenlp']
        
        #initialise dictionaries
        sentenceToExtractionMap = {} 
        sentencewiseCorefResultDict = {}
        #initialise lists
        extractionList = []
                
        perSentenceData = []        # holds sentence + all extractions of a sentence from ollie output
        
        for ollie in ollieDataList:
            lines = ollie.split('\n')
            
            extractionList.append(lines)
        
        for extractionNo, elist in enumerate(extractionList):
            if(len(elist)>1 and elist[1] != "No extractions found.\n"):
                #print elist
                sentenceToExtractionMap.update({extractionNo:elist})

        corefOutputList = xmlParseCorefResult(corenlpData)      #call xml parser
        if(len(corefOutputList) != 0):
            corefPointerList = []
            listLen = len(corefOutputList)
            for i in range(listLen):
                l = corefOutputList[i]
                
                for j in range(len(l)):
                    nounprolist = []
                    c = l[j]
                    pro = c[0][0]
                    noun = c[1][0]
                    start = c[0][3]
                    end = c[0][4]
                    sentence = c[0][1] #replace pro in sentence at start-end
                    if primaryEntity in noun:
                        nounprolist.append(primaryEntity)
                    else:
                        nounprolist.append(noun)
                    nounprolist.append(pro)
                    nounprolist.append(start)
                    nounprolist.append(end)
                    corefPointerList = sentencewiseCorefResultDict.get(sentence)
                    if corefPointerList == None:
                        corefPointerList = []
                    corefPointerList.append(nounprolist)
                    sentencewiseCorefResultDict.update({sentence:corefPointerList})
        if(len(corefOutputList) == 0):
            print "No coreference found for "
        
        for sentNo in sentencewiseCorefResultDict.keys():
            corefPointerList = sentencewiseCorefResultDict.get(sentNo)
            for npl in corefPointerList:
                if len(npl) == 4:
                    if npl[1].lower().strip() in primaryEntSet:
                        #print npl[1]
                        l = primaryEntDict.get(npl[1])
                        if l == None:
                            l = set()
                            l.add(npl[0])
                            primaryEntDict.update({npl[1]:l})     
                        else:
                            l.add(npl[0])
                            primaryEntDict.update({npl[1]:l})
        #print "ped " + str(primaryEntDict)
        for pi in primaryEntDict.keys():
            l = primaryEntDict.get(pi)
            if primaryEnt in l:
                for ent in l:
                    setForReplacement.add(ent)
                setForReplacement.add(pi)
        dictlist = []
        dictlist.append(sentencewiseCorefResultDict)
        dictlist.append(sentenceToExtractionMap)
        filewiseInfoDict.update({key:dictlist})
        key = key + 1
    nounAfterDict = {}
    for dicts in filewiseInfoDict.keys():
        dictlist = filewiseInfoDict.get(dicts)
        sentencewiseCorefResultDict = dictlist[0]
        sentenceToExtractionMap = dictlist[1]           
        for sentNo in sentencewiseCorefResultDict.keys():
            corefPointerListFull = sentencewiseCorefResultDict.get(sentNo)
            multiPointerList,corefPointerList = multiplePronoun(corefPointerListFull)
            for nounprolist in corefPointerList:
                if len(nounprolist) != 0:    
                    noun = nounprolist[0].strip()
                    pronoun = nounprolist[1].strip()
                    #print "noun ", noun, " pronoun ", pronoun
                    extList = sentenceToExtractionMap.get(sentNo)
                    
                    if(extList != None):
                        #print "extList len ", len(extList)
                        for i in range(1,len(extList),1):       # all the extractions of a sentence. Replace in all the sentences.
                            line_i = extList[i]
                            score,ereList = getRelationAndEntity(line_i)
                            #print ereList
                            if(ereList != None):
                                if(len(word_tokenize(noun))>5):
                                    #print "noun before ", noun
                                    nounafter = nounAfterDict.get(noun)
                                    if nounafter == None:
                                        try:
                                            nounafter = getNounDependency(noun)#get strings connected to root word
                                            nounafter = nounafter.strip()
                                            nounAfterDict.update({noun:nounafter})
                                            noun = nounafter
                                            isReplace, l_index, w_index = ReplacingRules(ereList,noun,pronoun)
                                        except Exception,e:
                                            nounafter = ''
                                            isReplace = False
                                            l_index = 0
                                            w_index = 0
                                            #nounAfterDict.update({noun:nounafter})
                                            noun = nounafter
                                            print "len loop error",e    
                                    else:
                                        noun = nounafter
                                        isReplace, l_index, w_index = ReplacingRules(ereList,noun,pronoun)
                                    #print "noun after", noun

                                else:
                                    isReplace, l_index, w_index = ReplacingRules(ereList,noun,pronoun)
                                if(isReplace==True and w_index >=0):
                                    derefString = ereList[l_index]
                                    stringToken = word_tokenize(derefString)
                                    if noun in setForReplacement:
                                        stringToken[w_index]=primaryEnt
                                        replaceList.append([pronoun,primaryEnt])
                                    else:
                                        stringToken[w_index]=noun
                                        replaceList.append([pronoun,noun])
                                    
                                    ereList[l_index] = ' '.join(stringToken)
                                    newline_i = score+': ('+ereList[0] + ';'+ereList[1] + ';'+ereList[2] + ')'
                                    extList[i] = newline_i
                                elif(isReplace==True and w_index == -1):
                                    derefString = ereList[l_index]
                                    if noun in setForReplacement:
                                        replaceList.append([pronoun,primaryEnt])
                                        derefString = derefString.replace(pronoun,primaryEnt)
                                    else:
                                        replaceList.append([pronoun,noun])
                                        derefString = derefString.replace(pronoun,noun)
                                    
                                    ereList[l_index] = derefString
                                    newline_i = score+': ('+ereList[0] + ';'+ereList[1] + ';'+ereList[2] + ')'
                                    extList[i] = newline_i

                        sentenceToExtractionMap.update({sentNo:extList})

## This loop is for coref output of type: same pronoun--> multiple nouns in one sentence
            for mlist in multiPointerList:
                pronoun = mlist[0]
                noun1 = mlist[1]
                noun2  = mlist[2]
                start1 = mlist[3]
                start2 = mlist[4]
                extList = sentenceToExtractionMap.get(sentNo)
                if(extList != None):
                    for i in range(1,len(extList),1):       # all the extractions of a sentence. Replace in all the sentences.
                        line_i = extList[i]
                        score,ereList = getRelationAndEntity(line_i)
                        if(ereList != None):
                            noun = getProperNoun(pronoun,noun1,noun2,ereList,sentNo,corenlpData,start1,start2)
                            score,ereList = getRelationAndEntity(line_i)
                            if(ereList != None):
                                if noun == None:
                                    noun = noun1
                                isReplace, l_index, w_index = ReplacingRules(ereList,noun,pronoun)
                                if(isReplace==True and w_index >=0):
                                    derefString = ereList[l_index]
                                    stringToken = word_tokenize(derefString)
                                    if noun in setForReplacement:
                                        stringToken[w_index]=primaryEnt
                                        replaceList.append([pronoun,primaryEnt])
                                    else:
                                        stringToken[w_index]=noun
                                        replaceList.append([pronoun,noun])
                                        
                                    ereList[l_index] = ' '.join(stringToken)
                                    newline_i = score+': ('+ereList[0] + ';'+ereList[1] + ';'+ereList[2] + ')'
                                    extList[i] = newline_i
                                elif(isReplace==True and w_index == -1):
                                    derefString = ereList[l_index]
                                    if noun in setForReplacement:
                                        derefString = derefString.replace(pronoun,primaryEnt)
                                        replaceList.append([pronoun,primaryEnt])
                                    else:
                                        derefString = derefString.replace(pronoun,noun)
                                        replaceList.append([pronoun,noun])
                                        
                                    ereList[l_index] = derefString
                                    newline_i = score+': ('+ereList[0] + ';'+ereList[1] + ';'+ereList[2] + ')'
                                    extList[i] = newline_i
                    sentenceToExtractionMap.update({sentNo:extList})
        
#######################################################
##           Write the output to the files           ##
#######################################################
        xmlfileName = str(dicts) +'.txt'
        printToFile(sentenceToExtractionMap,sentencewiseCorefResultDict,xmlfileName)
Example #17
def webScraping(entToSearch, queryStrings, extractTriples):
    global ent_search_wiki_url
    global reject
    entTypeList = []

    tempSet = set()
    valuesToSearch = set()
    valuesToSearch.add(entToSearch)
    for q in queryStrings:
        valuesToSearch.add(q)

    # to get the wikipedia url of the primary entity.
    # reject other wikipedia urls
    wikiPageTitleObj = mdb.mongodbDatabase('wikiPageTitle','wikiPageDB')
    wikiPageTitle_col = wikiPageTitleObj.docCollection

    link_str = entToSearch + ' ' + ' '.join(queryStrings)
    wiki_url = 'http://en.wikipedia.org/wiki/'
    link_ent_dict = link_entities(link_str.lower(),wikiPageTitle_col)
    ent_title = link_ent_dict[entToSearch.lower()]
    
    wikiPageTitleObj.client.close()
    
    if len(ent_title) != 0:
        ent_search_wiki_url = wiki_url + ent_title
        print "wiki url of primary entity ******",ent_search_wiki_url,"*********"
    else:
        print "linked dict",link_ent_dict,"*******"
        reject = False
    #if len(valuesToSearch) == 0:
        #valuesToSearch.add(entToSearch)

    fileCount = 1

    processList = []
    q = Queue()
    link_set = set()
    for qstr in valuesToSearch:
        if qstr == entToSearch:
            searchString = "\""+entToSearch+"\""
        else:
            qstr = qstr.strip('\n')
            searchString = entToSearch + " " + qstr
        start_time = time.time()
        linksList_api = getLinks_api_search(searchString,2)
        print("--- %s api ---" % (time.time() - start_time))
        start_time = time.time()
        linksList_cstm = getLinks_custom_search(searchString)   #last int to control the number of links
        print("--- %s custom ---" % (time.time() - start_time))
        start_time = time.time()
        linksList_cmu = getLinks_cmu_search(searchString)
        print("--- %s cmu ---" % (time.time() - start_time))
        #print "reminder--cmu search disabled"

        if linksList_api != None:
            for l in linksList_api:
                l = l.strip(' ')
                l = l.strip('\n')
                link_set.add(l)

        if linksList_cstm != None:
            print "kg ",len(linksList_cstm)
            for l in linksList_cstm:
                l = l.strip(' ')
                l = l.strip('\n')
                link_set.add(l)

        if linksList_cmu != None:
            print "cmu ",len(linksList_cmu)
            for l in linksList_cmu:
                l = l.strip(' ')
                l = l.strip('\n')
                link_set.add(l)

        print "link count :",len(link_set)
        # print link_set
    
    if link_set != None:
        for link in link_set:
            if validLink(link):
                print "^^^ added",link
                if extractTriples:
                    newProc = Process(target=extractDataFromLink, args=[q, link, entToSearch,fileCount, extractTriples])# call a function to do corenlp->sentcreate->ollie
                else:
                    newProc = Process(target=extractSentencesFromLink, args=[q, link, entToSearch,fileCount, extractTriples])# call a function to do corenlp->sentcreate->ollie
                fileCount += 1
                processList.append(newProc)
                newProc.start()
    
    start = time.time()
    while time.time() - start <= TIMEOUT:
        if any(p.is_alive() for p in processList):
            time.sleep(1)  # Just to avoid hogging the CPU
        else:
            # All the processes are done, break now.
            break
    else:
        # We only enter this if we didn't 'break' above.
        print("timed out, killing all processes")
        for p in processList:
            p.terminate()
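# Note on the loop above: this is Python's while/else -- the else branch (terminating
# every worker process) runs only when the loop condition goes false without a break,
# i.e. when at least one extraction process is still alive after TIMEOUT seconds.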
def posNewRelations():
    fe_db = mdb.mongodbDatabase('final_triples')
    final_col = fe_db.docCollection
    flag=0
    entNumber = 0
    keyList = nearEntityMapInCanopy.keys()
    outputEntityList = []
    outputLine = set()
    for i in keyList:
        ndlist = nearEntityMapInCanopy.get(i)
        for ndset in ndlist:
            if(len(ndset)==1):
                entNumber = ndset.pop()
                ndset.add(entNumber)
                mid1 = entity1ToFreebaseId.get(entNumber)
                mid2 = entity2ToFreebaseId.get(entNumber)
                if mid1 == None:
                    mid1 = ''
                if mid2 == None:
                    mid2 = ''
                
                ent_flag = oneOrTwo(ent1List[entNumber])
                if ent_flag:
                    fb1,filterEnt1 = JKFilterForNoun(ent1List[entNumber],0)
                    fb2,filterEnt2 = JKFilterForNoun(ent2List[entNumber],1)
                else:
                    fb1,filterEnt1 = JKFilterForNoun(ent1List[entNumber],1)
                    fb2,filterEnt2 = JKFilterForNoun(ent2List[entNumber],0)
                
                filterEnt1 = filterEnt1.strip()
                filterEnt2 = filterEnt2.strip()
                if (filterEnt1.lower()).count(entSearch.lower()) > 1:
                    filterEnt1 = entSearch
                if (filterEnt2.lower()).count(entSearch.lower()) > 1:
                    filterEnt2 = entSearch
                
                isPrimaryEnt1 = searchPrimaryEntity(filterEnt1)
                isPrimaryEnt2 = searchPrimaryEntity(filterEnt2)
##                print "is primary 1: ",isPrimaryEnt1
##                print "is primary 2: ",isPrimaryEnt2
                
##                if fb1:
##                    fb1 = checkValidNoun(filterEnt1)
##                if fb2:
##                    fb2 = checkValidNoun(filterEnt2)

                if(fb1 and fb2 and len(filterEnt1)>0 and len(filterEnt2)>0 and (isPrimaryEnt1 or isPrimaryEnt2)):
                    if not (filterEnt1 + " " + relList[entNumber] + " " + filterEnt2 in outputLine):
                        outputEntityList.append([filterEnt1,relList[entNumber],filterEnt2,probList[entNumber]])
                        outputLine.add(filterEnt1 + " " + relList[entNumber] + " " + filterEnt2)
            #outputEntityList.append([newEnt1,mid1,relList[entNumber],newEnt2,mid2,isPrimaryEnt])
            else:
                clusterList = clusterRelation(ndset)
                for subSets in clusterList:
                    if(len(subSets)>=1):
                        entNumber = SelectEntity(subSets)
                        mid1 = entity1ToFreebaseId.get(entNumber)
                        mid2 = entity2ToFreebaseId.get(entNumber)
                        if mid1 == None:
                            mid1 = ''
                        if mid2 == None:
                            mid2 = ''   
                        ent_flag = oneOrTwo(ent1List[entNumber])
                        if ent_flag:
                            fb1,filterEnt1 = JKFilterForNoun(ent1List[entNumber],0)
                            fb2,filterEnt2 = JKFilterForNoun(ent2List[entNumber],1)
                        else:
                            fb1,filterEnt1 = JKFilterForNoun(ent1List[entNumber],1)
                            fb2,filterEnt2 = JKFilterForNoun(ent2List[entNumber],0)
##                        print "sent ", filterEnt1
##                        print "sent ", filterEnt2
                        filterEnt1 = filterEnt1.strip()
                        filterEnt2 = filterEnt2.strip()
                        if (filterEnt1.lower()).count(entSearch.lower()) > 1:
                            filterEnt1 = entSearch
                        if (filterEnt2.lower()).count(entSearch.lower()) > 1:
                            filterEnt2 = entSearch

                        isPrimaryEnt1 = searchPrimaryEntity(filterEnt1)
                        isPrimaryEnt2 = searchPrimaryEntity(filterEnt2)

##                        if fb1:
##                            fb1 = checkValidNoun(filterEnt1)
##                        if fb2:
##                            fb2 = checkValidNoun(filterEnt2)
                        
                        if(fb1 and fb2 and len(filterEnt1)>0 and len(filterEnt2)>0 and (isPrimaryEnt1 or isPrimaryEnt2)):
                            if not (filterEnt1 + " " + relList[entNumber] + " " + filterEnt2 in outputLine):
                                outputEntityList.append([filterEnt1,relList[entNumber],filterEnt2,probList[entNumber]])
                                outputLine.add(filterEnt1 + " " + relList[entNumber] + " " + filterEnt2)
    #fw = open('extractions/'+ entSearch+'/data/output/'+entSearch+'outputEnt.csv', 'w')
    #fileWriter = csv.writer(fw)
    #fileWriter.writerows(outputEntityList)
    #fw.close()
    oldVal = final_col.find_one({'primaryEnt':entSearch})
    if oldVal == None:
        final_col.insert_one({'primaryEnt':entSearch,'final-triples':outputEntityList})
    else:
        d = {'primaryEnt':entSearch,'final-triples':outputEntityList}
        final_col.replace_one({'primaryEnt':entSearch},d,True)
    
    fe_db.client.close()
def mapEtractionsToNell(line, entSearch):
    dbObj = mdb.mongodbDatabase('ent_type_collection')
    col = dbObj.docCollection
    nellExt = mdb.mongodbDatabase('map_collection')
    mapcol = nellExt.docCollection

    outputEntityList = []
    ent1 = line[0].strip()
    ent2 = line[2].strip()
    rel = line[1].strip()
    
    ent1type = getType(ent1.lower())
    ent2type = getType(ent2.lower())
    # print ent1, ent1type
    # print ent2, ent2type
    ent1type_hier = getTypeHierarchy(ent1type)
    ent2type_hier = getTypeHierarchy(ent2type)
    
    nellRelSet, freqDict = getRelation(rel, ent1type_hier, ent2type_hier)
    setDictList = [nellRelSet,freqDict]
    
    entType = 0
    relType = 0
    if ent1.lower() in entSearch or entSearch in ent1.lower():
        val = col.find_one({'ent':ent2.lower()})
        if val != None:
            entType = 1
        else:
            entType = 2
    else:
        val = col.find_one({'ent':ent1.lower()})
        if val != None:
            entType = 1
        else:
            entType = 2
    
##    print line
##    print ent1.lower(), "-->", ent1type_hier
##    print ent2.lower(), "-->", ent2type_hier
    if len(nellRelSet) == 0:
        relType = 2
    else:
        relType = 1
    
    newFact = 1
    isnew = mapcol.find({'ent1':ent1})
    if isnew != None:
        for facts in isnew:
            nelRel = facts.get('rel')
            nelEnt2 = facts.get('ent2')
            if nelRel == rel and nelEnt2==ent2:
                newFact = 0
    fact = ent1 + " " + rel
    outputEntityList.append(ent1)
    outputEntityList.append(rel)

##    for nr in nellRelSet:
##        #print rel, " --type-- ", nr
##        outputEntityList.append(nr)
    
    mx = 0
    nellRel = ''
    for nr in freqDict.keys():
        count = freqDict.get(nr)
        if count > mx:
            mx = count
            nellRel = nr
    if nellRel != '':
        outputEntityList.append(nellRel)
##    
    outputEntityList.append(ent2)
    fact += " "+ent2
    if relType == 1 and entType == 1:
        extType = 1
    elif relType == 1 and entType == 2:
        extType = 2
    elif relType == 2 and entType == 1:
        extType = 3
    elif relType == 2 and entType == 2:
        extType = 4
    
    outputEntityList.append(extType)
##    if newFact == 1:
##        outputEntityList.append('new')
    return outputEntityList
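# The integer extType codes in this variant line up with the string labels used in the
# queue-based mapEtractionsToNell example above: 1 = KR-KE, 2 = KR-NE, 3 = NR-KE,
# 4 = NR-NE (presumably known/new NELL relation crossed with known/new entity).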