# NOTE: the functions below assume the modules imported by the original
# files (not shown in this excerpt): math, the utils/eventUtils helper
# modules, Collection, and collections.defaultdict.

def getIndicativeSentences(self, topK, intersectionTh):
    # Return the cached result if the sentences were already computed.
    if len(self.indicativeSentences) > 0:
        return self.indicativeSentences
    topToksTuples = self.indicativeWords[:topK]
    topToks = [k for k, _ in topToksTuples]
    for d in self.documents:
        self.sentences.extend(d.getSentences())
    impSents = {}
    for sent in self.sentences:
        if sent not in impSents:
            sentToks = utils.getTokens(sent)
            if len(sentToks) > 100:  # skip very long sentences (likely boilerplate)
                continue
            intersect = utils.getIntersection(topToks, sentToks)
            if len(intersect) > intersectionTh:
                impSents[sent] = len(intersect)
    # Sort by the number of indicative words each sentence contains.
    self.indicativeSentences = utils.getSorted(impSents.items(), 1)
    return self.indicativeSentences
def getIndicativeSentences(self, topK, intersectionTh):
    # Variant that keeps the actual set of overlapping indicative words per
    # sentence (not just the count) and sorts by overlap size.
    if len(self.indicativeSentences) > 0:
        return self.indicativeSentences
    topToksTuples = self.indicativeWords[:topK]
    topToks = [k for k, _ in topToksTuples]
    for d in self.documents:
        sents = d.getSentences()
        if sents:
            self.sentences.extend(sents)
    impSents = {}
    for sent in self.sentences:
        if sent not in impSents:
            sentToks = utils.getTokens(sent)
            if len(sentToks) > 100:  # skip very long sentences (likely boilerplate)
                continue
            intersect = utils.getIntersection(topToks, sentToks)
            if len(intersect) > intersectionTh:
                impSents[sent] = intersect
    if impSents:
        self.indicativeSentences = sorted(impSents.items(),
                                          key=lambda x: len(x[1]), reverse=True)
    return self.indicativeSentences
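# Illustrative sketch (not part of the original source): the sentence
# selection above in miniature, with plain-Python stand-ins for
# utils.getTokens and utils.getIntersection (assumed here to
# lowercase-tokenize and set-intersect, respectively).
def pick_indicative(sentences, top_toks, intersection_th):
    scored = {}
    for sent in sentences:
        toks = sent.lower().split()  # stand-in tokenizer
        overlap = set(top_toks) & set(toks)
        if len(overlap) > intersection_th:
            scored[sent] = overlap
    # Sentences covering more indicative words rank first.
    return sorted(scored.items(), key=lambda x: len(x[1]), reverse=True)

sents = ["A strong earthquake hit Chile on Tuesday",
         "Damage reports after the earthquake in Chile",
         "Unrelated sports news"]
print pick_indicative(sents, ['earthquake', 'chile', 'damage'], 1)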
def extractWebpageEventModel(self, text):
    webpageEventModel = {}
    entities = self.webpageEntities(text)
    if len(entities) > 1:
        # Pool the entity mentions from all indicative sentences.
        for sent in entities:
            dictval = sent[1]
            for k in dictval:
                if k in ["LOCATION", "Disaster", "DATE"]:
                    if k not in webpageEventModel:
                        webpageEventModel[k] = []
                    webpageEventModel[k].extend(dictval[k])
        for k in webpageEventModel:
            if k in ["LOCATION", "DATE"]:
                webpageEventModel[k] = dict(self.getEntitiesFreq(webpageEventModel[k]))
        # Re-count each disaster keyword over the full page text.
        webpageToks = eventUtils.getTokens(text)
        webpageDis = set(webpageEventModel.get('Disaster', []))
        webpageDisDic = {}
        for wd in webpageDis:
            webpageDisDic[wd] = webpageToks.count(wd)
        webpageEventModel['Disaster'] = webpageDisDic
    return webpageEventModel
def getWords(self):
    if self.words:
        return self.words
    r = utils.getTokens(self.text)
    if r:
        self.words = r
    return self.words
def calculate_similarity_equalWeights_duplicate(self, doc):
    eDisDic = self.entities['Topic']
    # Stem the location and date keys so lookups match stemmed document tokens.
    locToks = eventUtils.getStemmedWords(self.entities['LOCATION'].keys())
    locDic = dict(zip(locToks, self.entities['LOCATION'].values()))
    dToks = eventUtils.getStemmedWords(self.entities['DATE'].keys())
    dDic = dict(zip(dToks, self.entities['DATE'].values()))
    tokens = eventUtils.getTokens(doc)
    tokensDic = eventUtils.getFreq(tokens)
    wv = [1 + math.log(e) for e in tokensDic.values()]
    wvScalar = self.getScalar(wv)
    scores = []
    # Topic similarity; a document with no topic overlap scores 0 outright.
    ksd = 0
    for i in tokensDic:
        if i in eDisDic:
            ksd += (1 + math.log(eDisDic[i])) * (1 + math.log(tokensDic[i]))
    if ksd > 0:
        ksd = float(ksd) / (self.scalars['Topic'] * wvScalar)
    if ksd == 0:
        return 0
    scores.append(ksd)
    # LOCATION similarity.
    ksl = 0
    for i in tokensDic:
        if i in locDic:
            ksl += (1 + math.log(locDic[i])) * (1 + math.log(tokensDic[i]))
    if ksl > 0:
        ksl = float(ksl) / (self.scalars['LOCATION'] * wvScalar)
    scores.append(ksl)
    # DATE similarity.
    ks = 0
    for i in tokensDic:
        if i in dDic:
            ks += (1 + math.log(dDic[i])) * (1 + math.log(tokensDic[i]))
    if ks > 0:
        ks = float(ks) / (self.scalars['DATE'] * wvScalar)
    scores.append(ks)
    # Equal-weight average of the three component similarities.
    return sum(scores) / 3.0
def getWords(self):
    if self.words:
        return self.words
    r = utils.getTokens(self.text)
    if len(r) > 0:
        self.words = [w for w in r]
        return self.words
    return []
def calculate_score(self, doc):
    docWords = getTokens(doc)
    docTF = getFreq(docWords)
    sim = self.cosSim(docTF)
    if sim >= self.relevanceth:
        return [1, sim]
    return [0, sim]
def calculate_similarity(self, doc):
    # Sum the log-probabilities of the document tokens under each part of
    # the probabilistic event model (Topic, LOCATION, DATE).
    tokens = eventUtils.getTokens(doc)
    docProb = {}
    docProb['Topic'] = {}
    total = 0.0
    for t in tokens:
        if t in self.eDisDic:
            total = total + math.log(self.eDisDic[t])
    if total == 0.0:
        # No topic term matched; return a sentinel score.
        return -100
    docProb['Topic']['Total'] = total
    docProb['LOCATION'] = {}
    total = 0.0
    for t in tokens:
        if t in self.locDic:
            total = total + math.log(self.locDic[t])
    docProb['LOCATION']['Total'] = total
    docProb['DATE'] = {}
    total = 0.0
    for t in tokens:
        if t in self.dDic:
            total = total + math.log(self.dDic[t])
    docProb['DATE']['Total'] = total
    finalDocProb = 0.0
    for k in docProb:
        finalDocProb = finalDocProb + docProb[k]['Total']
    docProb['Total'] = finalDocProb
    # Log-probabilities are negative; flip the sign before returning.
    return finalDocProb * -1
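# Illustrative sketch (not part of the original source): the scoring idea
# above in miniature. The probabilities below are made-up; matched tokens
# contribute their log-probability per entity type, and the sign is flipped
# as in the method above.
import math

toy_model = {'Topic': {'earthquake': 0.12, 'magnitude': 0.05},
             'LOCATION': {'chile': 0.20},
             'DATE': {'2010': 0.15}}

def log_prob_score(tokens, model):
    total = 0.0
    for entProbs in model.values():
        for t in tokens:
            if t in entProbs:
                total += math.log(entProbs[t])
    return total * -1

print log_prob_score(['earthquake', 'chile', '2010'], toy_model)  # ~5.63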
def calculate_similarity(self, doc):
    eDisDic = self.entities["Disaster"]
    locToks = eventUtils.getStemmedWords(self.entities["LOCATION"].keys())
    locDic = dict(zip(locToks, self.entities["LOCATION"].values()))
    dToks = eventUtils.getStemmedWords(self.entities["DATE"].keys())
    dDic = dict(zip(dToks, self.entities["DATE"].values()))
    tokens = eventUtils.getTokens(doc)
    tokensDic = eventUtils.getFreq(tokens)
    wv = [1 + math.log(e) for e in tokensDic.values()]
    wvScalar = self.getScalar(wv)
    scores = []
    ksd = 0
    for i in tokensDic:
        if i in eDisDic:
            ksd += (1 + math.log(eDisDic[i])) * (1 + math.log(tokensDic[i]))
    if ksd > 0:
        ksd = float(ksd) / (self.scalars["Disaster"] * wvScalar)
    if ksd == 0:
        return 0
    scores.append(ksd)
    ksl = 0
    for i in tokensDic:
        if i in locDic:
            ksl += (1 + math.log(locDic[i])) * (1 + math.log(tokensDic[i]))
    if ksl > 0:
        ksl = float(ksl) / (self.scalars["LOCATION"] * wvScalar)
    scores.append(ksl)
    ks = 0
    for i in tokensDic:
        if i in dDic:
            ks += (1 + math.log(dDic[i])) * (1 + math.log(tokensDic[i]))
    if ks > 0:
        ks = float(ks) / (self.scalars["DATE"] * wvScalar)
    scores.append(ks)
    return sum(scores) / 3.0
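# Illustrative sketch (not part of the original source): each component of
# the score above is a cosine between (1 + log) weighted frequency vectors,
# assuming self.getScalar is the Euclidean norm of the weight vector.
import math

def log_cosine(entFreq, docFreq):
    dot = 0.0
    for t in docFreq:
        if t in entFreq:
            dot += (1 + math.log(entFreq[t])) * (1 + math.log(docFreq[t]))
    entNorm = math.sqrt(sum((1 + math.log(v)) ** 2 for v in entFreq.values()))
    docNorm = math.sqrt(sum((1 + math.log(v)) ** 2 for v in docFreq.values()))
    return dot / (entNorm * docNorm) if dot else 0.0

print log_cosine({'earthquake': 4, 'tsunami': 2}, {'earthquake': 2, 'chile': 1})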
def calculate_score(self, doc, m):
    # 'W' scores a whole webpage via its extracted entities; anything else
    # falls back to plain text similarity.
    if m == 'W':
        docEnt = eventUtils.getEntities(doc)[0]
        docEnt['Topic'] = eventUtils.getTokens(doc)
        score = self.getDocProb(docEnt)
    else:
        score = self.calculate_similarity(doc)
    return score
def calculate_score_AllDocs(self, doc):
    sims = []
    docWords = getTokens(doc)
    docTF = getFreq(docWords)
    # Project the document onto the collection's top vocabulary; absent
    # terms get 1/e so that 1 + log(1/e) == 0, i.e. zero weight.
    ndocTF = dict.fromkeys(self.topVocabDic)
    for k in ndocTF:
        if k in docTF:
            ndocTF[k] = docTF[k]
        else:
            ndocTF[k] = 1 / math.e
    for dTF in self.docsTF:
        sims.append(self.cosSim(ndocTF, dTF))
    # Relevant if the best match against any seed document passes the threshold.
    sim = max(sims)
    if sim >= self.relevanceth:
        return [1, sim]
    return [0, sim]
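# Why 1/math.e above: after the 1 + log(tf) weighting, a filled-in count of
# 1/e contributes exactly zero weight, so absent vocabulary terms do not
# affect the cosine.
import math
print 1 + math.log(1 / math.e)  # 0.0 (up to floating-point rounding)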
def webpageEntities(self,docText=""): disasters=set(self.entities["Disaster"].keys()) sentences = eventUtils.getSentences(docText) webpageEnts =[] for sent in sentences: sentToks = eventUtils.getTokens(sent) if len(sentToks) > 100: continue intersect = eventUtils.getIntersection(disasters, sentToks) if len(intersect) > self.intersectionTh: #print intersect sentEnts = eventUtils.getEntities(sent)[0] if sentEnts.has_key('LOCATION') or sentEnts.has_key('DATE'): sentEnts['Disaster'] = intersect webpageEnts.append((sent,sentEnts)) return webpageEnts
def getEM_Sents(wps):
    docsEntities = []
    docsEntitiesFreq = []
    entitiesProb = {}
    collSents = []
    for wp in wps:
        if 'text' not in wp:
            continue
        wpContent = wp['text'] + wp['title']
        # Drop empty lines left over from page extraction.
        wpFiltered = filter(None, wpContent.split('\n'))
        wpContentf = '\n'.join(wpFiltered)
        collSents.append(eventUtils.getSentences(wpContentf))
    allSents = []
    for sents in collSents:
        allSents.extend(sents)
    # Most frequent tokens across the whole collection.
    fw = eventUtils.getFreqTokens(allSents)
    fw = [w[0] for w in fw]
    collEventModelInsts = []
    for sents in collSents:
        filtEvtModelInsts = []
        for s in sents:
            sentToks = eventUtils.getTokens(s)
            cw = eventUtils.getIntersection(fw, sentToks)
            if len(cw) >= 2:
                emi = {'TOPIC': list(cw)}
                ents = eventUtils.getEntities(s)[0]
                if 'LOCATION' in ents:
                    emi['LOCATION'] = ents['LOCATION']
                if 'DATE' in ents:
                    emi['DATE'] = ents['DATE']
                filtEvtModelInsts.append(emi)
        collEventModelInsts.append(filtEvtModelInsts)
    # The remainder of this function was commented out in the original source.
def calculate_similarity_intersect(self, doc):
    tokens = eventUtils.getTokens(doc)
    doc_set = set(tokens)
    scores = []
    for k in self.entities:
        entSet = set(self.entities[k].keys())
        intersect = len(doc_set & entSet)
        union = len(doc_set | entSet)
        # A document that shares no disaster keyword is irrelevant outright.
        if k == "Disaster" and intersect == 0:
            return 0
        scores.append(intersect * 1.0 / union)  # Jaccard similarity
    # Average over the three entity types (Disaster, LOCATION, DATE).
    return sum(scores) / 3.0
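# Worked example of the Jaccard score above: intersection over union of the
# document token set and an entity key set.
doc_set = {'earthquake', 'chile', 'rescue'}
ent_set = {'earthquake', 'tsunami'}
print len(doc_set & ent_set) / float(len(doc_set | ent_set))  # 1/4 = 0.25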
def webpageEntities_old(self,docText=""): disasters=self.entities["Disaster"] sentences = eventUtils.getSentences(docText) #impSentences = getIndicativeSents(sentences, disasters, len(disasters), 0) #impSentences = [] webpageEnts =[] for sent in sentences: sentToks = eventUtils.getTokens(sent) if len(sentToks) > 100: continue intersect = eventUtils.getIntersection(disasters, sentToks) if len(intersect) > self.intersectionTh: #impSentences.append(sent) sentEnts = eventUtils.getEntities(sent)[0] if sentEnts.has_key('LOCATION') or sentEnts.has_key('DATE'): sentEnts['Disaster'] = intersect webpageEnts.append((sent,sentEnts)) #entities = getEntities(impSentences) #webpageEnts = zip(impSentences,entities) return webpageEnts
def buildProbEventModel(docsList):
    docsEntities = []
    docsEntitiesFreq = []
    entitiesProb = {}
    # Convert each doc to token, location, and date lists with their
    # corresponding frequency distributions.
    for doc in docsList:
        t = ''  # reset per document so a page without text/title is skipped
        if 'text' in doc:
            t = doc['text']
        if 'title' in doc:
            t = doc['title'] + " " + t
        if t:
            print 'Reading ' + t[:100]
            ents = eventUtils.getEntities(t)[0]
            docEnt = {}
            docEnt['LOCATION'] = ents.get('LOCATION', {})
            docEnt['DATE'] = ents.get('DATE', {})
            docEnt['Topic'] = eventUtils.getTokens(t)
            docsEntities.append(docEnt)
            docEntFreq = {}
            for k in docEnt:
                docEntFreq[k] = eventUtils.getFreq(docEnt[k])
            docsEntitiesFreq.append(docEntFreq)
    # Collection-level frequency for each entity type (tokens, locations, dates).
    entitiesProb['LOCATION'] = {}
    entitiesProb['DATE'] = {}
    entitiesProb['Topic'] = {}
    for docEntFreq in docsEntitiesFreq:
        for entity in docEntFreq:
            for val in docEntFreq[entity]:
                if val in entitiesProb[entity]:
                    entitiesProb[entity][val] += docEntFreq[entity][val]
                else:
                    entitiesProb[entity][val] = docEntFreq[entity][val]
    # Laplace-smoothed probability of each item:
    # P(item | entity) = (1 + freq) / (vocabulary size + total frequency).
    for ent in entitiesProb:
        allvalsFreq = sum(entitiesProb[ent].values())
        for k in entitiesProb[ent]:
            entitiesProb[ent][k] = (1.0 + entitiesProb[ent][k]) / (len(entitiesProb[ent]) + allvalsFreq)
    return docsEntities, entitiesProb
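# Worked numbers for the smoothing above, P(item) = (1 + freq) / (V + N):
# with frequencies {'quake': 3, 'aid': 1}, V = 2 distinct items and N = 4
# total occurrences, so P('quake') = 4/6 and P('aid') = 2/6.
freqs = {'quake': 3, 'aid': 1}
denom = len(freqs) + sum(freqs.values())
probs = dict((k, (1.0 + v) / denom) for k, v in freqs.items())
print probs  # {'quake': 0.666..., 'aid': 0.333...}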
def buildEventModel_old(self, seedURLs):
    corpus = Collection(seedURLs)
    sortedToksTFDF = corpus.getIndicativeWords()
    print sortedToksTFDF
    sortedImptSents = corpus.getIndicativeSentences(self.topK, self.intersectionTh)
    # Extract event-model instances (entity dictionaries) from the
    # indicative sentences.
    eventModelInstances = eventUtils.getEventModelInsts(sortedImptSents)
    self.entities['LOCATION'] = []
    self.entities['DATE'] = []
    for e in eventModelInstances:
        if 'LOCATION' in e:
            self.entities['LOCATION'].extend(e['LOCATION'])
        elif 'DATE' in e:
            self.entities['DATE'].extend(e['DATE'])
    entitiesFreq = {}
    entitiesFreq['LOCATION'] = eventUtils.getFreq(self.entities['LOCATION'])
    entitiesFreq['LOCATION'] = eventUtils.getSorted(entitiesFreq['LOCATION'].items(), 1)
    entitiesFreq['DATE'] = eventUtils.getFreq(self.entities['DATE'])
    entitiesFreq['DATE'] = eventUtils.getSorted(entitiesFreq['DATE'].items(), 1)
    # Keep only the top locations and dates.
    l = [k for k, _ in entitiesFreq['LOCATION']]
    if self.topK < len(l):
        l = l[:3]
    self.entities['LOCATION'] = set(l)
    d = [k for k, _ in entitiesFreq['DATE']]
    if self.topK < len(d):
        d = d[:3]
    self.entities['DATE'] = set(d)
    self.entities['LOCATION'] = self.getUniqueEntities(self.entities['LOCATION'])
    self.entities['DATE'] = self.getUniqueEntities(self.entities['DATE'])
    # Exclude location/date words from the disaster keyword candidates.
    locDate = list(self.entities['LOCATION']) + list(self.entities['DATE'])
    locDate = eventUtils.getTokens(' '.join(locDate))
    topToks = [k for k, _ in sortedToksTFDF if k not in locDate]
    if self.topK < len(topToks):
        topToks = topToks[:self.topK]
    self.entities['Disaster'] = set(topToks)
    self.allEntities = []
    for k in self.entities:
        self.allEntities.extend(self.entities[k])
    print self.allEntities
def buildEventModel(self, keywordsTh, seedURLs):
    corpus = Collection(seedURLs)
    sortedToksTFDF = corpus.getIndicativeWords()
    self.toksTFDFDic = dict(sortedToksTFDF)
    sortedImptSents = corpus.getIndicativeSentences(keywordsTh, self.intersectionTh)
    # Extract event-model instances from the indicative sentences.
    eventModelInstances = eventUtils.getEventModelInsts(sortedImptSents)
    self.entities['LOCATION'] = []
    self.entities['DATE'] = []
    self.entities['Disaster'] = []
    for e in eventModelInstances:
        if 'LOCATION' in e:
            self.entities['LOCATION'].extend(e['LOCATION'])
        elif 'DATE' in e:
            self.entities['DATE'].extend(e['DATE'])
        self.entities['Disaster'].extend(e['Disaster'])
    entitiesFreq = {}
    entitiesFreq['LOCATION'] = self.getEntitiesFreq(self.entities['LOCATION'])
    entitiesFreq['DATE'] = self.getEntitiesFreq(self.entities['DATE'])
    entitiesFreq['Disaster'] = self.getEntitiesFreq(self.entities['Disaster'])
    # Keep only 4-digit years and month names as date entities.
    months = ['jan', 'feb', 'mar', 'apr', 'aug', 'sept', 'oct', 'nov', 'dec',
              'january', 'february', 'march', 'april', 'may', 'june', 'july',
              'august', 'september', 'october', 'november', 'december']
    filteredDates = []
    for d, v in entitiesFreq['DATE']:
        if d.isdigit() and len(d) == 4:
            filteredDates.append((d, v))
        elif d.lower() in months:
            filteredDates.append((d, v))
    entitiesFreq['DATE'] = filteredDates
    # Keep the top-5 locations and dates.
    llen = 5
    dlen = 5
    s = min(llen, len(entitiesFreq['LOCATION']))
    t = entitiesFreq['LOCATION'][:s]
    print t
    self.entities['LOCATION'] = dict(t)
    s = min(dlen, len(entitiesFreq['DATE']))
    self.entities['DATE'] = dict(entitiesFreq['DATE'][:s])
    print entitiesFreq['DATE'][:s]
    # Exclude location/date words from the disaster keywords.
    locDate = [k for k, _ in entitiesFreq['LOCATION']] + [m for m, _ in entitiesFreq['DATE']]
    locDate = eventUtils.getTokens(' '.join(locDate))
    topToks = [k for k, _ in entitiesFreq['Disaster'] if k not in locDate]
    if self.topK < len(topToks):
        topToks = topToks[:self.topK]
    topToksDic = {}
    for t in topToks:
        topToksDic[t] = self.toksTFDFDic[t]
    self.entities['Disaster'] = topToksDic
    print topToks
    # Precompute the norm of each entity's log-weighted frequency vector,
    # used to normalize the cosine similarity.
    self.scalars = {}
    for k in self.entities:
        ev = [1 + math.log(e) for e in self.entities[k].values()]
        self.scalars[k] = self.getScalar(ev)
def buildEventModel(self, seedURLs):
    corpus = Collection(seedURLs)
    corpus.getIndicativeWords('TF')
    self.toksDic = dict(corpus.indicativeWords)
    sortedImptSents = corpus.getIndicativeSentences(3 * self.topK, self.intersectionTh)
    for s in sortedImptSents[:self.topK]:
        print s
    # Extract event-model instances from the indicative sentences.
    eventModelInstances = eventUtils.getEventModelInsts(sortedImptSents)
    self.entities['LOCATION'] = []
    self.entities['DATE'] = []
    self.entities['Topic'] = []
    for e in eventModelInstances:
        if 'LOCATION' in e:
            self.entities['LOCATION'].extend(e['LOCATION'])
        if 'DATE' in e:
            self.entities['DATE'].extend(e['DATE'])
        self.entities['Topic'].extend(e['Topic'])
    entitiesFreq = {}
    entitiesFreq['LOCATION'] = self.getEntitiesFreq(self.entities['LOCATION'])
    entitiesFreq['DATE'] = self.getEntitiesFreq(self.entities['DATE'])
    entitiesFreq['Topic'] = self.getEntitiesFreq(self.entities['Topic'])
    # Keep only 4-digit years and month names as date entities.
    months = ['jan', 'feb', 'mar', 'apr', 'aug', 'sept', 'oct', 'nov', 'dec',
              'january', 'february', 'march', 'april', 'may', 'june', 'july',
              'august', 'september', 'october', 'november', 'december']
    filteredDates = []
    for d, v in entitiesFreq['DATE']:
        if d.isdigit() and len(d) == 4:
            filteredDates.append((d, v))
        elif d.lower() in months:
            filteredDates.append((d, v))
    entitiesFreq['DATE'] = filteredDates
    # Keep the topK locations and dates.
    llen = self.topK
    dlen = self.topK
    s = min(llen, len(entitiesFreq['LOCATION']))
    t = entitiesFreq['LOCATION'][:s]
    print t
    self.entities['LOCATION'] = dict(t)
    locDate = self.entities['LOCATION'].keys() + [m for m, _ in entitiesFreq['DATE']]
    locDate = eventUtils.getTokens(' '.join(locDate))
    s = min(dlen, len(entitiesFreq['DATE']))
    self.entities['DATE'] = dict(entitiesFreq['DATE'][:s])
    print entitiesFreq['DATE'][:s]
    # Exclude location/date words from the topic keywords.
    topToks = [k for k, _ in entitiesFreq['Topic'] if k not in locDate]
    if self.topK < len(topToks):
        topToks = topToks[:self.topK]
    topToksDic = {}
    for t in topToks:
        topToksDic[t] = self.toksDic[t]
    self.entities['Topic'] = topToksDic
    print topToksDic
    # Derive the entity-type weights, then binarize the per-term weights so
    # every retained term counts equally in the similarity computation.
    self.calculateWeights()
    for k in self.entities:
        self.entities[k] = dict((t, 1) for t in self.entities[k])
    self.scalars = {}
    for k in self.entities:
        ev = [1 + math.log(e) for e in self.entities[k].values()]
        self.scalars[k] = self.getScalar(ev)
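# calculateWeights is defined elsewhere in the original source; a
# commented-out fragment in the weighted calculate_similarity below suggests
# each entity type is weighted by its share of the total frequency mass. A
# hypothetical sketch under that assumption (illustrative only):
def calculate_weights(entities):
    entFreq = dict((k, sum(v.values())) for k, v in entities.items())
    tot = float(sum(entFreq.values()))
    return dict((k, entFreq[k] / tot) for k in entFreq)

print calculate_weights({'Topic': {'quake': 4}, 'LOCATION': {'chile': 2},
                         'DATE': {'2010': 2}})
# {'Topic': 0.5, 'LOCATION': 0.25, 'DATE': 0.25}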
def calculate_similarity(self, doc):
    # Weighted variant: each component score is scaled by its entity-type
    # weight (self.weights), computed in calculateWeights.
    topicDic = self.entities['Topic']
    locToks = eventUtils.getStemmedWords(self.entities['LOCATION'].keys())
    locDic = dict(zip(locToks, self.entities['LOCATION'].values()))
    dToks = eventUtils.getStemmedWords(self.entities['DATE'].keys())
    dDic = dict(zip(dToks, self.entities['DATE'].values()))
    tokens = eventUtils.getTokens(doc)
    tokensDic = eventUtils.getFreq(tokens)
    wv = [1 + math.log(e) for e in tokensDic.values()]
    wvScalar = self.getScalar(wv)
    scores = []
    # Topic similarity.
    ksd = 0
    for i in tokensDic:
        if i in topicDic:
            ksd += (1 + math.log(topicDic[i])) * (1 + math.log(tokensDic[i]))
    if ksd != 0:
        ksd = float(ksd) / (self.scalars['Topic'] * wvScalar)
    scores.append(ksd * self.weights['Topic'])
    # LOCATION similarity.
    ksl = 0
    for i in tokensDic:
        if i in locDic:
            ksl += (1 + math.log(locDic[i])) * (1 + math.log(tokensDic[i]))
    if ksl != 0:
        ksl = float(ksl) / (self.scalars['LOCATION'] * wvScalar)
    scores.append(ksl * self.weights['LOCATION'])
    # DATE similarity.
    ks = 0
    for i in tokensDic:
        if i in dDic:
            ks += (1 + math.log(dDic[i])) * (1 + math.log(tokensDic[i]))
    if ks != 0:
        ks = float(ks) / (self.scalars['DATE'] * wvScalar)
    scores.append(ks * self.weights['DATE'])
    # Weighted sum of the three component similarities.
    return sum(scores)
def buildEventModel(self, keywordsTh, seedURLs):
    corpus = Collection(seedURLs)
    sortedToksTFDF = corpus.getIndicativeWords()
    self.toksTFDFDic = dict(sortedToksTFDF)
    print sortedToksTFDF
    sortedImptSents = corpus.getIndicativeSentences(keywordsTh, self.intersectionTh)
    # Extract event-model instances from the indicative sentences.
    eventModelInstances = eventUtils.getEventModelInsts(sortedImptSents)
    self.entities['LOCATION'] = []
    self.entities['DATE'] = []
    self.entities['Disaster'] = []
    for e in eventModelInstances:
        if 'LOCATION' in e:
            self.entities['LOCATION'].extend(e['LOCATION'])
        elif 'DATE' in e:
            self.entities['DATE'].extend(e['DATE'])
        self.entities['Disaster'].extend(e['Disaster'])
    entitiesFreq = {}
    entitiesFreq['LOCATION'] = self.getEntitiesFreq(self.entities['LOCATION'])
    entitiesFreq['DATE'] = self.getEntitiesFreq(self.entities['DATE'])
    entitiesFreq['Disaster'] = self.getEntitiesFreq(self.entities['Disaster'])
    # Keep only 4-digit years and full month names as date entities.
    months = ['january', 'february', 'march', 'april', 'may', 'june', 'july',
              'august', 'september', 'october', 'november', 'december']
    filteredDates = []
    for d, v in entitiesFreq['DATE']:
        if d.isdigit() and len(d) == 4:
            filteredDates.append((d, v))
        elif d.lower() in months:
            filteredDates.append((d, v))
    entitiesFreq['DATE'] = filteredDates
    # Keep the top-5 locations and dates.
    llen = 5
    dlen = 5
    s = min(llen, len(entitiesFreq['LOCATION']))
    t = entitiesFreq['LOCATION'][:s]
    print t
    self.entities['LOCATION'] = dict(t)
    s = min(dlen, len(entitiesFreq['DATE']))
    self.entities['DATE'] = dict(entitiesFreq['DATE'][:s])
    print entitiesFreq['DATE'][:s]
    # Exclude the top-2 location words and all date words from the disaster
    # keyword candidates.
    locDate = [k for k, _ in entitiesFreq['LOCATION'][:2]] + [m for m, _ in entitiesFreq['DATE']]
    locDate = eventUtils.getTokens(' '.join(locDate))
    topToks = [k for k, _ in entitiesFreq['Disaster'] if k not in locDate]
    if self.topK < len(topToks):
        topToks = topToks[:self.topK]
    topToksDic = {}
    for t in topToks:
        topToksDic[t] = self.toksTFDFDic[t]
    self.entities['Disaster'] = topToksDic
    print topToks
    # Precompute the norm of each entity's log-weighted frequency vector.
    self.scalars = {}
    for k in self.entities:
        ev = [1 + math.log(e) for e in self.entities[k].values()]
        self.scalars[k] = self.getScalar(ev)
def buildProbEventModel(self, urlsList, topK):
    docsList = eventUtils.getWebpageText(urlsList)
    docsEntities = []
    docsEntitiesFreq = []
    entitiesProb = {}
    # Convert each doc to token, location, and date lists with their
    # corresponding frequency distributions.
    for doc in docsList:
        t = ''  # reset per document so a page without text/title is skipped
        if 'text' in doc:
            t = doc['text']
        if 'title' in doc:
            t = doc['title'] + " " + t
        if t:
            ents = eventUtils.getEntities(t)[0]
            docEnt = {}
            docEnt['LOCATION'] = ents.get('LOCATION', {})
            docEnt['DATE'] = ents.get('DATE', {})
            docEnt['Topic'] = eventUtils.getTokens(t)
            docsEntities.append(docEnt)
            docEntFreq = {}
            for k in docEnt:
                docEntFreq[k] = eventUtils.getFreq(docEnt[k])
            docsEntitiesFreq.append(docEntFreq)
    # Collection-level frequency, then Laplace-smoothed probability:
    # P(item | entity) = (1 + freq) / (vocabulary size + total frequency).
    entitiesProb['LOCATION'] = {}
    entitiesProb['DATE'] = {}
    entitiesProb['Topic'] = {}
    for docEntFreq in docsEntitiesFreq:
        for entity in docEntFreq:
            for val in docEntFreq[entity]:
                if val in entitiesProb[entity]:
                    entitiesProb[entity][val] += docEntFreq[entity][val]
                else:
                    entitiesProb[entity][val] = docEntFreq[entity][val]
    for ent in entitiesProb:
        allvalsFreq = sum(entitiesProb[ent].values())
        for k in entitiesProb[ent]:
            entitiesProb[ent][k] = (1.0 + entitiesProb[ent][k]) / (len(entitiesProb[ent]) + allvalsFreq)
    # Keep the top-10 most likely values per entity type.
    mle = self.getMLEEventEntities(entitiesProb, 10)
    for k in mle:
        print k, mle[k]
    self.probEvtModel = {}
    for k in mle:
        self.probEvtModel[k] = dict(mle[k])
    self.eDisDic = self.probEvtModel['Topic']
    # Stem the location and date keys so lookups match stemmed document tokens.
    locToks = eventUtils.getStemmedWords(self.probEvtModel['LOCATION'].keys())
    self.locDic = dict(zip(locToks, self.probEvtModel['LOCATION'].values()))
    dToks = eventUtils.getStemmedWords(self.probEvtModel['DATE'].keys())
    self.dDic = dict(zip(dToks, self.probEvtModel['DATE'].values()))
    return docsEntities, entitiesProb
def buildProbEventModel(self, urlsList, topK):
    docsList = eventUtils.getWebpageText_NoURLs(urlsList)
    docsList = [d for d in docsList if 'text' in d]
    docsEntities = []
    docsEntitiesFreq = []
    entitiesFreq = {}
    # Convert each doc to token, location, and date lists with their
    # corresponding frequency distributions.
    for doc in docsList:
        t = doc['text']
        ents = eventUtils.getEntities(t)[0]
        docEnt = {}
        docEnt['LOCATION'] = ents.get('LOCATION', {})
        docEnt['DATE'] = ents.get('DATE', {})
        docEnt['Topic'] = eventUtils.getTokens(t)
        docsEntities.append(docEnt)
        docEntFreq = {}
        for k in docEnt:
            docEntFreq[k] = eventUtils.getFreq(docEnt[k])
        docsEntitiesFreq.append(docEntFreq)
    # Collection-level frequency for each entity type.
    entitiesFreq['LOCATION'] = defaultdict(float)
    entitiesFreq['DATE'] = defaultdict(float)
    entitiesFreq['Topic'] = defaultdict(float)
    for docEntFreq in docsEntitiesFreq:
        for entity in docEntFreq:
            for val in docEntFreq[entity]:
                entitiesFreq[entity][val] += docEntFreq[entity][val]
    # Laplace-smoothed probabilities; unseen items fall back to 1/denom.
    # The default factories bind the current denom via a default argument,
    # otherwise every lambda would share the final loop value.
    self.defaultProb = {}
    entitiesProb = {}
    for ent in entitiesFreq:
        allvalsFreq = sum(entitiesFreq[ent].values())
        denom = len(entitiesFreq[ent]) + allvalsFreq
        self.defaultProb[ent] = 1.0 / denom
        entitiesProb[ent] = defaultdict(lambda d=denom: 1.0 / d)
        for k in entitiesFreq[ent]:
            # Smooth the raw collection frequency of each observed item.
            entitiesProb[ent][k] = (1.0 + entitiesFreq[ent][k]) / denom
    # Keep the top-10 most likely values per entity type.
    mle = self.getMLEEventEntities(entitiesProb, 10)
    for k in mle:
        print k, mle[k]
    self.probEvtModel = {}
    for k in mle:
        self.probEvtModel[k] = defaultdict(lambda p=self.defaultProb[k]: p)
        for e, v in mle[k]:
            self.probEvtModel[k][e] = v
    # Stem the location and date keys so lookups match stemmed document tokens.
    locToks = eventUtils.getStemmedWords(self.probEvtModel['LOCATION'].keys())
    locDic = defaultdict(lambda p=self.defaultProb['LOCATION']: p)
    for k, v in zip(locToks, self.probEvtModel['LOCATION'].values()):
        locDic[k] = v
    self.probEvtModel['LOCATION'] = locDic
    dToks = eventUtils.getStemmedWords(self.probEvtModel['DATE'].keys())
    dDic = defaultdict(lambda p=self.defaultProb['DATE']: p)
    for k, v in zip(dToks, self.probEvtModel['DATE'].values()):
        dDic[k] = v
    self.probEvtModel['DATE'] = dDic
    return docsEntities, entitiesProb
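# Why the default-argument binding above matters: a plain
# `lambda: 1.0 / denom` closes over the loop variable, so every defaultdict
# would end up sharing the final denom. Binding via a default argument
# freezes the value current at that iteration:
from collections import defaultdict

factories = {}
for denom in [2, 10]:
    factories[denom] = defaultdict(lambda d=denom: 1.0 / d)
print factories[2]['missing'], factories[10]['missing']  # 0.5 0.1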