def calculate_similarity_equalWeights_duplicate(self, doc):
    """Score *doc* against the event model, weighting the three facets equally.

    The document is tokenised and reduced to a term-frequency map.  For each
    of the 'Topic', 'LOCATION' and 'DATE' entries of ``self.entities`` the
    terms shared with the document contribute
    ``(1 + log(model value)) * (1 + log(document frequency))``; a positive
    total is divided by ``self.scalars[facet]`` times the value
    ``self.getScalar`` returns for the document's log-scaled weights.

    Returns 0 when the topic facet score is not positive, otherwise the sum
    of the three facet scores divided by 3.0.
    """
    topic_model = self.entities['Topic']

    # LOCATION and DATE keys are stemmed before they are matched against the
    # document tokens; the values are paired back positionally.
    location_stems = eventUtils.getStemmedWords(self.entities['LOCATION'].keys())
    location_model = dict(zip(location_stems, self.entities['LOCATION'].values()))
    date_stems = eventUtils.getStemmedWords(self.entities['DATE'].keys())
    date_model = dict(zip(date_stems, self.entities['DATE'].values()))

    doc_freq = eventUtils.getFreq(eventUtils.getTokens(doc))
    doc_norm = self.getScalar([1 + math.log(count) for count in doc_freq.values()])

    def facet_score(model, facet):
        # Accumulate the weighted overlap between the document and one facet.
        overlap = 0
        for term in doc_freq:
            if term not in model:
                continue
            overlap += (1 + math.log(model[term])) * (1 + math.log(doc_freq[term]))
        if overlap > 0:
            return float(overlap) / (self.scalars[facet] * doc_norm)
        return 0

    topic_score = facet_score(topic_model, 'Topic')
    if topic_score == 0:
        # Without any topical overlap the document is considered irrelevant.
        return 0

    facet_scores = [topic_score]
    facet_scores.append(facet_score(location_model, 'LOCATION'))
    facet_scores.append(facet_score(date_model, 'DATE'))
    return sum(facet_scores) / 3.0
def calculate_similarity(self, doc):
    """Return the equally weighted similarity between *doc* and the event model.

    Three facets of ``self.entities`` are compared with the document's
    term-frequency map: "Disaster", "LOCATION" and "DATE" (the latter two
    with their keys stemmed first).  Matching terms add
    ``(1 + log(model value)) * (1 + log(document frequency))`` to the facet
    total, and a positive total is divided by ``self.scalars[facet]`` times
    the scalar ``self.getScalar`` yields for the document weights.

    The result is 0 if the "Disaster" facet score is not positive; otherwise
    it is the sum of the three facet scores divided by 3.0.
    """
    disaster_model = self.entities["Disaster"]

    stemmed_locations = eventUtils.getStemmedWords(self.entities["LOCATION"].keys())
    location_model = dict(zip(stemmed_locations, self.entities["LOCATION"].values()))

    stemmed_dates = eventUtils.getStemmedWords(self.entities["DATE"].keys())
    date_model = dict(zip(stemmed_dates, self.entities["DATE"].values()))

    term_counts = eventUtils.getFreq(eventUtils.getTokens(doc))
    doc_weights = [1 + math.log(n) for n in term_counts.values()]
    doc_scalar = self.getScalar(doc_weights)

    def normalised_overlap(model, facet):
        # Weighted overlap of one facet with the document, scaled when positive.
        total = 0
        for term in term_counts:
            if term in model:
                model_weight = 1 + math.log(model[term])
                doc_weight = 1 + math.log(term_counts[term])
                total += model_weight * doc_weight
        if not total > 0:
            return 0
        return float(total) / (self.scalars[facet] * doc_scalar)

    scores = [normalised_overlap(disaster_model, "Disaster")]
    if scores[0] == 0:
        # No usable disaster-term overlap: skip the remaining facets.
        return 0

    for facet, model in (("LOCATION", location_model), ("DATE", date_model)):
        scores.append(normalised_overlap(model, facet))

    return sum(scores) / 3.0
def buildProbEventModel(self, urlsList, topK):
    """Build the probabilistic event model from the pages behind *urlsList*.

    Steps:
      1. Fetch the pages with ``eventUtils.getWebpageText``.
      2. For every page that has a 'text' entry (prefixed with its 'title'
         when present) collect the LOCATION and DATE entities found in the
         first element returned by ``eventUtils.getEntities`` and the tokens
         from ``eventUtils.getTokens`` (stored under 'Topic'), then count
         each list with ``eventUtils.getFreq``.
      3. Sum the counts over all pages and turn them into add-one smoothed
         probabilities: ``(1 + count) / (distinct values + total count)``.
      4. Hand the probabilities to ``self.getMLEEventEntities`` and store
         the result in ``self.probEvtModel``, ``self.eDisDic``,
         ``self.locDic`` and ``self.dDic`` (the last two keyed by stemmed
         words).

    Args:
        urlsList: passed unchanged to ``eventUtils.getWebpageText``.
        topK: accepted for interface compatibility but not used; the
            literal 10 is what is passed to ``getMLEEventEntities``.
            NOTE(review): presumably topK was meant to replace that
            constant - confirm before changing.

    Returns:
        A tuple ``(docsEntities, entitiesProb)``: the per-page entity
        dictionaries and the collection-level smoothed probabilities.
    """
    docsList = eventUtils.getWebpageText(urlsList)
    docsEntities = []
    docsEntitiesFreq = []

    for doc in docsList:
        # The text is looked up afresh for every page.  Previously the
        # variable lived outside the loop, so a page without 'text' silently
        # re-used the text of the page processed before it.
        if 'text' not in doc:
            continue
        text = doc['text']
        if 'title' in doc:
            text = doc['title'] + " " + text
        if not text:
            continue

        ents = eventUtils.getEntities(text)[0]
        docEnt = {
            'LOCATION': ents['LOCATION'] if 'LOCATION' in ents else {},
            'DATE': ents['DATE'] if 'DATE' in ents else {},
            'Topic': eventUtils.getTokens(text),
        }
        docsEntities.append(docEnt)

        docEntFreq = {}
        for k in docEnt:
            docEntFreq[k] = eventUtils.getFreq(docEnt[k])
        docsEntitiesFreq.append(docEntFreq)

    # Collection-level frequency of every value, per entity type.
    entitiesProb = {'LOCATION': {}, 'DATE': {}, 'Topic': {}}
    for docEntFreq in docsEntitiesFreq:
        for entity in docEntFreq:
            totals = entitiesProb[entity]
            for val in docEntFreq[entity]:
                totals[val] = totals.get(val, 0) + docEntFreq[entity][val]

    # Replace the raw counts in place with add-one smoothed probabilities.
    for ent in entitiesProb:
        counts = entitiesProb[ent]
        denom = len(counts) + sum(counts.values())
        for k in counts:
            counts[k] = (1.0 + counts[k]) / denom

    mle = self.getMLEEventEntities(entitiesProb, 10)
    for k in mle:
        # Single-argument form: same output as the Python 2 print statement
        # and also valid under Python 3.
        print("%s %s" % (k, mle[k]))

    self.probEvtModel = {}
    for k in mle:
        self.probEvtModel[k] = dict(mle[k])

    self.eDisDic = self.probEvtModel['Topic']

    locToks = eventUtils.getStemmedWords(list(self.probEvtModel['LOCATION'].keys()))
    self.locDic = dict(zip(locToks, self.probEvtModel['LOCATION'].values()))

    dToks = eventUtils.getStemmedWords(list(self.probEvtModel['DATE'].keys()))
    self.dDic = dict(zip(dToks, self.probEvtModel['DATE'].values()))

    return docsEntities, entitiesProb
def calculate_similarity(self, doc):
    """Return the weighted similarity between *doc* and the event model.

    The document's term-frequency map is compared with the 'Topic',
    'LOCATION' and 'DATE' entries of ``self.entities`` (LOCATION and DATE
    keys are stemmed first).  Shared terms add
    ``(1 + log(model value)) * (1 + log(document frequency))`` to a facet
    total; a non-zero total is divided by ``self.scalars[facet]`` times the
    scalar ``self.getScalar`` returns for the document weights.  Each facet
    score is multiplied by ``self.weights[facet]`` and the three products
    are summed.
    """
    topic_model = self.entities['Topic']

    location_keys = eventUtils.getStemmedWords(self.entities['LOCATION'].keys())
    location_model = dict(zip(location_keys, self.entities['LOCATION'].values()))

    date_keys = eventUtils.getStemmedWords(self.entities['DATE'].keys())
    date_model = dict(zip(date_keys, self.entities['DATE'].values()))

    term_freq = eventUtils.getFreq(eventUtils.getTokens(doc))
    doc_scalar = self.getScalar([1 + math.log(f) for f in term_freq.values()])

    def weighted_facet(model, facet):
        # Overlap of one facet with the document, normalised and weighted.
        total = 0
        for term in term_freq:
            if term not in model:
                continue
            total += (1 + math.log(model[term])) * (1 + math.log(term_freq[term]))
        if total != 0:
            total = float(total) / (self.scalars[facet] * doc_scalar)
        return total * self.weights[facet]

    parts = []
    for facet, model in (('Topic', topic_model),
                         ('LOCATION', location_model),
                         ('DATE', date_model)):
        parts.append(weighted_facet(model, facet))
    return sum(parts)
def buildProbEventModel(self, urlsList, topK):
    """Build the probabilistic event model from the pages behind *urlsList*.

    Steps:
      1. Fetch the pages with ``eventUtils.getWebpageText_NoURLs`` and keep
         only those that have a 'text' entry.
      2. For every page collect the LOCATION and DATE entities found in the
         first element returned by ``eventUtils.getEntities`` and the tokens
         from ``eventUtils.getTokens`` (stored under 'Topic'), then count
         each list with ``eventUtils.getFreq``.
      3. Sum the counts over all pages and turn them into add-one smoothed
         probabilities ``(1 + count) / (distinct values + total count)``;
         unseen values default to ``1 / (distinct values + total count)``,
         which is also stored per entity type in ``self.defaultProb``.
      4. Hand the probabilities to ``self.getMLEEventEntities`` and store
         the result in ``self.probEvtModel``; the LOCATION and DATE models
         are re-keyed by their stemmed words.

    Args:
        urlsList: passed unchanged to ``eventUtils.getWebpageText_NoURLs``.
        topK: accepted for interface compatibility but not used; the
            literal 10 is what is passed to ``getMLEEventEntities``.
            NOTE(review): presumably topK was meant to replace that
            constant - confirm before changing.

    Returns:
        A tuple ``(docsEntities, entitiesProb)``: the per-page entity
        dictionaries and the collection-level smoothed probabilities.

    Raises:
        ZeroDivisionError: if an entity type has no observed value in any
            page (its smoothing denominator is then 0).
    """
    docsList = eventUtils.getWebpageText_NoURLs(urlsList)
    docsList = [d for d in docsList if 'text' in d]

    docsEntities = []
    docsEntitiesFreq = []
    for doc in docsList:
        t = doc['text']
        ents = eventUtils.getEntities(t)[0]
        docEnt = {
            'LOCATION': ents['LOCATION'] if 'LOCATION' in ents else {},
            'DATE': ents['DATE'] if 'DATE' in ents else {},
            'Topic': eventUtils.getTokens(t),
        }
        docsEntities.append(docEnt)

        docEntFreq = {}
        for k in docEnt:
            docEntFreq[k] = eventUtils.getFreq(docEnt[k])
        docsEntitiesFreq.append(docEntFreq)

    # Collection-level frequency of every value, per entity type.
    entitiesFreq = {
        'LOCATION': defaultdict(float),
        'DATE': defaultdict(float),
        'Topic': defaultdict(float),
    }
    for docEntFreq in docsEntitiesFreq:
        for entity in docEntFreq:
            for val in docEntFreq[entity]:
                entitiesFreq[entity][val] += docEntFreq[entity][val]

    self.defaultProb = {}
    entitiesProb = {}
    for ent in entitiesFreq:
        freqs = entitiesFreq[ent]
        denom = len(freqs) + sum(freqs.values())
        default = 1.0 / denom
        self.defaultProb[ent] = default
        # Bind the default as a lambda argument: a plain closure over
        # `denom` would see the value left by the LAST loop iteration.
        probs = defaultdict(lambda default=default: default)
        for k in freqs:
            # Smooth the observed frequency.  Reading the (still empty)
            # probability table here instead would yield the default for
            # every key and make all values equally likely.
            probs[k] = (1.0 + freqs[k]) / denom
        entitiesProb[ent] = probs

    mle = self.getMLEEventEntities(entitiesProb, 10)
    for k in mle:
        # Single-argument form: same output as the Python 2 print statement
        # and also valid under Python 3.
        print("%s %s" % (k, mle[k]))

    self.probEvtModel = {}
    for k in mle:
        # `ent=k` freezes the entity type for this model; the loop variable
        # is rebound later and must not leak into the default factory.
        model = defaultdict(lambda ent=k: self.defaultProb[ent])
        for e, v in mle[k]:
            model[e] = v
        self.probEvtModel[k] = model

    # Re-key LOCATION and DATE by stemmed words, each with its OWN stems
    # (the DATE model used to be zipped with the location stems).
    for ent in ('LOCATION', 'DATE'):
        rawModel = self.probEvtModel[ent]
        stems = eventUtils.getStemmedWords(list(rawModel.keys()))
        stemmedModel = defaultdict(lambda ent=ent: self.defaultProb[ent])
        for stem, prob in zip(stems, rawModel.values()):
            stemmedModel[stem] = prob
        self.probEvtModel[ent] = stemmedModel

    return docsEntities, entitiesProb