예제 #1
0
    def getIndicativeWords(self, t):
        """Return the collection's words ranked under the chosen weighting scheme.

        The ranking is computed on the first call and cached on
        ``self.indicativeWords``; once cached, later calls return the cached
        value regardless of ``t``.

        Args:
            t: Name of the weighting scheme: 'TFIDF', 'TFDF' or 'TF'.

        Returns:
            The result of ``utils.getSorted`` applied to the (word, weight)
            items of the selected weighting, sorted on index 1.

        Raises:
            ValueError: If ``t`` is not one of the supported scheme names.
        """
        if self.indicativeWords:
            return self.indicativeWords

        if t == 'TFIDF':
            toks = self.getWordsTFIDF()
        elif t == 'TFDF':
            toks = self.getWordsTFDF()
        elif t == 'TF':
            toks = self.getWordsTF()
        else:
            # An unknown scheme used to fall through and crash with an
            # UnboundLocalError on ``toks``; fail with a clear message instead.
            raise ValueError(
                "Unknown weighting scheme %r (expected 'TFIDF', 'TFDF' or 'TF')" % (t,))
        self.indicativeWords = utils.getSorted(toks.items(), 1)
        return self.indicativeWords
예제 #2
0
    def getIndicativeWords(self,t):
        """Return the collection's words ranked under the chosen weighting scheme.

        The ranking is computed on the first call and cached on
        ``self.indicativeWords``; once cached, later calls return the cached
        value regardless of ``t``.

        Args:
            t: Name of the weighting scheme: 'TFIDF', 'TFDF' or 'TF'.

        Returns:
            The result of ``utils.getSorted`` applied to the (word, weight)
            items of the selected weighting, sorted on index 1.

        Raises:
            ValueError: If ``t`` is not one of the supported scheme names.
        """
        if self.indicativeWords:
            return self.indicativeWords

        if t == 'TFIDF':
            toks = self.getWordsTFIDF()
        elif t == 'TFDF':
            toks = self.getWordsTFDF()
        elif t == 'TF':
            toks = self.getWordsTF()
        else:
            # An unknown scheme used to fall through and crash with an
            # UnboundLocalError on ``toks``; fail with a clear message instead.
            raise ValueError(
                "Unknown weighting scheme %r (expected 'TFIDF', 'TFDF' or 'TF')" % (t,))
        self.indicativeWords = utils.getSorted(toks.items(),1)
        return self.indicativeWords
예제 #3
0
 def getIndicativeSentences(self,topK,intersectionTh):
     """Return sentences sharing more than ``intersectionTh`` tokens with the top words.

     The top words are the first ``topK`` entries of ``self.indicativeWords``,
     which is read as-is (it is not computed here). Every document's
     sentences are appended to ``self.sentences``; sentences with more than
     100 tokens are ignored. A non-empty result is cached on
     ``self.indicativeSentences`` and returned directly on later calls.

     Returns:
         ``utils.getSorted`` applied to (sentence, overlap size) items,
         sorted on index 1.
     """
     if len(self.indicativeSentences) > 0:
         return self.indicativeSentences

     keywords = [word for word,_ in self.indicativeWords[:topK]]

     for doc in self.documents:
         self.sentences.extend(doc.getSentences())

     overlapBySentence = {}
     for sentence in self.sentences:
         # Each distinct sentence is scored at most once.
         if sentence in overlapBySentence:
             continue
         tokens = utils.getTokens(sentence)
         if len(tokens) > 100:
             continue
         overlap = len(utils.getIntersection(keywords, tokens))
         if overlap > intersectionTh:
             overlapBySentence[sentence] = overlap

     self.indicativeSentences = utils.getSorted(overlapBySentence.items(),1)
     return self.indicativeSentences
예제 #4
0
    def getIndicativeSentences(self, topK, intersectionTh):
        """Return sentences sharing more than ``intersectionTh`` tokens with the top words.

        The top words are the first ``topK`` entries of
        ``self.indicativeWords``, which is read as-is (it is not computed
        here). Every document's sentences are appended to ``self.sentences``;
        sentences with more than 100 tokens are ignored. A non-empty result is
        cached on ``self.indicativeSentences`` and returned directly on later
        calls.

        Returns:
            ``utils.getSorted`` applied to (sentence, overlap size) items,
            sorted on index 1.
        """
        if len(self.indicativeSentences) > 0:
            return self.indicativeSentences

        keywords = [word for word, _ in self.indicativeWords[:topK]]

        for doc in self.documents:
            self.sentences.extend(doc.getSentences())

        overlapBySentence = {}
        for sentence in self.sentences:
            # Each distinct sentence is scored at most once.
            if sentence in overlapBySentence:
                continue
            tokens = utils.getTokens(sentence)
            if len(tokens) > 100:
                continue
            overlap = len(utils.getIntersection(keywords, tokens))
            if overlap > intersectionTh:
                overlapBySentence[sentence] = overlap

        self.indicativeSentences = utils.getSorted(overlapBySentence.items(), 1)
        return self.indicativeSentences
예제 #5
0
 def getWordsFrequencies(self):
     """Count word occurrences over all documents and return them sorted.

     Each call appends every document's words to ``self.words`` before
     counting, stores the sorted result on ``self.wordsFrequencies`` and
     returns it.
     """
     for doc in self.documents:
         self.words.extend(doc.getWords())
     counts = utils.getFreq(self.words)
     # Sort the (word, count) items on the count (index 1).
     self.wordsFrequencies = utils.getSorted(counts.items(), 1)
     return self.wordsFrequencies
예제 #6
0
 def getWordsFrequencies(self):
     """Count word occurrences over all documents and return them sorted.

     Each call appends every document's words to ``self.words`` before
     counting, stores the sorted result on ``self.wordsFrequencies`` and
     returns it.
     """
     for doc in self.documents:
         self.words.extend(doc.getWords())
     counts = utils.getFreq(self.words)
     # Sort the (word, count) items on the count (index 1).
     self.wordsFrequencies = utils.getSorted(counts.items(),1)
     return self.wordsFrequencies
예제 #7
0
def saveSourcesFreqDic(sourcesFreqDic,filename):
    """Write one ``source,count,total`` line per source to ``filename``.

    Args:
        sourcesFreqDic: Mapping from a source name (string) to a sequence of
            numbers; ``count`` is its length and ``total`` its sum.
        filename: Path of the text file to (over)write.

    Rows are ordered by ``eventUtils.getSorted`` on the count column.
    """
    rows = [(source, len(values), sum(values))
            for source, values in sourcesFreqDic.items()]
    ranked = eventUtils.getSorted(rows, 1)
    # ``with`` guarantees the file is closed even if a write fails.
    with open(filename,'w') as out:
        for source, count, total in ranked:
            out.write(source +"," + str(count)+"," + str(total)+"\n")
예제 #8
0
def getMLEEventEntities(probEventModel, topK):
    """Rank the entries of every distribution in ``probEventModel``.

    Args:
        probEventModel: Mapping from a key to a dict of entity -> value.
        topK: Number of ranked entries to keep per key; a falsy value
            (``0`` or ``None``) keeps them all.

    Returns:
        Dict mapping each key to its items as ordered by
        ``eventUtils.getSorted`` on index 1, truncated to ``topK``.
    """
    ranked = {}
    for part in probEventModel:
        ordered = eventUtils.getSorted(probEventModel[part].items(), 1)
        ranked[part] = ordered[:topK] if topK else ordered
    return ranked
예제 #9
0
def getMLEEventEntities(self,pem,topK):
    """Rank the entries of every distribution in ``pem``.

    Args:
        pem: Mapping from a key to a dict of entity -> value.
        topK: Number of ranked entries to keep per key; a falsy value
            (``0`` or ``None``) keeps them all.

    Returns:
        Dict mapping each key to its items as ordered by
        ``eventUtils.getSorted`` on index 1, truncated to ``topK``.
    """
    ranked = {}
    for part in pem:
        ordered = eventUtils.getSorted(pem[part].items(), 1)
        ranked[part] = ordered[:topK] if topK else ordered
    return ranked
예제 #10
0
def getMLEEventEntities(probEventModel,topK):
    """Rank the entries of every distribution in ``probEventModel``.

    Args:
        probEventModel: Mapping from a key to a dict of entity -> value.
        topK: Number of ranked entries to keep per key; a falsy value
            (``0`` or ``None``) keeps them all.

    Returns:
        Dict mapping each key to its items as ordered by
        ``eventUtils.getSorted`` on index 1, truncated to ``topK``.
    """
    ranked = {}
    for part in probEventModel:
        ordered = eventUtils.getSorted(probEventModel[part].items(), 1)
        ranked[part] = ordered[:topK] if topK else ordered
    return ranked
예제 #11
0
 def getMLEEventEntities(self,pem,topK):
     """Rank the entries of every distribution in ``pem``.

     Args:
         pem: Mapping from a key to a dict of entity -> value.
         topK: Number of ranked entries to keep per key; a falsy value
             (``0`` or ``None``) keeps them all.

     Returns:
         Dict mapping each key to its items as ordered by
         ``eventUtils.getSorted`` on index 1, truncated to ``topK``.
     """
     ranked = {}
     for part in pem:
         ordered = eventUtils.getSorted(pem[part].items(), 1)
         ranked[part] = ordered[:topK] if topK else ordered
     return ranked
예제 #12
0
 def buildVSMClassifier(self,posFile,vsmClassifierFileName,th,topK):
     """Load a pickled VSM classifier, or build one from ``posFile`` and pickle it.

     Args:
         posFile: Text file with one URL per line; each line is turned into
             a ``Document``.
         vsmClassifierFileName: Path of the pickle used as a cache.
         th: Threshold handed to ``VSMClassifier``.
         topK: Number of highest-ranked vocabulary entries to keep.

     The result is stored on ``self.classifier``; nothing is returned.
     """
     try:
         # NOTE(security): pickle.load runs code embedded in the file; only
         # point this at classifier files produced by this program.
         with open(vsmClassifierFileName,"rb") as classifierFile:
             self.classifier = pickle.load(classifierFile)
     except Exception:
         # Any failure to read the cache (missing or unreadable pickle) means
         # "rebuild". ``Exception`` rather than a bare ``except`` so that
         # Ctrl-C / SystemExit are no longer swallowed.
         docs = []
         with open(posFile,'r') as f:
             for url in f:
                 d = Document(url.strip())
                 if d and d.text:
                     docs.append(d)

         # Term frequency of every word summed over all documents.
         vocabTFDic = {}
         for d in docs:
             wordsFreq = getFreq(d.getWords())
             for w in wordsFreq:
                 vocabTFDic[w] = vocabTFDic.get(w, 0) + wordsFreq[w]

         vocabSorted = getSorted(vocabTFDic.items(), 1)
         topVocabDic = dict(vocabSorted[:topK])

         # Per-document vectors are not built at the moment; the classifier
         # receives an empty list for them.
         ndocsTF = []

         self.classifier = VSMClassifier(topVocabDic,ndocsTF,th)
         with open(vsmClassifierFileName,"wb") as classifierFile:
             pickle.dump(self.classifier,classifierFile)
예제 #13
0
 def getEntitiesFreq(self,entityList):
     """Count the lower-cased words of the given entities and return them sorted.

     An entity made of several whitespace-separated words contributes each
     word separately; any other entity contributes its whole lower-cased
     string unchanged.

     Returns:
         ``eventUtils.getSorted`` applied to the (word, count) items, sorted
         on index 1.
     """
     words = []
     for entity in entityList:
         lowered = entity.lower()
         parts = lowered.split()
         # Only split when there really are several words; otherwise keep
         # the string exactly as it is.
         words.extend(parts if len(parts) > 1 else [lowered])
     counts = eventUtils.getFreq(words)
     return eventUtils.getSorted(counts.items(), 1)
예제 #14
0
    def getEntitiesFreq(self, entityList):
        """Count the lower-cased words of the given entities and return them sorted.

        An entity made of several whitespace-separated words contributes each
        word separately; any other entity contributes its whole lower-cased
        string unchanged.

        Returns:
            ``eventUtils.getSorted`` applied to the (word, count) items,
            sorted on index 1.
        """
        words = []
        for entity in entityList:
            lowered = entity.lower()
            parts = lowered.split()
            # Only split when there really are several words; otherwise keep
            # the string exactly as it is.
            words.extend(parts if len(parts) > 1 else [lowered])
        counts = eventUtils.getFreq(words)
        return eventUtils.getSorted(counts.items(), 1)
예제 #15
0
 def buildVSMClassifier_OneTargetTopicVector(self,posFile,vsmClassifierFileName,th,topK):
     """Load a pickled VSM classifier, or build a single topic vector and pickle it.

     Args:
         posFile: Text file with one URL per line; each line is turned into
             a ``Document``.
         vsmClassifierFileName: Path of the pickle used as a cache.
         th: Threshold handed to ``VSMClassifier``.
         topK: Number of highest-ranked vocabulary entries to keep.

     Each word is weighted by the sum over documents of
     ``1 + log(term frequency)`` (the idf factor is fixed at 1.0). The result
     is stored on ``self.classifier``; nothing is returned.
     """
     try:
         # NOTE(security): pickle.load runs code embedded in the file; only
         # point this at classifier files produced by this program.
         with open(vsmClassifierFileName,"rb") as classifierFile:
             self.classifier = pickle.load(classifierFile)
     except Exception:
         # Any failure to read the cache (missing or unreadable pickle) means
         # "rebuild". ``Exception`` rather than a bare ``except`` so that
         # Ctrl-C / SystemExit are no longer swallowed.
         docs = []
         with open(posFile,'r') as f:
             for url in f:
                 d = Document(url.strip())
                 if d and d.text:
                     docs.append(d)

         # For every word, the list of its per-document term frequencies.
         vocabTFDic = {}
         for d in docs:
             wordsFreq = getFreq(d.getWords())
             for w in wordsFreq:
                 vocabTFDic.setdefault(w, []).append(wordsFreq[w])

         idf = 1.0
         vocTF_IDF = [(w,sum([1+math.log(vtf) for vtf in vocabTFDic[w]])*idf) for w in vocabTFDic]

         vocabSorted = getSorted(vocTF_IDF, 1)
         print(vocabSorted[:topK])
         topVocabDic = dict(vocabSorted[:topK])

         self.classifier = VSMClassifier(topVocabDic,th)
         with open(vsmClassifierFileName,"wb") as classifierFile:
             pickle.dump(self.classifier,classifierFile)
예제 #16
0
 def selectImportantWords_tf(self,k):
     """Score every entry of ``self.index`` and return at most ``k`` of them.

     An entry's score is ``1 + log(sum of its values)`` (the idf factor is
     fixed at 1). Entries are identified by their position in the iteration
     order of ``self.index.itervalues()``.

     Returns:
         The first ``k`` (score, position) pairs of ``getSorted(pairs, 0)``,
         or the whole sorted sequence when it has no more than ``k``
         entries. The full sorted sequence is also kept on
         ``self.words_tfidf_sorted``.
     """
     idf = 1
     scored = [(idf * (1+ math.log(sum(values))), position)
               for position, values in enumerate(self.index.itervalues())]
     self.words_tfidf_sorted = getSorted(scored, 0)
     if len(self.words_tfidf_sorted) > k:
         return self.words_tfidf_sorted[:k]
     return self.words_tfidf_sorted
예제 #17
0
    def selectImportantWords_tf(self, k):
        """Score every entry of ``self.index`` and return at most ``k`` of them.

        An entry's score is the sum of ``1 + log(value)`` over its values
        (the idf factor is fixed at 1). Entries are identified by their
        position in the iteration order of ``self.index.itervalues()``.

        Returns:
            The first ``k`` (score, position) pairs of
            ``getSorted(pairs, 0)``, or the whole sorted sequence when it has
            no more than ``k`` entries. The full sorted sequence is also kept
            on ``self.words_tfidf_sorted``.
        """
        idf = 1
        scored = []
        for position, values in enumerate(self.index.itervalues()):
            weight = idf * sum([1 + math.log(freq) for freq in values])
            scored.append((weight, position))
        self.words_tfidf_sorted = getSorted(scored, 0)
        if len(self.words_tfidf_sorted) > k:
            return self.words_tfidf_sorted[:k]
        return self.words_tfidf_sorted
예제 #18
0
def extractDatesLocs(urls):
    webpagesTxt = eventUtils.getWebpageText_NoURLs(urls)
    txts = [
        webpageTxt['text'] for webpageTxt in webpagesTxt
        if 'text' in webpageTxt
    ]
    webpageEnts = eventUtils.getEntities(txts)
    #webpageEnts = eventUtils.getEntities(webpageTxt[0]['text'])
    #print webpageEnts[0]['LOCATION']
    #print webpageEnts[0]['DATE']

    locs = []
    dates = []

    for wbE in webpageEnts:
        #print wbE['LOCATION']
        #print wbE['DATE']
        #print '-----------------------'
        if 'LOCATION' in wbE:
            locs.extend(wbE['LOCATION'])
        if 'DATE' in wbE:
            dates.extend(wbE['DATE'])

    freqLocs = eventUtils.getFreq(locs)
    freqDates = eventUtils.getFreq(dates)
    '''
    freqDates_norm = normalizeDates(freqDates)
    sortedDates = eventUtils.getSorted(freqDates_norm.iteritems(),1)
    print sortedDates
    print "Most Frequent Date (i.e. most probably event's date) is: ", sortedDates[0]
    print '________________________________'
    #print freqDates_norm
    '''
    freqLocs_norm = normalizeLocs(freqLocs)
    sortedLocs = eventUtils.getSorted(freqLocs_norm.iteritems(), 1)
    print sortedLocs
    print "Most Frequent Location (i.e. most probably event's location) is: ", sortedLocs[
        0]
    #print freqLocs_norm
    return
예제 #19
0
def extractDatesLocs(urls):
    webpagesTxt = eventUtils.getWebpageText_NoURLs(urls)
    txts = [webpageTxt['text'] for webpageTxt in webpagesTxt if 'text' in webpageTxt]
    webpageEnts = eventUtils.getEntities(txts)
    #webpageEnts = eventUtils.getEntities(webpageTxt[0]['text'])
    #print webpageEnts[0]['LOCATION']
    #print webpageEnts[0]['DATE']
    
    locs = []
    dates = []
    
    for wbE in webpageEnts:
        #print wbE['LOCATION']
        #print wbE['DATE']
        #print '-----------------------'
        if 'LOCATION' in wbE:
            locs.extend(wbE['LOCATION'])
        if 'DATE' in wbE:
            dates.extend(wbE['DATE'])
    
    freqLocs = eventUtils.getFreq(locs)
    freqDates = eventUtils.getFreq(dates)
   
    '''
    freqDates_norm = normalizeDates(freqDates)
    sortedDates = eventUtils.getSorted(freqDates_norm.iteritems(),1)
    print sortedDates
    print "Most Frequent Date (i.e. most probably event's date) is: ", sortedDates[0]
    print '________________________________'
    #print freqDates_norm
    '''
    freqLocs_norm = normalizeLocs(freqLocs)
    sortedLocs = eventUtils.getSorted(freqLocs_norm.iteritems(),1)
    print sortedLocs
    print "Most Frequent Location (i.e. most probably event's location) is: ", sortedLocs[0]
    #print freqLocs_norm
    return
예제 #20
0
 def getCollVec(self,numWords=10):
     """Build ``self.collVec`` from the ``numWords`` top-weighted index words.

     A word's weight is ``collFreq * log(N / docFreq)``, where ``N`` is
     ``len(self.docsVecs)`` and both frequencies come from the word's entry
     in ``self.index``. Words are ordered with ``eventUtils.getSorted`` on
     the weight. Nothing is returned.
     """
     docCount = len(self.docsVecs)
     weighted = []
     for word in self.index:
         entry = self.index[word]
         weight = entry['collFreq'] * math.log(docCount*1.0/entry['docFreq'])
         weighted.append((word, weight))
     ranked = eventUtils.getSorted(weighted, 1)
     self.collVec = dict(ranked[:numWords])
예제 #21
0
 def buildEventModel_old(self,seedURLs):
     """Build the LOCATION / DATE / 'Disaster' entity sets from the seed pages.

     Fills ``self.entities`` and ``self.allEntities`` and prints the ranked
     words and the final entity list. Returns nothing.

     Args:
         seedURLs: Passed to ``Collection`` to build the corpus.
     """
     corpus = Collection(seedURLs)
     # NOTE(review): called without a weighting-scheme argument -- confirm the
     # Collection in use accepts that.
     sortedToksTFDF = corpus.getIndicativeWords()
     print(sortedToksTFDF)
     sortedImptSents = corpus.getIndicativeSentences(self.topK,self.intersectionTh)
     eventModelInstances = eventUtils.getEventModelInsts(sortedImptSents)

     self.entities['LOCATION']= []
     self.entities['DATE'] = []
     for e in eventModelInstances:
         # Two independent checks: with the former ``elif`` an instance that
         # carried both keys contributed its locations but lost its dates.
         if 'LOCATION' in e:
             self.entities['LOCATION'].extend(e['LOCATION'])
         if 'DATE' in e:
             self.entities['DATE'].extend(e['DATE'])

     entitiesFreq = {}
     for kind in ('LOCATION', 'DATE'):
         freqs = eventUtils.getFreq(self.entities[kind])
         entitiesFreq[kind] = eventUtils.getSorted(freqs.items(), 1)

     for kind in ('LOCATION', 'DATE'):
         names = [name for name,_ in entitiesFreq[kind]]
         # NOTE(review): when there are more than topK candidates the list is
         # cut to a hard-coded 3, not to topK -- confirm this is intended.
         if self.topK < len(names):
             names = names[:3]
         self.entities[kind] = set(names)

     for kind in ('LOCATION', 'DATE'):
         self.entities[kind] = self.getUniqueEntities(self.entities[kind])

     locDate = list(self.entities['LOCATION']) + list(self.entities['DATE'])
     locDate = eventUtils.getTokens(' '.join(locDate))

     # Topic words are the ranked words that are not location/date tokens.
     topToks = [tok for tok,_ in sortedToksTFDF if tok not in locDate]
     if self.topK < len(topToks):
         topToks =  topToks[:self.topK]
     self.entities['Disaster'] = set(topToks)

     self.allEntities = []
     for k in self.entities:
         self.allEntities.extend(self.entities[k])

     print(self.allEntities)
예제 #22
0
 def buildEventModel_wholeCollection(self,seedURLs):
     """Build the event model from entities found in every seed document.

     Fills ``self.toksDic``, ``self.entities`` (keys 'LOCATION', 'DATE' and
     'Topic', each a dict of term -> frequency) and ``self.scalars`` (one
     value per key of ``self.entities``, from ``self.getScalar``). Prints the
     selected locations, dates and topic words. Returns nothing.

     Args:
         seedURLs: Passed to ``Collection`` to build the corpus.
     """
     corpus = Collection(seedURLs)

     self.toksDic= corpus.getIndicativeWords('TF')
     docsTexts = [d.text for d in corpus.documents]
     eventModelInstances = eventUtils.getEntities(docsTexts)

     self.entities['LOCATION']= []
     self.entities['DATE'] = []
     self.entities['Topic']=[]

     for e in eventModelInstances:
         if 'LOCATION' in e:
             self.entities['LOCATION'].extend( e['LOCATION'])
         if 'DATE' in e:
             self.entities['DATE'].extend( e['DATE'])

     entitiesFreq = {}
     entitiesFreq['LOCATION'] = self.getEntitiesFreq(self.entities['LOCATION'])
     entitiesFreq['DATE'] = self.getEntitiesFreq(self.entities['DATE'])
     entitiesFreq['Topic'] = eventUtils.getSorted(self.toksDic.items(), 1)

     # Keep only date words that are a 4-digit number or a month name. The
     # list now also holds 'jun', 'jul' and 'sep', which used to be missing
     # and were therefore filtered out.
     months = ('jan','feb','mar','apr','may','jun','jul','aug','sep','sept',
               'oct','nov','dec','january','february','march','april','june',
               'july','august','september','october','november','december')
     entitiesFreq['DATE'] = [(d,v) for d,v in entitiesFreq['DATE']
                             if (d.isdigit() and len(d) == 4) or d.lower() in months]

     # At most topK locations and topK dates.
     topLocs = entitiesFreq['LOCATION'][:self.topK]
     print(topLocs)
     self.entities['LOCATION'] = dict(topLocs)

     topDates = entitiesFreq['DATE'][:self.topK]
     self.entities['DATE'] = dict(topDates)
     print(topDates)

     locDate = list(self.entities['LOCATION']) + list(self.entities['DATE'])
     locDate = eventUtils.getTokens(' '.join(locDate))

     # Topic words are the ranked words that are not location/date tokens.
     topToks = [tok for tok,_ in entitiesFreq['Topic'] if tok not in locDate]
     if self.topK < len(topToks):
         topToks =  topToks[:self.topK]

     topToksDic = {}
     for tok in topToks:
         topToksDic[tok] = self.toksDic[tok]
     self.entities['Topic'] = topToksDic

     print(topToksDic)

     self.scalars = {}
     for k in self.entities:
         # Log-scaled frequencies of this entity type.
         ev = [1+math.log(e) for e in self.entities[k].values()]
         self.scalars[k] = self.getScalar(ev)
예제 #23
0
    def buildEventModel_old(self, seedURLs):
        """Build the LOCATION / DATE / 'Disaster' entity sets from the seed pages.

        Fills ``self.entities`` and ``self.allEntities`` and prints the ranked
        words and the final entity list. Returns nothing.

        Args:
            seedURLs: Passed to ``Collection`` to build the corpus.
        """
        corpus = Collection(seedURLs)
        # NOTE(review): called without a weighting-scheme argument -- confirm
        # the Collection in use accepts that.
        sortedToksTFDF = corpus.getIndicativeWords()
        print(sortedToksTFDF)
        sortedImptSents = corpus.getIndicativeSentences(
            self.topK, self.intersectionTh)
        eventModelInstances = eventUtils.getEventModelInsts(sortedImptSents)

        self.entities['LOCATION'] = []
        self.entities['DATE'] = []
        for e in eventModelInstances:
            # Two independent checks: with the former ``elif`` an instance
            # that carried both keys contributed its locations but lost its
            # dates.
            if 'LOCATION' in e:
                self.entities['LOCATION'].extend(e['LOCATION'])
            if 'DATE' in e:
                self.entities['DATE'].extend(e['DATE'])

        entitiesFreq = {}
        for kind in ('LOCATION', 'DATE'):
            freqs = eventUtils.getFreq(self.entities[kind])
            entitiesFreq[kind] = eventUtils.getSorted(freqs.items(), 1)

        for kind in ('LOCATION', 'DATE'):
            names = [name for name, _ in entitiesFreq[kind]]
            # NOTE(review): when there are more than topK candidates the list
            # is cut to a hard-coded 3, not to topK -- confirm this is
            # intended.
            if self.topK < len(names):
                names = names[:3]
            self.entities[kind] = set(names)

        for kind in ('LOCATION', 'DATE'):
            self.entities[kind] = self.getUniqueEntities(self.entities[kind])

        locDate = list(self.entities['LOCATION']) + list(self.entities['DATE'])
        locDate = eventUtils.getTokens(' '.join(locDate))

        # Topic words are the ranked words that are not location/date tokens.
        topToks = [tok for tok, _ in sortedToksTFDF if tok not in locDate]
        if self.topK < len(topToks):
            topToks = topToks[:self.topK]
        self.entities['Disaster'] = set(topToks)

        self.allEntities = []
        for k in self.entities:
            self.allEntities.extend(self.entities[k])

        print(self.allEntities)