    def __init__(self, word):
        self.word = word
        self.freq_word = 0          # frequency of the word in the document
        self.freq_together = 0      # frequency of the word occurring together with the keyword
        self.doc_freq_obj = doc_freq_class.context()
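    # Note: freq_word and freq_together start at 0 here; elsewhere in this
    # listing they are filled in via find_doc_freq(), which appears to use the
    # shared doc_freq_class.context object (its implementation is not shown).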
    def get_words_from_proximity(self, keywords, text):  # TODO: think about how to get nouns from sentences only
        # create an object of doc_frequency
        doc_freq_obj = doc_freq_class.context()

        tokens = nltk.word_tokenize(text)
        #print "tokens:"
        #print tokens
        # keep only alphanumeric tokens; filtering into a new list avoids
        # skipping elements when removing from a list while iterating over it
        tokens = [t for t in tokens if t.isalnum()]
        c = nltk.ConcordanceIndex(tokens, key = lambda s: s.lower())
      
        tokens_pos = nltk.pos_tag(tokens)  
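        # c.offsets(word) below returns every position of `word` in the token
        # list (ConcordanceIndex lower-cases tokens via the key function), and
        # tokens_pos holds (token, POS-tag) pairs; the NN/NNP tags are used
        # later to keep only noun neighbours.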
        i = 5
        doc_freq = []
        df_cnt = 0

        print "keywords going to loop",
        print keywords
        for kw in keywords:
            print "keyword::::::::",
            print kw
            # splitting the keyword is not required here, since kw is already a list of strings
            #k = nltk.word_tokenize(kw)
            #print k
            
            #print "keywords in for ", 
            #print kw
            first_word = kw[0]        #1st word in keyword
            #print "first word"
            #print first_word
            keyword_len = len(kw)
            #print "LEN="+str(keyword_len)
            i = 5
            nomatch = 0
            #print "IN KWD LOOP."
            print "offset",
            print c.offsets(first_word)
            
            for offset in c.offsets(first_word):
                print kw
                j = 1
                i = 5
                #print "Keyword=",
                #print kw,
                #print " OFFSET=" + str(offset) 
                nomatch = 0
                while j < keyword_len:
                    #print "in while"
                    #print tokens[offset+j]
                    #print kw[j]
                    
                    if tokens[offset+j].lower() != kw[j].lower():
                        #print tokens[offset+j]
                        #print kw[j]
                        nomatch = 1
                        break
                    j = j + 1
                if nomatch == 0:
                    doc_freq.append(document_frequency(kw))
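                    # the record appended above is indexed by df_cnt below; a new
                    # document_frequency record is created for every matched occurrence
                    # of the keyword (Example #3 instead creates one per keyword)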
                    #print "matched kwd",
                    #print tokens[offset:offset+j-1]
                    #print tokens[offset-5:offset+5]
                    i = 5
                    while i > 0:
                        if (offset-i) < 0:
                            break

                        # keep the neighbour only if it is a noun and the word itself
                        # (not its POS tag) is not an English stopword
                        if (tokens_pos[offset-i][1] in ["NN","NNP"]) and (tokens_pos[offset-i][0].lower() not in nltk.corpus.stopwords.words('english')):
                            #doc_freq_obj.get_together_DF("")
                            #print "dfcnt:" + str(df_cnt)
                            #print "i: " + str(i)
                            doc_freq[df_cnt].addneighbour(tokens_pos[offset-i][0])
                            print tokens_pos[offset-i][0],
                        i = i - 1
                
                
                    print "\m/ ",
                    print kw,
                    print "\m/ ",
                    i = 1
           
                    while i < 5 :
                        if (offset+i+(keyword_len-1)) >= len(tokens):
                            break
    	
                        # same noun / stopword check for words after the keyword
                        if (tokens_pos[offset+i+(keyword_len-1)][1] in ["NN","NNP"]) and (tokens_pos[offset+i+(keyword_len-1)][0].lower() not in nltk.corpus.stopwords.words('english')):
                            doc_freq[df_cnt].addneighbour(tokens[offset+i+(keyword_len-1)])
                            print tokens_pos[offset+i+(keyword_len-1)][0],

                        i = i + 1
                    k = 0
                    print "\n\n"    
                    while k < doc_freq[df_cnt].cnt:
                        #doc_freq[df_cnt].neighbours[k].freq_word = fd1[context_vectors[CV_cnt].keyword]
                        doc_freq[df_cnt].neighbours[k].find_doc_freq(doc_freq[df_cnt].keyword)
                        k = k + 1

                    doc_freq[df_cnt].neighbours.sort(key=lambda x: x.freq_together, reverse=True)
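                    # neighbours are ranked by freq_together, i.e. how often the
                    # neighbour word co-occurs with the keyword across documents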
                    if doc_freq[df_cnt].cnt > 5:
                        doc_freq[df_cnt].neighbours = doc_freq[df_cnt].neighbours[:5]        # keep the 5 neighbours with the highest weight
                        doc_freq[df_cnt].cnt = 5
                    k = 0
                    #while k < doc_freq[df_cnt].cnt:
                    print "keyword: ",
                    for l in doc_freq[df_cnt].keyword: 
                        print  l,
                    print "\n"
                    print  "neighbours: ",
                    for m in doc_freq[df_cnt].neighbours:
                        print m.word,
                        print "\n"
                        #k += 1
                    
                    
                    df_cnt = df_cnt + 1
        results = search_web(doc_freq)
        return results
Example #3
    def get_words_from_proximity(
            self, keyword_list,
            text):  #think of how to get nouns from sentence only... !!
        #create object of doc_frequency

        doc_freq_obj = doc_freq_class.context()

        tokens = nltk.word_tokenize(text)
        #print "tokens:"
        #print tokens
        # keep only alphanumeric tokens; filtering into a new list avoids
        # skipping elements when removing from the list being iterated over
        tokens = [t for t in tokens if t.isalnum()]

        c = nltk.ConcordanceIndex(tokens, key=lambda s: s.lower())

        tokens_pos = nltk.pos_tag(tokens)
        i = 5
        doc_freq = []
        df_cnt = 0

        print "keywords going to loop",
        print keyword_list

        keywords = []
        for k in keyword_list:
            kw = nltk.word_tokenize(k)
            keywords.append(kw)

        print "keywords"
        print keywords
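        # each keyword string is now a list of its tokens, so multi-word
        # keywords can be matched token-by-token against the concordance offsets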

        for kw in keywords:
            print "keyword::::::::",
            print kw
            #split keyword not required as kw is list of strings
            #k = nltk.word_tokenize(kw)
            #print k

            #print "keywords in for ",
            #print kw
            first_word = kw[0]  #1st word in keyword
            #print "first word"
            #print first_word
            keyword_len = len(kw)
            #print "LEN="+str(keyword_len)
            i = 5
            nomatch = 0
            #print "IN KWD LOOP."
            print "offset",
            print c.offsets(first_word)
            doc_freq.append(document_frequency(kw))
            no_of_times = 0
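            # unlike the earlier version, one document_frequency record is kept per
            # keyword (appended above, before the offset loop), and no_of_times caps
            # the work at the first two occurrences of the keyword's first word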
            for offset in c.offsets(first_word):
                print kw
                j = 1
                i = 5
                #print "Keyword=",
                #print kw,
                #print " OFFSET=" + str(offset)
                nomatch = 0
                while j < keyword_len:
                    #print "in while"
                    #print tokens[offset+j]
                    #print kw[j]

                    if tokens[offset + j].lower() != kw[j].lower():
                        #print tokens[offset+j]
                        #print k[j]
                        nomatch = 1
                        break
                    j = j + 1
                if nomatch == 0:

                    #print "matched kwd",
                    #print tokens[offset:offset+j-1]
                    #print tokens[offset-5:offset+5]
                    i = 5
                    while i > 0:
                        if (offset - i) < 0:
                            break

                        # keep the neighbour only if it is a noun and the word
                        # itself (not its POS tag) is not an English stopword
                        if (tokens_pos[offset - i][1] in [
                                "NN", "NNP"
                        ]) and (tokens_pos[offset - i][0].lower()
                                not in nltk.corpus.stopwords.words('english')):
                            #doc_freq_obj.get_together_DF("")
                            #print "dfcnt:" + str(df_cnt)
                            #print "i: " + str(i)
                            doc_freq[df_cnt].addneighbour(tokens_pos[offset -
                                                                     i][0])

                            print tokens_pos[offset - i][0],

                            #pass
                        i = i - 1

                    print "\m/ ",
                    print kw,
                    print "\m/ ",
                    i = 1

                    while i < 5:
                        if (offset + i + (keyword_len - 1)) >= len(tokens):
                            break

                        # same noun / stopword check for words after the keyword
                        if (tokens_pos[offset + i + (keyword_len - 1)][1] in [
                                "NN", "NNP"
                        ]) and (tokens_pos[offset + i +
                                           (keyword_len - 1)][0].lower()
                                not in nltk.corpus.stopwords.words('english')):
                            #pass
                            doc_freq[df_cnt].addneighbour(
                                tokens[offset + i + (keyword_len - 1)])
                            print tokens_pos[offset + i +
                                             (keyword_len - 1)][0],

                        i = i + 1
                    k = 0
                    print "\n\n"
                    while k < doc_freq[df_cnt].cnt:
                        #doc_freq[df_cnt].neighbours[k].freq_word = fd1[context_vectors[CV_cnt].keyword]
                        doc_freq[df_cnt].neighbours[k].find_doc_freq(
                            doc_freq[df_cnt].keyword)
                        k = k + 1

                    doc_freq[df_cnt].neighbours.sort(
                        key=lambda x: x.freq_together, reverse=True)
                    if doc_freq[df_cnt].cnt > 5:
                        # keep the 5 neighbours with the highest weight
                        doc_freq[df_cnt].neighbours = doc_freq[df_cnt].neighbours[:5]
                        doc_freq[df_cnt].cnt = 5
                    k = 0
                    #while k < doc_freq[df_cnt].cnt:
                    print "keyword: ",
                    for l in doc_freq[df_cnt].keyword:
                        print l,
                    print "\n"
                    print "neighbours: ",
                    for m in doc_freq[df_cnt].neighbours:
                        print m.word,
                        print "\n"
                        #k += 1
                no_of_times = no_of_times + 1
                if no_of_times >= 2:
                    break

            #import pdb;pdb.set_trace();
            df_cnt = df_cnt + 1
        #results = search_web(doc_freq)
        print doc_freq
        return doc_freq
Example #4
    def __init__(self, word):
        self.word = word
        self.freq_word = 0        # frequency of the word in the document
        self.freq_together = 0    # frequency of the word occurring together with the keyword
        self.doc_freq_obj = doc_freq_class.context()
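A minimal sketch of what find_doc_freq, which the proximity examples above call on these neighbour objects, might look like, assuming it only wraps the get_together_DF call already shown in main(); the body below is hypothetical and not part of the original listing:

    def find_doc_freq(self, keyword):
        # hypothetical sketch: record how often this neighbour word occurs in
        # documents together with the keyword, via the shared context object
        self.freq_together = self.doc_freq_obj.get_together_DF(keyword, self.word)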
Example #5
def main():
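    # Overall flow: scrape the page, take proper nouns (NNP) from the <title>,
    # extract NNP n-grams from the body text, rank the n-grams by their highest
    # together-document-frequency with any title noun, keep the top three, and
    # finally collect proximity neighbours for those keywords.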
    obj = proper_noun()
    text = obj.scrape(sys.argv[1])
    title = alchemyObj.URLGetTitle(sys.argv[1])
    soup = BeautifulSoup(title)
    raw = soup('title')
    tokens_title_first = [str(title.text) for title in raw]
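    # URLGetTitle returns a markup response; BeautifulSoup extracts the <title>
    # element(s) and their text is collected into tokens_title_first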
    #tokens_title = ['Three', 'Musketeers']
    print "title::",
    print tokens_title_first
    #text = original
    ### Take nouns in title
    tokens_title_first = str(tokens_title_first[0])
    print tokens_title_first
    tokens_title_temp = nltk.word_tokenize(tokens_title_first)
    tokens_title_pos = nltk.pos_tag(tokens_title_temp)
    print "tokens_title_temp::",
    print tokens_title_temp

    tokens_title = []      ## filtered copy of the title tokens
    # enumerate gives the correct index even when a title token repeats
    # (list.index() would always return the first occurrence)
    for index, t in enumerate(tokens_title_temp):
        print "t::" + t
        print "index::" + str(index)
        print "tag::" + tokens_title_pos[index][1]
        print "len" + str(len(t))
        if (t.isalpha() and (tokens_title_pos[index][1] == "NNP") and (len(t) >= 3)):
            # tokens_title.remove(t)
            tokens_title.append(t)
    tokens_title.sort()
    # sort + groupby removes duplicate title tokens
    tokens_title = list(tokens_title for tokens_title, _ in itertools.groupby(tokens_title))
    print "title::",
    print tokens_title 
    list_of_NNPs = obj.get_nnp_ngrams(text,5,0)
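    # get_nnp_ngrams (defined elsewhere) returns candidate proper-noun n-grams
    # from the text; the meaning of the arguments 5 and 0 is not shown in this listing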
    
    #list_of_NNPs = [['Three','Musketeers'],['Alexandre', 'Dumas']]#,['Cardinal', 'Richelieu'],['Athos'],['Although'],['Porthos'] ]
    print "list of NNPs: ",
    print list_of_NNPs
    if len(list_of_NNPs) > 3:
        list_of_NNPs = list_of_NNPs[0:3]    # limit to the first 3 n-grams
    doc_freq_obj = doc_freq_class.context()
    print "getting doc freq"
    max_df = []
    for n in list_of_NNPs:
        print "got n"
        max_freq = 0
        for t in tokens_title:
            print "got t"
            df = doc_freq_obj.get_together_DF(n,t)
            if df > max_freq:
                max_freq = df
            print "ngram:",
            print n
            print "title word:",
            print t
            print "df:",
            print df
        max_df.append(max_freq)
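        # max_freq is this n-gram's highest together-document-frequency with any
        # title noun; max_df collects one such score per n-gram for the ranking below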
    # sort list_of_NNPs (and max_df in parallel) in descending order of max_df
    # with a simple bubble sort
    for _ in range(len(max_df)):
        for i in range(len(max_df) - 1):
            if max_df[i] < max_df[i + 1]:
                list_of_NNPs[i], list_of_NNPs[i + 1] = list_of_NNPs[i + 1], list_of_NNPs[i]
                max_df[i], max_df[i + 1] = max_df[i + 1], max_df[i]
    #i = 0
    for i in range(len(list_of_NNPs)):
        print "keyword: ",
        print list_of_NNPs[i] 
        print "df:",
        print max_df[i]
    if len(list_of_NNPs) > 3:
        list_of_NNPs = list_of_NNPs[0:3]    # keep only the top 3 keywords
    #list_of_NNPs.sort()
    #list_of_NNPs_final = list(list_NNPs for list_NNPs,_ in itertools.groupby(list_of_NNPs))
    #list_of_NNPs_final.sort()
    print "\n\nfinal list:",
    print list_of_NNPs
    nearbywordsObj = getnearbywords_intokens.getnearbywords()
    nearbywordsObj.get_words_from_proximity(list_of_NNPs,text) 
Example #6
    def keywords(self, url, text):

        #        import pdb;pdb.set_trace();
        title = alchemyObj.URLGetTitle(url)
        soup = BeautifulSoup(title)
        raw = soup('title')
        tokens_title_first = [str(title.text) for title in raw]

        print "title::",
        print tokens_title_first
        #text = original
        ### Take nouns in title
        tokens_title_first = str(tokens_title_first[0])
        print tokens_title_first
        tokens_title_temp = nltk.word_tokenize(tokens_title_first)
        tokens_title_pos = nltk.pos_tag(tokens_title_temp)
        print "tokens_title_temp::",
        print tokens_title_temp

        tokens_title = []  ##create duplicate list
        # enumerate gives the correct index even when a title token repeats
        for index, t in enumerate(tokens_title_temp):
            print "t::" + t
            print "index::" + str(index)
            print "tag::" + tokens_title_pos[index][1]
            print "len" + str(len(t))
            if (t.isalpha() and (tokens_title_pos[index][1] == "NNP")
                    and (len(t) >= 3)):
                # tokens_title.remove(t)
                tokens_title.append(t)
        tokens_title.sort()
        tokens_title = list(
            tokens_title
            for tokens_title, _ in itertools.groupby(tokens_title))
        print "title::",
        print tokens_title
        list_of_NNPs = self.get_nnp_ngrams(text, 5, 0)

        print "list of NNPs: ",
        print list_of_NNPs
        doc_freq_obj = doc_freq_class.context()
        print "getting doc freq"
        max_df = []
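        # the document-frequency ranking used in main() is left commented out
        # below; this method returns the NNP n-grams unranked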
        # for n in list_of_NNPs:
        #print "got n"
        #    max_freq = 0
        #   for t in tokens_title:
        #   print "got t"
        #      df = doc_freq_obj.get_together_DF(n,t)
        #      if df > max_freq:
        #          max_freq = df
        #      print "ngram:",
        #      print n
        #      print "title word:",
        #      print t
        #      print "df:",
        #      print df
        #  max_df.append(max_freq)
        #i = 0
        #for df in max_df:
        #    for i in range(len(max_df)-1):
        #        if max_df[i]<max_df[i+1]:
        #           t = list_of_NNPs[i]
        #           list_of_NNPs[i]=list_of_NNPs[i+1]
        #           list_of_NNPs[i+1]= t
        #           t1 = max_df[i]
        #           max_df[i]=max_df[i+1]
        #           max_df[i+1] = t1

        # for i in range(len(list_of_NNPs)):
        #     print "keyword: ",
        #     print list_of_NNPs[i]
        #     print "df:",
        #     print max_df[i]
        # print "\n\nfinal list:",
        # print list_of_NNPs
        return list_of_NNPs
Example #7
    def keywords(self,url,text):
        
#        import pdb;pdb.set_trace();
        title = alchemyObj.URLGetTitle(url)
        soup = BeautifulSoup(title)
        raw = soup('title')
        tokens_title_first = [str(title.text) for title in raw]
   
        print "title::",
        print tokens_title_first
        #text = original
        ### Take nouns in title
        tokens_title_first = str(tokens_title_first[0])
        print tokens_title_first
        tokens_title_temp = nltk.word_tokenize(tokens_title_first)
        tokens_title_pos = nltk.pos_tag(tokens_title_temp)
        print "tokens_title_temp::",
        print tokens_title_temp

        tokens_title = []      ##create duplicate list
        # enumerate gives the correct index even when a title token repeats
        for index, t in enumerate(tokens_title_temp):
            print "t::" + t
            print "index::" + str(index) 
            print "tag::" + tokens_title_pos[index][1]
            print "len" + str(len(t))
            if (t.isalpha() and (tokens_title_pos[index][1] == "NNP") and (len(t) >= 3)):
                # tokens_title.remove(t)
                tokens_title.append(t)
        tokens_title.sort()
        tokens_title = list(tokens_title for tokens_title,_ in itertools.groupby(tokens_title))
        print "title::",
        print tokens_title 
        list_of_NNPs = self.get_nnp_ngrams(text,5,0)
    
   
        print "list of NNPs: ",
        print list_of_NNPs
        doc_freq_obj = doc_freq_class.context()
        print "getting doc freq"
        max_df = []
        # for n in list_of_NNPs:
        #     print "got n"
        #     max_freq = 0
        #     for t in tokens_title:
        #         print "got t"
        #         df = doc_freq_obj.get_together_DF(n,t)
        #         if df > max_freq:
        #             max_freq = df
        #         print "ngram:",
        #         print n
        #         print "title word:",
        #         print t
        #         print "df:",
        #         print df
        #     max_df.append(max_freq)
        # i = 0
        # for df in max_df:
        #     for i in range(len(max_df)-1):
        #         if max_df[i]<max_df[i+1]:
        #             t = list_of_NNPs[i]
        #             list_of_NNPs[i]=list_of_NNPs[i+1]
        #             list_of_NNPs[i+1]= t
        #             t1 = max_df[i]
        #             max_df[i]=max_df[i+1]
        #             max_df[i+1] = t1

        # for i in range(len(list_of_NNPs)):
        #     print "keyword: ",
        #     print list_of_NNPs[i]
        #     print "df:",
        #     print max_df[i]
        # print "\n\nfinal list:",
        # print list_of_NNPs
        return list_of_NNPs