def hello(self, sentence):
    '''
    tokenized_sentence = word_tokenize(sentence)
    punctuation = re.compile(r'[-.?!,":;()|0-9]')
    tokenized_sentence = list(filter(None, tokenized_sentence))
    tokenized_sentence = [punctuation.sub("", word) for word in tokenized_sentence]
    extracted = []
    for w in tokenized_sentence:
        if (w.lower() not in stopwords.words('english') and w != ""):
            extracted.append(w)
    tagged_sent = pos_tag(extracted)
    interest_types = ["NN", "NNP", "NNS", "VBG", "VB"]
    for tagged in tagged_sent:
        word_type = tagged[1]
        if word_type in interest_types:
            if (tagged[0] not in extracted and tagged[0] != ""):
                extracted.append(tagged[0])
    importantwords = ', '.join(extracted)
    '''
    extracted = []
    rake = Rake("SmartStoplist.txt")
    keywords = rake.run(sentence)
    return json.dumps([dict(name=keyword[0], weight=keyword[1]) for keyword in keywords])
def generate_rake_keywords(self):
    if os.path.exists(self.keyword_file) and os.path.exists(self.rake_score_file):
        print('Already have the files [' + self.keyword_file + ', ' + self.rake_score_file + ']',
              ', directly load them.')
        self.load()
        return
    self.dataset.load()
    if self.path is None:
        self.path = self.dataset.default_path
    ger_stop_words = stopwords.words('german')
    stop_words = stopwords.words('english')
    stop_words.extend(ger_stop_words)
    stop_words.extend(['via', 'using', 'fr'])
    r = Rake(stop_words)
    r.extract_keywords_from_sentences(self.dataset.titles)
    path = os.path.join(self.path, 'keyword.dat')
    print('generate keywords', end='', flush=True)
    with open(path, 'wb') as f:
        i = 0
        for title in self.dataset.titles:
            i += 1
            if i % 100000 == 0:
                print('.', end='', flush=True)
            phrases = r.generate_phrases(title)
            phrases = [' '.join(phrase) for phrase in phrases]
            self.keywords.append(phrases)
        pickle.dump(self.keywords, f)
    self.rake_scores = r.phrase_score
    path = os.path.join(self.path, 'rake_score.dat')
    with open(path, 'wb') as f:
        pickle.dump(self.rake_scores, f)
    print('done')
def get_phrases(sents, search_text, res_ind):
    '''
    :param sents: list of sentences to search
    :param search_text: search text
    :param res_ind: indices of the best-matching sentences
    :return: phrases shared by the query and the top results
    '''
    full_text = ' . '.join([sents[i] for i in res_ind])
    full_text = full_text + ' . ' + search_text
    rake = Rake()
    keys = rake.run(full_text)
    print(keys)
    query_phrases = []
    query_words = word_tokenize(search_text)
    for phr, score in keys:
        words = word_tokenize(phr)
        flag_present = 1
        for word in words:
            if word not in query_words:
                flag_present = 0
        if flag_present == 1:
            query_phrases.append((phr, score))
    print(query_phrases)
    ### change the phrase to all possible synonyms, find the phrase with the maximum match
    ### look for the nearest answer type to that phrase
    return keys
def get_keywords(text):
    """ Gets main keywords using the RAKE algorithm """
    rake = Rake("SmartStoplist.txt")
    keywords = rake.run(text)
    return [k[0] for k in keywords if len(k[0].split(" ")) <= 2 and k[1] > 1]
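# A hedged usage sketch for get_keywords() above: it assumes the Rake("SmartStoplist.txt")
# implementation whose run() returns (keyword, score) pairs sorted by descending score.
# The sample text is made up for illustration; the actual phrases depend on the stoplist.
sample_text = ("Compatibility of systems of linear constraints over the set of "
               "natural numbers is studied in this paper.")
short_keywords = get_keywords(sample_text)
print(short_keywords)  # phrases of at most two words whose RAKE score is above 1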
def get_keywords(text, stopwords="SmartStoplist.txt"):
    # The commented-out code below uses the rake-tutorial implementation, which I like better
    # but which is less recently updated: https://github.com/zelandiya/RAKE-tutorial
    # phrase_max_words = 3
    # min_word_chars = 5
    # min_kw_repeat_rate = 4
    # rake = Rake(stopwords, min_word_chars, phrase_max_words, min_kw_repeat_rate)
    rake = Rake(stopwords)
    keywords = rake.run(text)
    return keywords
def get_rake_kp(file_name, topk):
    json_file = open(file_name, 'r', encoding='utf-8')
    rake_kp = []
    for line in json_file.readlines():
        json_data = json.loads(line)
        cur_content = json_data['title'].strip().lower() + ' ' + json_data['abstract'].strip().lower()
        content_list = nltk.word_tokenize(cur_content)
        rake = Rake()
        keywords_dict = rake.run(cur_content)
        keywords_list = list(keywords_dict.keys())[:topk]
        kp_list = get_kp(content_list, keywords_list)
        rake_kp.append(kp_list)
    json_file.close()
    return rake_kp
def get_sentence_keyword_score(document, num_sentences):
    rake = Rake()
    keywords = rake.get_keywords(document)
    ranked_keywords = rake.generate_keyword_rank(keywords)
    sufficient_keywords_length = int(math.ceil(len(ranked_keywords) / 4.0))
    sufficient_keywords = ranked_keywords[:sufficient_keywords_length]
    total_keyword_score = 0.0
    # the value of a keyword is its score divided by the total score of all kept keywords
    sentence_keyword_score = [0.0] * num_sentences
    for keyword in sufficient_keywords:
        total_keyword_score += keyword['score']
    for keyword in sufficient_keywords:
        sentence_keyword_score[keyword['sentence_num']] += keyword['score'] / total_keyword_score
    return sentence_keyword_score
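# A small illustrative example of the scoring step in get_sentence_keyword_score above.
# The keyword dicts, scores, and sentence indices are made up purely for illustration.
sufficient_keywords = [
    {'score': 9.0, 'sentence_num': 0},
    {'score': 6.0, 'sentence_num': 2},
    {'score': 3.0, 'sentence_num': 0},
]
total_keyword_score = sum(k['score'] for k in sufficient_keywords)  # 18.0
sentence_keyword_score = [0.0, 0.0, 0.0]
for keyword in sufficient_keywords:
    sentence_keyword_score[keyword['sentence_num']] += keyword['score'] / total_keyword_score
print(sentence_keyword_score)  # roughly [0.667, 0.0, 0.333]: sentence 0 carries two thirds of the keyword weight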
def extract_sentiment_nltk(reviews):
    rake = Rake("SmartStoplist.txt")
    for item_id, review in reviews:
        pos_keyword_list, neg_keyword_list = extract_keywords_with_sentiment(review, rake)
        print("pos: {}".format(pos_keyword_list))
        print("neg: {}".format(neg_keyword_list))
        print("\n")
def extract_titles_rake(reviews):
    rake = Rake("SmartStoplist.txt")
    for item_id, review in reviews:
        print("Review: {}".format(review))
        title = extract_title_rake(review, rake)
        print(title)
        print("\n")
def abstract_analyze(pdf, abstract):
    match_word_file = "Matchlist.txt"
    match = load_match_words(match_word_file)
    stop_words_path = "SmartStoplist.txt"
    r = Rake(stop_words_path)
    temp = r.run(abstract)
    matched = []
    for item in temp:
        if item[1] >= 3:  # keep only phrases whose RAKE score is at least 3
            matched.append(item)
    matched = temp  # note: this reassignment discards the score filter above
    flag = False
    for item in matched:
        if item[0] in match:
            list3.append(pdf)
            flag = True
            break
    if not flag:
        list4.append(pdf)
def keyword_title(title_corpus):
    ## NLTK stopwords and punkt are needed here; they are stored in /usr/share/nltk_data
    # uncomment to download
    # nltk.download('stopwords')
    nltk.download('punkt')
    title_dict = {}
    for t in title_corpus:
        key = (t[3], t[4])
        if key in title_dict:
            title_dict[key].append(t[1])
        else:
            title_dict[key] = []
            title_dict[key].append(t[1])
    # extract keywords with year span
    title_years = {}
    for k, v in title_dict.items():
        key = (k[0],)  # year index
        if key in title_years.keys():
            title_years[key].append(v)
        else:
            title_years[key] = []
            title_years[key].append(v)
    for k, v in title_years.items():
        r = Rake()
        # a list of strings where each string is a sentence
        vs = [item.rstrip('\n') for sublist in v for item in sublist]
        # r.extract_keywords_from_sentences(vs)
        # print('The keywords for year:{}'.format(str(k[0])))
        # print(r.get_ranked_phrases_with_scores()[0:10])
        title_txt = '.'.join(vs)
        title_txt = title_txt.strip('\n')  # str.strip returns a new string, so keep the result
        r.extract_keywords_from_text(title_txt)
        print('The keywords for year:{}'.format(str(k[0])))
        # keyword phrases ranked from highest to lowest, with scores
        print(r.get_ranked_phrases_with_scores()[0:10])
def index():
    if request.method == "POST":
        job_description = request.form["description"]
        job_title = request.form["title"]
        rake = Rake("all_stop_words.txt")
        keyword_tuples = rake.run(job_description)
        keyword_dict = turn_tuple_to_dict(keyword_tuples)
        important_sentences = summarize(job_title, job_description)
        common_words = get_common_words(keyword_dict, important_sentences)
        return render_template("results.html",
                               keywords=keyword_dict,
                               summaries=important_sentences,
                               common_words=common_words)
    return render_template('index.html')
def get_keyword(text):
    rake = Rake("SmartStoplist.txt")
    if text == "":
        return ""
    keywords = rake.run(text)
    # return the top-ranked keyword, or an empty string if nothing was extracted
    return keywords[0][0] if keywords else ""
from pandas import DataFrame
from rake_nltk import Rake  # assumed import: the ranking_metric / rank_list usage below matches rake_nltk

# Stopword list taken from nltk.corpus.stopwords.words('english'), with a few extra words added for this text
stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",
             "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself',
             'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them',
             'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll",
             'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has',
             'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or',
             'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against',
             'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from',
             'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once',
             'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
             'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than',
             'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now',
             'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn',
             "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn',
             "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't",
             'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn',
             "wouldn't", 'across', 'needs', 'called', 'together', 'creates', 'tells', 'yet', '1996',
             'shows', 'following', 'discussed']

# Punctuation list based on string.punctuation, with a few extra tokens added for this text
punctuations = ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<',
                '=', '>', '."', ';}', '(...);', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}',
                '~', '<>', '[]', '()', '/*', '*/', '("', '")', ');', '//', '...', '!"', '•', "''", '",',
                '""', '', '[])', '".', '<<', '>>', '<<<', '!)', '(/*', '*)', '().', '();', '==']

file = open("jbn.txt", "r")  # the text used as data
text = file.read()

r = Rake(punctuations=punctuations, stopwords=stopwords, ranking_metric=2)
r.extract_keywords_from_text(text)

# To store the words in a text file instead:
# file1 = open("keywords.txt", "w")
# for i in range(len(r.rank_list) // 2):  # half of the extracted phrases are treated as keywords
#     file1.write(str(r.rank_list[i][0]) + " " + r.rank_list[i][1])
#     file1.write("\n")
# file1.close()

# Store the words in an Excel sheet
keywords = []
weights = []
for i in range(len(r.rank_list) // 2):  # half of the extracted phrases are treated as keywords
    keywords.append(r.rank_list[i][1])
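# The excerpt above stops before the Excel export its comments describe. A minimal,
# hypothetical continuation: collect the scores alongside the phrases and write both
# to a spreadsheet. The filename "keywords.xlsx" is arbitrary, and DataFrame.to_excel
# needs an Excel writer such as openpyxl installed.
for i in range(len(r.rank_list) // 2):
    weights.append(r.rank_list[i][0])

df = DataFrame({"keyword": keywords, "weight": weights})
df.to_excel("keywords.xlsx", index=False)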
def get_key_phrases(document, stop_list):
    r = Rake(stop_list)
    keywords = r.run(document.lower())
    phrase_list = [word[0] for word in keywords if len(word[0].split(" ")) < 4]
    return phrase_list
    'lol': ['league of legends', 'lol', 'riot'],
    'dota2': [
        'dota', 'dota2', 'defense of the ancients',
        'defense of the ancients 2', 'the international', 'ti7'
    ],
    'csgo': [
        'csgo', 'counter-strike', 'counter strike', 'cs-go',
        'counter-strike:global offensive'
    ],
    'overwatch': ['overwatch'],
    'wow': ['wow', 'world of warcraft'],
    'hots': ['hots', 'heroes of the storm'],
    'sc': ['starcraft 2', 'starcraft', 'sc2']
}

rake = Rake("SmartStoplist.txt", max_words_length=MAX_WORD_LENGTH)


def similar(a, b):
    if type(a) is list:
        similar_keywords = 0
        for a_val in a:
            for b_val in b:
                if Levenshtein.ratio(a_val, b_val) > KEYWORD_SIMILARITY_RATIO:
                    similar_keywords += 1
        return similar_keywords >= SIMILAR_KEYWORD_REQ
    return Levenshtein.ratio(a, b) > KEYWORD_SIMILARITY_RATIO


def remove_tags(text):
    TAG_RE = re.compile(r'<[^>]+>')
def extract_product(html_content, url):
    # String buffer
    string_buffer = ""
    errs = list()

    # Read the page to extract product information
    parser = BeautifulSoup(html_content, "html.parser")

    # Check whether the page is a product; if not, skip the page.
    truth, asin = check_page(parser)
    if not truth:
        errs.append("Not product")
        return (False, errs)

    # New Product object
    product = Product()

    # New keyword ranker
    keyword = Rake(SmartStopList.words())

    # Find URL
    product.SetUrl(url)

    # Find Brand. Note: some products have an image for the brand.
    truth, string_buffer = search_table(parser, {"id": "productDetails_techSpec_section_1"}, "Brand Name")
    if truth:
        product.SetBrand(string_buffer)
    else:
        string_buffer = parser.find("a", attrs={"id": "brand"})
        if string_buffer is not None:
            product.SetBrand(string_buffer.get_text().strip())
        else:
            errs.append("Could not find Brand")

    # Find Title
    string_buffer = parser.find("span", attrs={"id": "productTitle"})
    if string_buffer is not None:
        product.SetTitle(string_buffer.get_text().strip())
    else:
        errs.append("Could not find Title")
        return (False, errs)

    # Find Image
    string_buffer = parser.find("img", attrs={"id": "landingImage"})
    if string_buffer is not None:
        string_buffer = string_buffer.get("data-old-hires")
        if len(string_buffer) < 2:
            string_buffer = parser.find("img", attrs={"id": "landingImage"}).get("data-a-dynamic-image")
            m = re.search('https://(.+?).jpg', string_buffer)
            if m:
                string_buffer = m.group(1)
                string_buffer = "https://{}.jpg".format(string_buffer)
        # print("Img Url: " + string_buffer)
        product.SetImage(string_buffer)
    else:
        errs.append("Could not find Image")

    # Find small blob
    # TODO: Need to perform keyword analysis
    string_buffer = parser.find("div", attrs={"id": "feature-bullets"})
    if string_buffer is not None:
        string_buffer = string_buffer.find("ul")
        try:
            string_buffer = string_buffer.find_all("li")
            if string_buffer is not None:
                string_buffer_2 = ""
                for span in string_buffer:
                    string_buffer_3 = span.find("span")
                    if string_buffer_3 is not None:
                        string_buffer_3 = string_buffer_3.get_text()
                        try:
                            string_buffer_2 = "{} {}".format(string_buffer_2, string_buffer_3.strip())
                        except:
                            pass
                saved_buffer = string_buffer_2.strip()
                # Calculate keywords
                keywords_1 = keyword.run(saved_buffer)
                product.SetSmallBlog(keywords_1)
        except:
            errs.append("Error finding li")
    else:
        errs.append("Could not find small section keywords")

    # Find large blob
    # TODO: Need to perform keyword analysis
    string_buffer = parser.find("div", attrs={"id": "productDescription"})
    if string_buffer is not None:
        string_buffer = string_buffer.find("p")
        if string_buffer is not None:
            string_buffer = string_buffer.get_text()
            saved_buffer = string_buffer.strip()
            # Calculate keywords
            keywords_2 = keyword.run(saved_buffer)
            product.SetLargeBlob(keywords_2)
    else:
        errs.append("Could not find large section keywords")

    # Find ASIN
    product.SetSourceID(asin)

    # TODO: Perform price save!

    # Append the product to the large list of products
    if product.FormCompleted():
        return (product, errs)
    else:
        return (False, errs)
import datetime  # needed for starttime below

from rake import Rake
from nltk.corpus import stopwords

starttime = datetime.datetime.now()

ger_stop_words = stopwords.words('german')
stop_words = stopwords.words('english')
stop_words.extend(ger_stop_words)
stop_words.extend(['via', 'using', 'fr'])
r = Rake(stop_words)

with open(r'dblp_index/title.dat', encoding='utf-8') as f_title:
    titles = []
    for line in f_title:
        titles.append(line)
    r.extract_keywords_from_sentences(titles)

print('generate keywords', end='', flush=True)
with open(r'dblp_index/keywords.dat', 'a', encoding='utf-8') as keywords:
    with open(r'dblp_index/title.dat', encoding='utf-8') as titles:
        i = 0
        for line in titles:
            i += 1
            if i % 10000 == 0:
                print('.', end='', flush=True)
            phrases = r._generate_phrases(line)
            phrases_scores = []
            for phrase in phrases:
                true_phrase = '_'.join(phrase)
                score = r.phrase_score[true_phrase]
                phrases_scores.append(true_phrase + ":" + str(score))
            keywords.write(','.join(phrases_scores) + '\n')
print('done')
from rake import Rake

rake = Rake()
text = "杉山古墳(すぎやまこふん)は、奈良県奈良市大安寺にある古墳。形状は前方後円墳。大安寺古墳群を構成する古墳の1つ。国の史跡に指定されている(史跡「大安寺旧境内 附 石橋瓦窯跡」のうち)。"
print(rake.get_keywords(text, 3))
"""
Output keyword list:
['ぎやまこふん', '前方後円墳', '大安寺']
"""
def __getMainWords__(self, userInput):
    rake = Rake("SmartStoplist.txt")
    keywords = rake.run(userInput)
    return keywords
# coding: utf-8

# util
from read_conf import config
import csv
from optparse import OptionParser
import cPickle as pickle
import os
import re
from itertools import combinations

# rake
from rake import Rake
rake = Rake()

# nltk
import nltk
from nltk.util import clean_html
from nltk.util import clean_url

# nlp
from nlp import nlp
mnlp = nlp()

tag_re = re.compile(r"<p>(.+?)</p>", re.DOTALL)

dp = config("../conf/dp.conf")

# This function deduplicates:
# it first reads the titles, then compares them with the test set's titles to check for duplicates.
import sys
import json  # needed for json.loads below
import csv_io
import sets
import parser

reload(sys)
sys.setdefaultencoding("utf-8")

from textblob import TextBlob
from collections import Counter
from rake import Rake

results = {}
rake = Rake("SmartStoplist.txt")
users = parser.getUsers()

with open("data/edinburgh_restaurant_reviews.json") as f:
    data = json.loads(f.readline())

pos_polarity = 0
neg_polarity = 0
for business_id in data:
    results[business_id] = {}
    for review in data[business_id]:
        b = TextBlob(review["text"])
        if b.sentiment.polarity >= 0:
            pos_polarity += b.sentiment.polarity
        else:
    word_type = tagged[1]
    if word_type in interest_types:
        if (tagged[0] not in extracted and tagged[0] != ""):
            extracted.append(tagged[0])

importantwords = ', '.join(extracted)
# print(importantwords)

fdist = FreqDist(extracted)
# print(fdist)
# print(fdist.most_common(50))

rake = Rake("SmartStoplist.txt")
keywords = rake.run(sentence)
# print(keywords)

for keyword in keywords:
    word = keyword[0]
    # print(word)
    response = requests.get('http://en.wikipedia.org/wiki/Led_Zeppelin')
    soup = BeautifulSoup(response.text)
    content = soup.find(id='mw-content-text')
chapter_tree += [{
    'sno': '[\d]*',
    'level': -1,
    'title': 'Exercises',
    'pno': chapter_tree[-1]['pno'] + 1
}]  # to ensure the last section is processed as per my current logic

try:
    os.stat('resources/' + pdfname)
except:
    os.mkdir('resources/' + pdfname)
finally:
    os.chdir('resources/' + pdfname)

unigram_rake = Rake('../stopwords.txt', 3, 1, 3)
bigram_rake = Rake('../stopwords.txt', 3, 2, 3)
trigram_rake = Rake('../stopwords.txt', 3, 3, 2)
keywords = set()

if split_mode == CHAPTER_MODE:
    chapter_tree = filter(lambda x: x['level'] == 1, chapter_tree)
pages = pages[40:]
skip = []
preprocessed_sections = []
for i, (cur_topic, next_topic) in enumerate(zip(chapter_tree[:-1], chapter_tree[1:])):
    if next_topic['level'] != -1:
dirname = pdfname
try:
    os.stat('../resources/' + dirname)
except:
    os.mkdir('../resources/' + dirname)
finally:
    os.chdir('../resources/' + dirname)

with open('__Sections.csv', 'w') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Section No.', 'Level', 'Section', 'Page No.'])
    writer.writerows([[c['sno'], c['level'], c['title'], c['pno']] for c in chapter_tree])

unigram_rake = Rake('../stopwords.txt', 3, 1, 3)
bigram_rake = Rake('../stopwords.txt', 3, 2, 3)
trigram_rake = Rake('../stopwords.txt', 3, 3, 2)
keywords = set()

if split_mode == CHAPTER_MODE:
    chapter_tree = list(filter(lambda x: int(x['level']) in [1, -1], chapter_tree))
else:
    chapter_tree = list(filter(lambda x: int(x['level']) in [1, 2, -1], chapter_tree))

preprocessed_sections = []
for i, (cur_topic, next_topic) in enumerate(zip(chapter_tree[:-1], chapter_tree[1:])):
    if next_topic['level'] != -1:
def getRakeKeywords(doc):
    r = Rake(path.join('', cur_dir + '/SmartStoplist.txt'))
    candidates = r.run(open(doc).read().replace('\n', ' '))
    return candidates[:300]
from ensemble import apply_filters
# from ensemble import ensemble_predictions

num_cores = multiprocessing.cpu_count()
chunk_size = 11000
core_id = 4
offset = 5230

preprocessed_lines = open("resources/data/social_text_tokenized_pos", "r").readlines()
rake_data_lines = open("resources/data/social_text_tokenized", "r").readlines()
lemma_lines = open("resources/data/social_text_tokenized_lemma", "r").readlines()

rake = Rake("resources/CombinedStopList")
# rake_out_f = open("resources/data/social_text_tokenized_rake_filtered_" + str(core_id + 1), "a")
topic_rank_out_f = open(
    "resources/data/social_text_tokenized_pos_topicrank_filtered_" + str(core_id + 1), "a")
kpminer_out_f = open(
    "resources/data/social_text_tokenized_pos_kpminer_filtered_" + str(core_id + 1), "a")

input_file_path = "resources/sample_jd_preprocessed"
meta_file_path = "resources/sample_jd_meta_data"
freq_unigrams = get_freq_words()
new_preprocessed_lines = preprocessed_lines[
    chunk_size * core_id +
def __init__(self):
    self.model = Rake()
import pandas as pd
import numpy as np
import pdb
from nltk import RegexpTokenizer
import re
from Tokenizers import SynonymTokenizer
from Tokenizers import SynonymStemTokenizer
from Tokenizers import StemTokenizer
from nltk import word_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet as wn
from rake import Rake
from nltk.stem import WordNetLemmatizer
from nltk.tag.stanford import NERTagger

rr = Rake()


class AnswerProcessor(object):
    def __init__(self, query, answers, score=None):
        '''answers are a list of strings, query is a string, score is a list of scores for each answer'''
        self.query = query
        self.answers = answers
        self.score = score
        # self.question_types = {'who': 'PERSON', 'whom': 'PERSON', 'whose': 'PERSON', 'where': 'LOCATION',
        #                        'when': ('DURATION', 'DATE'), 'how+adj/adv': 'NUMBER', 'how long': 'DURATION',
        #                        'how many': 'NUMBER', 'how much': 'NUMBER'}
        self.question_types = {'who': 'PERSON', 'whom': 'PERSON', 'whose': 'PERSON', 'where': 'LOCATION',
                               'when': 'CD', 'how+adj/adv': 'CD', 'how long': 'CD', 'how many': 'CD',
                               'how much': 'CD'}
        ### for 'what', the next noun will be the thing we are searching for
        self.question_type = None
        self.query_tag = None
        self.answers_tag = None

    def stringProcessing(self, only_query=1):
        '''
        query is a string, answers is a list of strings.
        returns tuples with tags, with a list covering
        '''