def extractKeywords(self):
    """Read the file at ``self.name`` and print its RAKE keywords under two
    minimum-frequency settings.

    Prints (keyword, score) lists for min_keyword_frequency=1 and then 2.
    """
    with open(self.name, 'r') as myfile:
        # Join lines with a space: the original replaced '\n' with '',
        # which glued the last word of each line to the first word of
        # the next and corrupted keyword candidates.
        text = myfile.read().replace('\n', ' ')
    # Words >=3 chars, phrases <=3 words, keyword must appear at least once
    rake_object = rake.Rake("SmartStoplist.txt", 3, 3, 1)
    keywords = rake_object.run(text)
    print("keywords1 are ", keywords)
    # Same limits, but the keyword must appear at least twice
    rake_object = rake.Rake("SmartStoplist.txt", 3, 3, 2)
    keywords = rake_object.run(text)
    print("keywords2 are ", keywords)
def run_rake_standalone(stop_word_list, text):
    """Extract RAKE keywords from *text* using a default-configured extractor.

    ``stop_word_list`` is the stop-word source (e.g. SmartStoplist or
    FoxStopList).  Returns the (keyword, score) tuples from ``Rake.run``.
    """
    extractor = rake.Rake(stop_word_list)
    return extractor.run(text)
def search():
    """Extract keywords from the text-box contents, Google-search each
    keyword, POST to the top result URL, and print its <p> contents.

    NOTE(review): relies on module-level ``tbox1`` (a Tk-style entry widget),
    plus ``urllib`` and ``re`` imports elsewhere in the file.  The inner
    ``from googlesearch import search`` shadows this function's own name
    inside the loop.
    """
    text = tbox1.get()
    print(text)
    stoppath = "data/stoplists/SmartStoplist.txt"
    # Words >=5 chars, phrases <=3 words, keyword appears >=4 times
    rake_object = rake.Rake(stoppath, 5, 3, 4)
    keywords = rake_object.run(text)
    print("Keywords:", keywords)
    l = (len(keywords))
    print(l)
    for i in keywords:
        x = i  # (keyword, score) tuple
        print(x[0])
        try:
            # Imported on every iteration; on failure only a message is
            # printed, so the call below would raise NameError — TODO confirm.
            from googlesearch import search
        except ImportError:
            print("No module named 'google' found")
        query = x[0]
        # Take just the first Google hit (tld co.in, polite 2s pause)
        for j in search(query, tld="co.in", num=1, stop=1, pause=2):
            print(j)
            url = j
            values = {'s': 'basics', 'submit': 'search'}
            data = urllib.parse.urlencode(values)
            data = data.encode('utf-8')
            # Supplying data makes this a POST request
            req = urllib.request.Request(url, data)
            resp = urllib.request.urlopen(req)
            respData = resp.read()
            # Naive regex scrape of paragraph bodies from the raw response
            paragraphs = re.findall(r'<p>(.*?)</p>', str(respData))
            for eachP in paragraphs:
                print(eachP)
def keywords_from_tweet_list(self, tweet_list, num_characters=3, max_phrase=3, remove_repeats=False):
    """Run RAKE over a list of tweets and store/return the keywords.

    Args:
        tweet_list (list): a block of tweets.
        num_characters (int): minimum number of characters in a keyword.
            Default is 3.
        max_phrase (int): maximum number of words in a keyword phrase.
            Default is 3.
        remove_repeats (bool): if True, drop shorter keywords repeated
            inside longer ones.

    Returns:
        list: (keyword, RAKE score) tuples.
    """
    tweet_count = len(tweet_list)
    corpus = ' '.join(itertools.chain(*tweet_list))
    # Minimum keyword frequency scales with corpus size: one occurrence
    # required per 1000 tweets.
    extractor = rake.Rake("SmartStoplist.txt", num_characters, max_phrase,
                          tweet_count / 1000)
    self.keywords = extractor.run(corpus)
    if remove_repeats:
        self.keywords = self.__remove_shorter_repeats(self.keywords)
    return self.keywords
def get_best_params(test_doc, test_set):
    """Grid-search RAKE parameters and return the best-scoring combination.

    Tries min_char_length 3..7, max_words_length 3..5 and
    min_keyword_frequency 1..6, scoring precision/recall of the top-3 RAKE
    keywords against the manual keywords of *test_doc*.

    :param test_doc: document exposing .text and .keywords (gold keywords)
    :param test_set: collection of documents; only len(test_set) is used
    :return: [min_char_length, max_words_length, min_keyword_frequency]

    NOTE(review): ``total_fmeasure`` is reset inside the innermost loop and
    only *test_doc* is ever evaluated, yet the average divides by
    ``len(test_set)`` — looks like a leftover from a loop over all docs;
    confirm intent.  Also raises ZeroDivisionError when test_doc.keywords
    is empty.
    """
    best_fmeasure = 0
    best_vals = []
    for min_char_length in range(3, 8):
        for max_words_length in range(3, 6):
            for min_keyword_frequency in range(1, 7):
                rake_object = rake.Rake('SmartStoplist.txt', min_char_length, max_words_length, min_keyword_frequency)
                total_fmeasure = 0
                keywords = rake_object.run(test_doc.text)
                num_manual_keywords = len(test_doc.keywords)
                correct = 0
                try:
                    # Precision@3: count top-3 RAKE keywords that are gold
                    for i in range(0, min(3, len(keywords))):
                        if keywords[i][0] in set(test_doc.keywords):
                            correct += 1
                except IndexError:
                    print('Problem with evaluating ', keywords)
                precision = correct / float(3)
                recall = correct / float(num_manual_keywords)
                if precision > 0 and recall > 0:
                    total_fmeasure += 2 * precision * recall / (precision + recall)
                avg_fmeasure = round(total_fmeasure * 100 / float(len(test_set)), 2)
                if avg_fmeasure > best_fmeasure:
                    best_fmeasure = avg_fmeasure
                    best_vals = [min_char_length, max_words_length, min_keyword_frequency]
    return best_vals
def rake_phrase(self):
    """Extract the top RAKE phrases from the primary text.

    Stores the first ``self.number_of_keywords`` results in
    ``self.phrase_rake_list`` and returns them.
    """
    extractor = rake.Rake(self.stoppath)
    ranked = extractor.run(self.primary_text)
    self.phrase_rake_list = ranked[:self.number_of_keywords]
    print(f'phrases according to rake: {self.phrase_rake_list}')
    return self.phrase_rake_list
def extractPostKeywords(title, post):
    """Mine a post (title + body) for keywords with RAKE.

    For every extracted phrase this emits each individual word plus every
    underscore-joined suffix of the phrase, built right-to-left
    (e.g. "a b c" -> c, b, b_c, a, a_b_c).

    Returns:
        list: keyword words and underscore-joined phrase suffixes.
    """
    # Stop list, word min length 3, phrase max 3 words, min frequency 1
    miner = rake.Rake("SmartStoplist.txt", 3, 3, 1)
    scored_phrases = miner.run(title + " " + post)
    keywords = []
    for scored in scored_phrases:
        words = scored[0].split(' ')
        suffix = words[-1]
        keywords.append(suffix)
        # Walk the remaining words right-to-left, emitting each word and
        # the growing underscore-joined suffix.
        for word in reversed(words[:-1]):
            keywords.append(word)
            suffix = word + '_' + suffix
            keywords.append(suffix)
    return keywords
def get_final_keywords(self, test_doc, test_set):
    """
    Gets optimum parameters and initializes a rake object using them.
    Gets keywords and writes them to the .key file under the same name
    as test_doc.

    :param test_doc: document to extract keywords from
    :param test_set: set of documents
    """
    # Gets optimum parameters for the document
    best_params = optimize_rake.get_best_params(test_doc, test_set)
    # Initializes rake object using the optimized parameters
    rake_object_final = rake.Rake('SmartStoplist.txt', best_params[0],
                                  best_params[1], best_params[2])
    keywords = rake_object_final.run(test_doc.text)
    # `with` guarantees the file is closed (the original leaked the handle).
    with open(os.path.join(self.test_dir, test_doc.name + '.key'), 'w') as key_file:
        # Keyword, dotted padding to column 30, then the rounded score
        for keyword in keywords:
            key = keyword[0]
            score = str(round(keyword[1], 1))
            key_file.write(key + '.' * (30 - len(key)) + score + '\n')
def extractkeywords(url):
    """Scrape *url* and return its multi-word RAKE keywords, one per line.

    The site's brand phrase is filtered out; returns None when the page
    can't be fetched or no qualifying keyword remains.
    """
    import rake
    import operator
    import re
    html = gethtml(url)
    if not html:
        return None
    urlkws = urltokw(url)
    brandfilter = brand(url)
    title = scraper(html, '//title/text()')
    description = scraper(
        html,
        "//meta[translate(@name, 'ABCDEFGHJIKLMNOPQRSTUVWXYZ', 'abcdefghjiklmnopqrstuvwxyz')='description']/@content"
    )
    scrubbed = barebones(html)
    # Title and description lead so they carry early weight in the text.
    newtxt = "%s %s %s %s" % (title, description, scrubbed, urlkws)
    newtxt = newtxt.replace('\n', ' ')
    newtxt = re.sub('<[^<]+?>', ' ', newtxt)  # strip leftover markup
    newtxt = re.sub(' +', ' ', newtxt)        # collapse runs of spaces
    extractor = rake.Rake("/var/pipulate/SmartStoplist.txt", 3, 4, 2)
    ranked = extractor.run(newtxt)
    brand_squashed = brandfilter.replace(' ', '')
    kept = []
    for keyword in ranked:
        kw = keyword[0]
        # Keep only multi-word phrases that aren't just the brand name.
        if len(kw.split()) > 1 and kw.replace(' ', '') != brand_squashed:
            kept.append(kw + '\n')
    stackum = ''.join(kept)
    return stackum if stackum else None
def RakeExtract(text, stoppath="SmartStoplist.txt"):
    """
    Extract the top third of RAKE keyword candidates from *text*.

    :param text: string to analyze
    :param stoppath: path to a stop-word list
    :return: list of (keyword, score) tuples, highest score first
    """
    # Words >=2 characters, phrases <=2 words, frequency >=1.
    # NOTE(review): this Rake object is never used below — the score
    # pipeline is driven with the module-level helper functions instead.
    rake_object = rake.Rake(stoppath, 2, 2, 1)
    # Split the text into sentences
    sentenceList = rake.split_sentences(text)
    stopwordpattern = rake.build_stop_word_regex(stoppath)
    # Generate candidate phrases (maximal non-stopword runs)
    phraseList = rake.generate_candidate_keywords(sentenceList, stopwordpattern)
    wordscores = rake.calculate_word_scores(phraseList)
    keywordcandidates = rake.generate_candidate_keyword_scores(
        phraseList, wordscores)
    # .items() replaces the Python-2-only dict.iteritems(), which raises
    # AttributeError on Python 3.
    sortedKeywords = sorted(keywordcandidates.items(),
                            key=operator.itemgetter(1),
                            reverse=True)
    totalKeywords = len(sortedKeywords)
    # Integer division: `totalKeywords / 3` is a float on Python 3 and
    # crashes as a slice index.
    keywords = sortedKeywords[0:(totalKeywords // 3)]
    return keywords
def get_keywords(text):
    """Return the top three RAKE keywords of *text* as (keyword, score) tuples."""
    stoplist_path = os.path.join(BASE_DIR, 'job_match', 'stop_words.txt')
    # Words >=3 chars, phrases <=5 words, keyword appears >=2 times
    extractor = rake.Rake(stoplist_path, 3, 5, 2)
    return extractor.run(text)[:3]
def get_keywords(paragraph):
    """Return a list holding at most the single top RAKE keyword of *paragraph*.

    The original loop broke as soon as one keyword was collected, so the
    result is the first keyword only (or an empty list).
    """
    ranked = rake.Rake("SmartStoplist.txt", 3, 1, 1).run(paragraph)
    return [ranked[0][0]] if ranked else []
def main():
    """Read lines from stdin and print the processed RAKE output of each.

    On failure, log the offending line to stderr and re-raise.
    """
    # Pre-bind so the error handler can't itself raise NameError when the
    # failure happens before the first line is read.
    line = ""
    try:
        rake_object = rake.Rake("SmartStoplist.txt", 5, 3, 1)
        for line in sys.stdin:
            print(process(rake_object, line.strip()))
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # propagate untouched; real errors are still logged and re-raised.
        print("FAIL! " + line, file=sys.stderr)
        raise
def get_keyword_confidence(message):
    """Return {keyword: rounded score} for RAKE keywords scoring >= 3.

    :param message: text to analyze
    :return: dict mapping each qualifying keyword to its score rounded
        to one decimal place
    """
    rake_object = rake.Rake("SmartStoplist.txt", 3, 3, 1)
    keywords = rake_object.run(message)
    # Dict comprehension; the original loop shadowed the builtin `tuple`.
    return {kw: round(score, 1) for kw, score in keywords if score >= 3}
def rakes(raw_string):
    """Return {keyword: score} from RAKE over *raw_string*.

    Words need >=3 characters, phrases are <=2 words, frequency >=1.
    """
    rake_object = rake.Rake(
        "/home/anubhav/Desktop/Maui Final/SmartStoplist.txt", 3, 2, 1)
    # Direct iteration replaces the original `range(len(...))` index loop.
    return {keyword[0]: keyword[1] for keyword in rake_object.run(raw_string)}
def autoTag():
    """Flask view: RAKE-tag the posted 'abstract' field and return up to the
    10 highest-scoring shortened phrases as JSON."""
    abstract = request.values['abstract']
    rake_object = rake.Rake("SmartStoplist.txt")
    # print() call — the original Python-2 print statement is a SyntaxError
    # in this otherwise Python-3 file.
    print(rake_object)
    keywords = rake_object.run(abstract)
    shortenedPhrasesList = shortPhrases(keywords, 2)
    # Sort ascending by score, then keep only the phrase column.
    # list(...) is required on Python 3, where zip() returns a lazy iterator
    # that cannot be indexed.
    shortenedPhrasesList = list(zip(*sorted(shortenedPhrasesList,
                                            key=lambda arr: arr[1])))[0]
    return jsonify(
        keywords=shortenedPhrasesList[-min(len(shortenedPhrasesList), 10):])
def findSubject(post):
    """Return the RAKE keywords of *post*.

    Words need >=2 characters, phrases are <=2 words, and each keyword
    must appear at least twice.
    """
    # 1. initialize RAKE with the stop-word file and the limits above
    extractor = rake.Rake("SmartStoplist.txt", 2, 2, 2)
    # 2. run RAKE on the given text and hand back the scored keywords
    return extractor.run(post)
def getKeywords(self, content):
    """Return the keywords whose RAKE score falls in the top half of the
    distinct scores observed for *content*."""
    ranked = rake.Rake("SmartStoplist.txt", 3, 2).run(content)
    # Distinct scores, highest first.
    distinct_scores = sorted({pair[1] for pair in ranked}, reverse=True)
    keep_count = round(0.5 * len(distinct_scores))
    cutoff_scores = distinct_scores[:keep_count]
    return [pair[0] for pair in ranked if pair[1] in cutoff_scores]
def process(user):
    """Flask view: look up a StackOverflow user, RAKE the posted question,
    and render the results page.

    :param user: StackOverflow user id/name from the route
    """
    API_KEY = ")e55ob6fBvCtSTibWPyP*A(("
    site = stackexchange.Site(stackexchange.StackOverflow, API_KEY,
                              impose_throttling=True)
    uname = user
    user = site.user(user)
    question = request.form['question']
    # Words >=3 chars, phrases <=5 words, frequency >=1
    rake_object = rake.Rake("SmartStoplist.txt", 3, 5, 1)
    keywords = rake_object.run(question)
    # print() call — the original Python-2 print statement is a SyntaxError
    # in this otherwise Python-3 file.
    print("keywords: ", keywords)
    recent = site.recent_questions()
    return render_template('postfinal.html', user=user, site=site,
                           keywords=keywords, question=question,
                           recent=recent, uname=uname)
def execute_rake(text):
    """Run RAKE over *text* and return (keyword, score) pairs with every
    score divided by the maximum, ordered from highest to lowest."""
    stoplist_path = "StemStoplist.txt"
    extractor = rake.Rake(stoplist_path, max_words_length=3)
    scored = extractor.run(text)
    # Highest raw score is the normalization denominator.
    top_score = max(item[1] for item in scored)
    normalized = [(item[0], item[1] / top_score) for item in scored]
    normalized.sort(key=lambda pair: pair[1], reverse=True)
    return normalized
def rake_keywords(self):
    """Store and return the top RAKE keywords of the primary text.

    Results are trimmed to ``self.number_of_keywords`` when possible.
    """
    stoppath = "data/stoplists/SmartStoplist.txt"
    # Words >=3 chars, phrases <=3 words, keyword appears >=4 times
    rake_object = rake.Rake(stoppath, 3, 3, 4)
    self.keywords_rake_list = rake_object.run(self.primary_text)
    try:
        self.keywords_rake_list = \
            self.keywords_rake_list[:self.number_of_keywords]
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # still propagate; keeps the full list when trimming fails
        # (e.g. number_of_keywords attribute missing).
        pass
    print("Keywords according to rake:", self.keywords_rake_list)
    return self.keywords_rake_list
def extractKeyWords(description):
    """Return just the keyword strings RAKE finds in *description*.

    Words need >=3 characters, phrases are <=3 words, frequency >=1.
    """
    # A leftover hard-coded sample `text` variable from the original was
    # removed — it was assigned but never used.
    rake_object = rake.Rake("expStopList.txt", 3, 3, 1)
    results = rake_object.run(description)
    # Comprehension replaces the original index loop over range(len(...)).
    return [item[0] for item in results]
def keyword_tokenize(text):
    '''
    INPUT: String
    OUTPUT: List of candidate keyword phrases

    Splits *text* into sentences and returns RAKE's candidate keyword
    phrases (maximal non-stopword runs).
    '''
    stoppath = 'RAKE-tutorial/SmartStoplist.txt'
    sentenceList = rake.split_sentences(text)
    stopwordpattern = rake.build_stop_word_regex(stoppath)
    phraseList = rake.generate_candidate_keywords(sentenceList,
                                                  stopwordpattern)
    # The original also constructed a Rake object and computed word scores
    # that were never used — both removed as dead code.
    return phraseList
def main():
    """Find the first data-dir file whose name contains argv[1] and print
    its RAKE keywords (words >=5 chars, phrases <=2 words, frequency >=2)."""
    #runRake()
    #compare()
    fn = sys.argv[1]
    for filename in os.listdir(dataDir):
        if fn in filename:
            filepath = os.path.join(dataDir, filename)
            rake_object = rake.Rake(stoppath, 5, 2, 2)
            # `with` closes the file handle (the original leaked it).
            with io.open(filepath, 'r') as sample_file:
                text = sample_file.read()
            keywords = rake_object.run(text)
            print(keywords)
            break
def getKeywords(text):
    """Return RAKE keyword strings for *text*.

    Words need >=2 characters, phrases are <=3 words, and each keyword
    must appear at least 4 times.
    """
    ## new rake object
    rake_o = rake.Rake("SmartStoplist.txt", 2, 3, 4)
    ## filter out keywords
    keywords = rake_o.run(text)
    ## comprehension replaces the original preallocate-then-index loop
    return [str(entry[0]) for entry in keywords]
def make_vocab(data_file):
    """Build the vocab and keyword files from the tokenized CNN/DailyMail
    stories listed in *data_file*.

    Token counts of every article+abstract go into a "vocab" file (top
    VOCAB_SIZE entries); the concatenated "word extract" text is RAKEd into
    a "keyword" file, skipping @entity placeholders.

    NOTE(review): Python-2 code (print statements; bytes replacement passed
    to re.sub).  If a story is missing from both dirs, `story_file` keeps
    its value from the previous iteration — confirm inputs always exist.
    """
    vocab_counter = collections.Counter()
    extract = ""
    data_list = read_text_file(data_file)
    vocab_counter = collections.Counter()  # re-initialized; the first init above is redundant
    for idx, s in enumerate(data_list):
        # Locate the tokenized story in either the CNN or the DM directory.
        if os.path.isfile(os.path.join(cnn_tokenized_stories_dir, s)):
            story_file = os.path.join(cnn_tokenized_stories_dir, s)
        elif os.path.isfile(os.path.join(dm_tokenized_stories_dir, s)):
            story_file = os.path.join(dm_tokenized_stories_dir, s)
        article_lines, abstract, word_extract_lines = get_art_abs(story_file)
        article = ' '.join([line for line in article_lines])
        word_extract = ' '.join([line for line in word_extract_lines])
        # article=SYB_RE.sub(r' ',article)
        article = _DIGIT_RE.sub(b" 0", article)  # normalize digit runs to "0"
        # word_extract=SYB_RE.sub(r' ',word_extract)
        word_extract = _DIGIT_RE.sub(b" 0 ", word_extract)
        # Accumulate all extract text for the RAKE pass after the loop.
        extract = extract + ' ' + word_extract
        art_tokens = article.split(' ')
        abs_tokens = abstract.split(' ')
        tokens = art_tokens + abs_tokens
        tokens = [t.strip() for t in tokens]  # strip
        tokens = [t for t in tokens if t != ""]  # remove empty
        vocab_counter.update(tokens)
    print "Writing vocab file..."
    with open(os.path.join(finished_files_dir, "vocab"), 'w') as writer:
        for word, count in vocab_counter.most_common(VOCAB_SIZE):
            writer.write(word + ' ' + str(count) + '\n')
    print "Finished writing vocab file"
    # RAKE over the accumulated extract: single words, min length 1, freq 1
    rake_object = rake.Rake("RAKE-tutorial/SmartStoplist.txt", 1, 1, 1)
    rake_keywords = rake_object.run(extract)
    # Drop @entity placeholders, cap at VOCAB_SIZE keywords.
    keys = [k[0] for k in rake_keywords if not k[0].startswith('@entity')]
    keys = keys[:VOCAB_SIZE]
    print "Writing keyword file..."
    with open(os.path.join(finished_files_dir, "keyword"), 'w') as writer:
        for word in keys:
            # if word in set(vocab_counter.elements()):
            writer.write(word + '\n')
    print "Finished writing vocab file"
def findKeywords(company_news):
    """Return the top-10 RAKE keyword strings for *company_news*.

    NOTE: the original also read an unrelated sample document
    (fao_test/w2167e.txt) and re-derived candidate scores by hand, but every
    one of those results was overwritten before use — that dead code (and
    the stray hard-coded file read) has been removed.
    """
    stoppath = "../Data/data/stoplists/SmartStoplist.txt"
    rake_obj = rake.Rake(stoppath)
    # Top 10 scored phrases; keep just the keyword strings.
    keyword_list = rake_obj.run(company_news)[0:10]
    return [item[0] for item in keyword_list]
def generateAnswerKeywords(correctAnswer):
    """Split every RAKE key phrase of the correct answer into its individual
    words and return them as one flat list."""
    extractor = rake.Rake("RAKE-tutorial\SmartStoplist.txt", 3, 3, 1)
    scored_phrases = extractor.run(correctAnswer)
    answerKeywords = []
    # Flatten each phrase into its words, preserving phrase order.
    for entry in scored_phrases:
        answerKeywords.extend(entry[0].split())
    return answerKeywords
def rake_call(final_string, page_no):
    """Run RAKE over *final_string* with a minimum keyword frequency that
    grows with the page count, then return the post-processed keywords."""
    min_chars = 5
    max_words = 5
    # Frequency thresholds by document size: bigger documents demand more
    # repetitions before a phrase counts as a keyword.
    for page_limit, freq in ((10, 2), (30, 3), (100, 4)):
        if page_no <= page_limit:
            min_freq = freq
            break
    else:
        min_freq = 8
    extractor = rake.Rake(stoppath, min_chars, max_words, min_freq)
    keywords = extractor.run(final_string)
    return post_process(keywords)
def get_keyphrases():
    """Flask view: extract RAKE key phrases from one column of an uploaded
    Excel survey, optionally grouped by another column, and render
    keyphrase_result.html.

    Form fields: name (file), question (text column), group_by (optional
    grouping column), the RAKE length/frequency parameters, trade_off and
    top_n.
    """
    stoppath = 'SmartStoplist.txt'
    filename = request.form['name']
    surveys = pd.read_excel(filename, header=0)
    col_name = request.form['question']
    group_by = request.form['group_by']
    min_char_length = int(request.form['min_char_length'])
    min_words_length = int(request.form['min_words_length'])
    max_words_length = int(request.form['max_words_length'])
    min_keyword_frequency = int(request.form['min_keyword_frequency'])
    trade_off = float(request.form['trade_off'])
    top_n = int(request.form['top_n'])
    rake_object = rake.Rake(stoppath, min_char_length, min_words_length,
                            max_words_length, min_keyword_frequency, 1, 3, 2)
    grouped_results = []
    if group_by != '':
        # One result tuple per distinct value of the grouping column.
        for group in surveys[group_by].unique():
            surveys_subset = surveys.loc[surveys[group_by] == group]
            col = surveys_subset[col_name]
            # join() replaces the original quadratic string `+=` loop;
            # the leading " " per answer matches the original output.
            text = ''.join(" " + col[i] for i in col.index)
            keywords_score, keywords_counts, stem_counts = rake_object.run(
                text, trade_off, top_n)
            grouped_results.append(
                (group, keywords_score, keywords_counts, stem_counts))
    else:
        col = surveys[col_name]
        text = ''.join(" " + col[i] for i in col.index)
        keywords_score, keywords_counts, stem_counts = rake_object.run(
            text, trade_off, top_n)
        grouped_results.append(
            ('Results:', keywords_score, keywords_counts, stem_counts))
    return render_template("keyphrase_result.html",
                           **{'context': grouped_results})