def get_keywords(art_sents, abs_sents):
    """Extract keywords from an article and its abstract.

    Runs three extractors (RAKE, TF-IDF, TextRank) over the article body
    (all sentences after the first) and over the abstract.

    :param art_sents: list of article sentences; art_sents[0] is skipped
        (presumably a headline -- TODO confirm with callers).
    :param abs_sents: list of abstract/summary sentences.
    :return: 6-tuple (rake_art, rake_abs, tfidf_art, tfidf_abs,
        textrank_art, textrank_abs).
    """
    keywords = {}
    art = ' '.join(art_sents[1:])
    abss = ' '.join(abs_sents)

    # 3: RAKE keywords for each doc
    rake = RAKE.Rake(RAKE_STOPLIST, min_char_length=2, max_words_length=5)
    keywords["rake_art"] = rake.run(art)
    keywords["rake_abs"] = rake.run(abss)

    # 4: TF-IDF keywords. Each text is treated as a one-document corpus,
    # so "document frequency" is per-word presence within that text.
    art_frequencies = {}
    abs_frequencies = {}
    document_count = 1
    keywords["tfidf_freq_art"] = tfidf.get_word_frequencies(art)
    for word in keywords["tfidf_freq_art"]:
        art_frequencies.setdefault(word, 0)
        art_frequencies[word] += 1
    keywords["tfidf_freq_abs"] = tfidf.get_word_frequencies(abss)
    for word in keywords["tfidf_freq_abs"]:
        abs_frequencies.setdefault(word, 0)
        abs_frequencies[word] += 1

    # Score each word, then sort entries by score, highest first.
    sortby = lambda x: x[1]["score"]
    for word in keywords["tfidf_freq_art"].items():
        word_frequency = word[1]["frequency"]
        docs_with_word = art_frequencies[word[0]]
        word[1]["score"] = tfidf.calculate(word_frequency, document_count,
                                           docs_with_word)
    for word in keywords["tfidf_freq_abs"].items():
        word_frequency = word[1]["frequency"]
        docs_with_word = abs_frequencies[word[0]]
        word[1]["score"] = tfidf.calculate(word_frequency, document_count,
                                           docs_with_word)
    keywords["tfidf_art"] = sorted(keywords["tfidf_freq_art"].items(),
                                   key=sortby, reverse=True)
    keywords["tfidf_abs"] = sorted(keywords["tfidf_freq_abs"].items(),
                                   key=sortby, reverse=True)

    # 5: TextRank
    keywords['textrank_art'] = textrank.extractKeyphrases(art)
    keywords['textrank_abs'] = textrank.extractKeyphrases(abss)

    # BUG FIX: the original returned 'tfidf_art' and 'textrank_art' twice
    # and never returned the computed 'tfidf_abs' / 'textrank_abs'.
    return (keywords['rake_art'], keywords['rake_abs'],
            keywords['tfidf_art'], keywords['tfidf_abs'],
            keywords['textrank_art'], keywords['textrank_abs'])
def textRank():
    """Calculate TextRank keywords for today's pastebin.com pastes stored
    in MongoDB and push them to Elasticsearch.

    Exits the process (status 0) when there is no data for today.
    """
    loadConfig()
    client = MongoClient(mongo_host, mongo_port)
    elk = Elastic(elk_host, elk_port)
    db = client['pastebin']
    pastebin = db.pastebin

    # Midnight of the current day: lower bound of the timestamp query.
    now = datetime.datetime.now()
    date = now.strftime("%Y-%m-%d")
    startdate = datetime.datetime.strptime(date, '%Y-%m-%d')
    print(startdate)

    cursor = pastebin.find({"@timestamp": {"$gt": startdate}})
    # Hoisted: the original called cursor.count() twice.
    article_count = cursor.count()
    print("article count: %d" % article_count)
    if article_count == 0:
        print('Today %s has no data to analysis.' % (date))
        sys.exit(0)
    article = cursor[:]
    for row in article:
        try:
            text = row['text']
            keyphrases = extractKeyphrases(text, keyWordNum)
            jbody = {"doc": {"tfidf": keyphrases}}
            elk.update2elk(elk_index, elk_type, row['pid'], jbody)
        except Exception as exc:
            # BUG FIX: the original bare `except:` also swallowed
            # KeyboardInterrupt/SystemExit and hid the actual error.
            print("pastebin textRank error: %s" % exc)
            continue
    print("\nComplete!")
def nouse():
    # NOTE(review): apparently dead code (the name says "no use"). It reads
    # `processed_pages` and `start_time`, which are not defined in this
    # scope -- calling it as-is raises NameError unless module globals with
    # those names exist. Confirm before deleting. The body duplicates the
    # RAKE / TF-IDF / TextRank pipeline used elsewhere in this file.

    #3: RAKE keywords for each page
    print("=== 3. RAKE")
    rake = RAKE.Rake(RAKE_STOPLIST, min_char_length=2, max_words_length=5)
    for page in processed_pages:
        page["rake_results"] = rake.run(page["text"])
    print("RAKE: %d" % (time.time() - start_time))

    #4: TF-IDF keywords for processed text
    print("=== 4. TF-IDF")
    # Document frequency of each word across all pages (for IDF).
    document_frequencies = {}
    document_count = len(processed_pages)
    for page in processed_pages:
        page["tfidf_frequencies"] = tfidf.get_word_frequencies(page["text"])
        for word in page["tfidf_frequencies"]:
            document_frequencies.setdefault(word, 0)
            document_frequencies[word] += 1
    # Sort key: the per-word TF-IDF score computed below.
    sortby = lambda x: x[1]["score"]
    for page in processed_pages:
        for word in page["tfidf_frequencies"].items():
            word_frequency = word[1]["frequency"]
            docs_with_word = document_frequencies[word[0]]
            word[1]["score"] = tfidf.calculate(word_frequency, document_count,
                                               docs_with_word)
        page["tfidf_results"] = sorted(page["tfidf_frequencies"].items(),
                                       key=sortby, reverse=True)
    print("TF-IDF: %d" % (time.time() - start_time))

    #5. TextRank
    print("=== 5. TextRank")
    for page in processed_pages:
        textrank_results = textrank.extractKeyphrases(page["text"])
        page["textrank_results"] = sorted(textrank_results.items(),
                                          key=lambda x: x[1], reverse=True)
    print("TextRank: %d" % (time.time() - start_time))

    #6. Results -- print the top 5 phrases from each extractor per page.
    print("=== 6. Results")
    for page in processed_pages:
        print("-------------------------")
        print("URL: %s" % page["url"])
        print("RAKE:")
        for result in page["rake_results"][:5]:
            print(" * %s" % result[0])
        print("TF-IDF:")
        for result in page["tfidf_results"][:5]:
            print(" * %s" % result[0])
        print("TextRank:")
        for result in page["textrank_results"][:5]:
            print(" * %s" % result[0])
    end_time = time.time() - start_time
    print('Done. Elapsed: %d' % end_time)
def textrank_kw(data, field='description'):
    """Attach TextRank keywords to every record in *data*.

    For each dict ``d`` in *data*, extracts keyphrases from ``d[field]``,
    upper-cases and de-duplicates them, and stores the result under the
    key ``'<field>_textrank_kw'``. Mutates the dicts in place.

    :param data: iterable of dicts carrying a text field.
    :param field: name of the text field to analyse.
    :return: *data* (same object, mutated).
    """
    print('Extracting textrank keywords')
    kw_field = '%s_textrank_kw' % field
    for i, d in enumerate(data):
        if i and i % 1000 == 0:
            # Progress marker every 1000 records.
            print('\t', i)
        keywords = extractKeyphrases(d[field])
        # BUG FIX: `map(string.upper, ...)` relies on the Python-2-only
        # `string.upper` function (removed in Python 3); the rest of this
        # file uses Python-3 print() calls. str.upper works on both.
        keywords = [k.upper() for k in keywords]
        d[kw_field] = list(set(keywords))
    return data
def textrank_analysis(peak_tweets, orig_tag=''):
    """Pick the single best TextRank keyword for each peak.

    Each element of *peak_tweets* is a list of tweets belonging to one
    activity peak. The tweets of a peak are joined with '.', cleaned via
    clean_tweet, and scored with TextRank; the highest-scoring keyphrase
    of every peak is collected.

    :param peak_tweets: list of lists of tweet strings, one sublist per peak.
    :param orig_tag: tag passed through to clean_tweet.
    :return: list with one top keyword per peak.
    """
    top_keywords = []
    for tweets in peak_tweets:
        cleaned = clean_tweet('.'.join(tweets), orig_tag)
        scores = textrank.extractKeyphrases(cleaned)
        # Highest-scoring phrase; on ties the earliest entry wins, which
        # matches taking element 0 of a stable reverse sort.
        best_phrase, _ = max(scores.items(), key=lambda kv: kv[1])
        top_keywords.append(best_phrase)
    return top_keywords
def parseHtml(iters, category, match, i):
    """Parse one allsingaporestuff.com article teaser and store it.

    Extracts the article link from *match*, downloads the article page,
    reads title/date/content from its metadata tags, builds a
    stopword-filtered keyword string via TextRank, and inserts the row
    when all mandatory fields are present.

    :param iters: page iteration number (used in logging and the row).
    :param category: category label stored with the row.
    :param match: BeautifulSoup element for one article teaser.
    :param i: index of the link within the page (used in logging).
    """
    matchlink = ''
    title = ''
    date = ''
    content = ''
    keyword = ''
    try:
        matchlink = match.find(name='a').get('href')
        link = 'https://www.allsingaporestuff.com' + matchlink
    except Exception:
        # find() returned None (no anchor) -- report and skip this entry.
        # (Narrowed from a bare `except:` which also caught
        # KeyboardInterrupt/SystemExit.)
        print(BackColors.WARNING + 'The ' + str(iters) + 'th Page ' + str(i) +
              'th link Error' + BackColors.ENDC)
        link = ''
    if matchlink:
        contentHtml = requests.get(url=link,
                                   headers=RequestHeader.browserHeader)
        contentSoup = BeautifulSoup(contentHtml.text, 'lxml')
        date = contentSoup.find_all(
            name='span',
            attrs={'property': 'dc:date dc:created'})[0]['content']
        title = contentSoup.find_all(
            attrs={'property': 'og:title'})[0]['content']
        content = contentSoup.find_all(
            attrs={'name': 'twitter:description'})[0]['content']
        keywords = extractSentences(content)
        keywords = extractKeyphrases(keywords)
        lettersOnly = re.sub("[^a-zA-Z]", " ", " ".join(keywords))
        lowerCase = lettersOnly.lower()
        words = lowerCase.split()
        # BUG FIX: the stopword file was opened but never closed.
        with open(Paths.textPath + 'stopwords.txt') as fh:
            cachedstopwords = fh.read()
        stopwords = set(cachedstopwords.split('\n'))
        words = [w for w in words if w not in stopwords]
        keyword = " ".join(words)
        d = AllSingaporeStuffDateParse(date)
        data = (iters, title, link, category, keyword, d, content)
    if matchlink and keyword and title and content:
        insertData(data)
def parseHtml(iters, category, match, i):
    """Parse one mothership.sg article teaser and store it.

    Extracts the article link and post date from *match*, downloads the
    article page, reads title/content, builds a stopword-filtered keyword
    string (first 25 words) via TextRank, and inserts the row when all
    mandatory fields are present.

    :param iters: page iteration number (used in logging and the row).
    :param category: category label stored with the row.
    :param match: BeautifulSoup element for one article teaser.
    :param i: index of the link within the page (used in logging).
    """
    matchlink = ''
    title = ''
    date = ''
    content = ''
    keyword = ''
    try:
        matchlink = match.find(name='h3').find(name='a').get('href')
        link = matchlink
    except Exception:
        # find() returned None -- report and skip this entry.
        # (Narrowed from a bare `except:`.)
        print(BackColors.WARNING + 'The ' + str(iters) + 'th Page ' + str(i) +
              'th link Error' + BackColors.ENDC)
        link = ''
    try:
        # BUG FIX: attrs was the *set* {'class', 'post-date'};
        # BeautifulSoup expects a dict mapping attribute name to value.
        date = match.find(name='div',
                          attrs={'class': 'post-date'}).get_text()
    except Exception:
        date = ""
    d = MotherShipDateParse(date)
    if matchlink:
        contentHtml = requests.get(url=link,
                                   headers=RequestHeader.browserHeader)
        contentSoup = BeautifulSoup(contentHtml.text, 'lxml')
        # BUG FIX (same as above): set literals replaced with dicts.
        title = contentSoup.find(name='h1',
                                 attrs={'class': 'content-title'}).get_text()
        content = contentSoup.find(name='div',
                                   attrs={'class': 'post-content'}).get_text()
        keywords = extractSentences(content)
        keywords = extractKeyphrases(keywords)
        lettersOnly = re.sub("[^a-zA-Z]", " ", " ".join(keywords))
        lowerCase = lettersOnly.lower()
        words = lowerCase.split()
        # BUG FIX: the stopword file was opened but never closed.
        with open(Paths.textPath + 'stopwords.txt') as fh:
            cachedstopwords = fh.read()
        stopwords = set(cachedstopwords.split('\n'))
        words = [w for w in words if w not in stopwords]
        keyword = " ".join(words[:25])
        data = (iters, title, link, category, keyword, d, lettersOnly)
    if matchlink and keyword and title and content:
        insertData(data)
def watching_stories(domain_list):
    """Watch stories published by competitor domains and persist them.

    For each domain, fetches the latest articles from the NewsWhip
    publisher API, enriches every article with data scraped from the
    article page itself (``domparser.element_picker``), resolves
    categories/interests against the database, generates TextRank key
    phrases from title and description, and saves the result through the
    story/fetch services.

    :param domain_list: targeted competitors domain names
    :return: None
    """
    fetch = fetch_time_line()
    db_category_list = __category_service.find_all_categories()
    db_interest_list = __interest_service.find_all_interests()
    for domain in domain_list:
        r = requests.get("https://api.newswhip.com/v1/publisher/" + domain +
                         "/1?key=" + newswhip_key)
        response = json.loads(r.text)
        logging.info("Domain: " + domain + " & No of Articles: " +
                     str(len(response['articles'])))
        for item in response['articles']:
            try:
                article_info = domparser.element_picker(
                    item['link'].encode('utf-8'))
                # Keep only articles where the scraper found at least one
                # of title / feature image / url.
                if article_info['title'] is not None or article_info[
                        'feature_image'] is not None or article_info[
                            'url'] is not None:
                    article = {
                        'title': '', 'url': '', 'description': '',
                        'keywords': '', 'feature_image': '', 'New_score': '',
                        'max_new_score': '', 'fb_like': '', 'tweet_count': '',
                        'publisher': '', "uuid": '', 'published': '',
                        'category': [], 'interest': [], 'fetch': '',
                        'created_keys': []
                    }
                    # Prefer API-provided fields; fall back to scraped ones.
                    if item['headline'] is None:
                        article['title'] = article_info['title']
                    else:
                        article['title'] = item['headline'].encode('utf-8')
                    if item['link'] is None:
                        article['url'] = article_info['url']
                    else:
                        article['url'] = item['link'].encode('utf-8')
                    if item['excerpt'] is None:
                        article['description'] = article_info['description']
                    else:
                        article['description'] = item['excerpt']
                    if item['keywords'] is None:
                        article['keywords'] = article_info['keywords']
                    else:
                        article['keywords'] = (item['keywords']).split(',')
                    if item['image_link'] is None:
                        article['feature_image'] = article_info[
                            'feature_image']
                    else:
                        article['feature_image'] = item['image_link']
                    # BUG FIX: the guards checked 'new_score'/'max_new_score'
                    # but read item['nw_score']/item['max_nw_score'] (the
                    # keys actually used), risking KeyError or always
                    # falling through to 0. Check the keys that are read.
                    if 'nw_score' in item:
                        article['New_score'] = item['nw_score']
                    else:
                        article['New_score'] = 0
                    if 'max_nw_score' in item:
                        article['max_new_score'] = item['max_nw_score']
                    else:
                        article['max_new_score'] = 0
                    if 'total_engagement_count' in item['fb_data']:
                        article['fb_like'] = item['fb_data'][
                            'total_engagement_count']
                    else:
                        article['fb_like'] = 0
                    if 'tw_count' in item['tw_data']:
                        article['tweet_count'] = item['tw_data']['tw_count']
                    else:
                        # BUG FIX: originally wrote 0 back into the API
                        # payload (item['tw_data']['tw_count'] = 0) and left
                        # article['tweet_count'] as the empty string.
                        article['tweet_count'] = 0
                    if 'publisher' in item['source']:
                        article['publisher'] = item['source']['publisher']
                    else:
                        article['publisher'] = "None"
                    if 'uuid' in item:
                        article['uuid'] = item['uuid']
                    else:
                        article['uuid'] = 'None'
                    if 'publication_timestamp' in item:
                        # Timestamp arrives in milliseconds.
                        article['published'] = time.strftime(
                            '%Y-%m-%d %H:%M',
                            time.localtime(
                                item['publication_timestamp'] / 1000.0))
                    else:
                        article['published'] = "None"
                    article['fetch'] = current_epoch_time(datetime.now())
                    # Normalize scraped categories: split comma-joined
                    # entries, lower-case, de-duplicate (order preserved).
                    dummy_category = []
                    for cat_entry in article_info['category']:
                        for itr in cat_entry.split(','):
                            if itr not in dummy_category:
                                dummy_category.append(itr.lower())
                    article_info['category'] = dummy_category
                    if not any(category['category'] in
                               article_info['category']
                               for category in db_category_list):
                        # None of the scraped categories is a known DB
                        # category: try matching them as interests instead.
                        for category_item in article_info['category']:
                            for interest in db_interest_list:
                                if category_item == interest['interest']:
                                    if category_item not in article[
                                            'interest']:
                                        article['interest'].append(
                                            category_item)
                        if len(article['interest']) <= 0:
                            article['category'] = article_info['category']
                        else:
                            # Map each matched interest back to its parent
                            # category.
                            article['category'] = []
                            for int_item in article['interest']:
                                current_interest = filter(
                                    lambda member: int_item == member[
                                        'interest'], db_interest_list)
                                if len(current_interest) == 1:
                                    current_category = filter(
                                        lambda member: current_interest[0][
                                            'category_id'] == member['_id'],
                                        db_category_list)
                                    if len(current_category) == 1:
                                        article['category'].append(
                                            current_category[0]['category'])
                    else:
                        if article['keywords'] is not None:
                            (article['interest'],
                             return_category_ids) = checking_interest(
                                 article['keywords'])
                        article['category'] = article_info['category']
                    # Build key phrases from scraped keywords, title and
                    # description.
                    key_phrases_list = []
                    raw_key_phrases_list = []
                    interest_category_id = []
                    if article_info['keywords']:
                        keywords_key_phrases = (''.join(
                            map(str, ((article_info['keywords'][0]).decode(
                                'ascii', 'ignore')).lower()))).split(",")
                        key_phrases_list += keywords_key_phrases
                        raw_key_phrases_list += keywords_key_phrases
                    if article_info['title']:
                        title_key_phrases = extractKeyphrases(
                            article_info['title'].decode('ascii', 'ignore'))
                        key_phrases_list += list(title_key_phrases)
                        raw_key_phrases_list.append(
                            str(article_info['title'].decode(
                                'ascii', 'ignore')))
                    if article_info['description']:
                        description_key_phrases = extractKeyphrases(
                            article_info['description'].decode(
                                'ascii', 'ignore'))
                        key_phrases_list += list(description_key_phrases)
                        raw_key_phrases_list.append(
                            str(article_info['description'].decode(
                                'ascii', 'ignore')))
                    d = Counter(key_phrases_list)
                    # Noise words / publisher boilerplate to drop.
                    keys_to_remove = [
                        '', ' ', '%', 'an', 'a', ',', 'ii', 'r', 'so', 'is',
                        'in', 'the', 'nbt', 'us', 'them', 's', '|',
                        'eisamay', 'navbharat', '-navbharat',
                        'navbharat times', 'samay', 'india'
                    ]
                    refactor_key_list = []
                    for key in list(d.keys()):
                        if (key.strip()).lower() not in keys_to_remove and (
                                key.strip()).lower() not in refactor_key_list:
                            refactor_key_list.append((key.strip()).lower())
                    article['created_keys'] = refactor_key_list
                    if article['created_keys'] is not None:
                        (created_interest, interest_category_id
                         ) = checking_interest(raw_key_phrases_list)
                        if created_interest is not None:
                            article['interest'] += created_interest
                        if interest_category_id is not None:
                            # Pick the most frequent category id among the
                            # matched interests (ties: last wins, as in the
                            # original >= comparison).
                            # BUG FIX: `cat_dict.keys()[index]` is
                            # Python-2-only (dict_keys is not subscriptable
                            # on Python 3); iterate items() instead.
                            cat_dict = Counter(interest_category_id)
                            top_order_category = ''
                            top = 0
                            for cat_key, cat_count in cat_dict.items():
                                if cat_count >= top:
                                    top_order_category = cat_key
                                    top = cat_count
                            if top_order_category:
                                supposed_category = \
                                    __category_service.find_category(
                                        top_order_category)
                                article['category'].append(
                                    supposed_category['category'])
                    if article['interest']:
                        article['status'] = True
                    else:
                        article['status'] = False
                    __story_service.save_story(article)
                    __fetch_service.save_fetch(fetch)
            except Exception as ex:
                # BUG FIX: `"Runtime Error: " + ex` raised TypeError inside
                # the handler (str + Exception); use lazy %-formatting.
                logging.info("Runtime Error: %s", ex)
def textrank_parallel(text):
    """Extract TextRank keyphrases from *text*, upper-cased and de-duplicated.

    Suitable as a worker function for a parallel map: takes a single raw
    text, returns a list of unique upper-case keywords (order unspecified,
    as with the original set-based de-duplication).
    """
    keywords = extractKeyphrases(text)
    # BUG FIX: `map(string.upper, ...)` relies on the Python-2-only
    # `string.upper` function (removed in Python 3); str.upper works on both.
    return list({k.upper() for k in keywords})
def execute(cleanse_method, pages):
    """Execute RAKE and TF-IDF algorithms on each page and output top scoring phrases"""
    # Wall-clock reference for the per-stage timing prints below.
    start_time = time.time()

    #1: Initialize a URL reader with local caching to be kind to the internet
    print("=== 1. Initialize")
    reader = contentloader.CacheableReader(CACHE_FOLDER, cleanse_method)
    print("Initialized: %d" % (time.time() - start_time))

    #2: Collect raw text for pages
    print("=== 2. Collect Raw Text")
    processed_pages = []
    for page in pages:
        page_text = reader.get_site_text(page)
        processed_pages.append({"url": page, "text": page_text})
    print("Collected: %d" % (time.time() - start_time))

    #3: RAKE keywords for each page
    print("=== 3. RAKE")
    rake = RAKE.Rake(RAKE_STOPLIST, min_char_length=2, max_words_length=5)
    for page in processed_pages:
        page["rake_results"] = rake.run(page["text"])
    print("RAKE: %d" % (time.time() - start_time))

    #4: TF-IDF keywords for processed text
    print("=== 4. TF-IDF")
    # Document frequency of each word across all pages (for IDF).
    document_frequencies = {}
    document_count = len(processed_pages)
    for page in processed_pages:
        page["tfidf_frequencies"] = tfidf.get_word_frequencies(page["text"])
        for word in page["tfidf_frequencies"]:
            document_frequencies.setdefault(word, 0)
            document_frequencies[word] += 1
    # Sort key: the per-word TF-IDF score computed in the loop below.
    sortby = lambda x: x[1]["score"]
    for page in processed_pages:
        for word in page["tfidf_frequencies"].items():
            word_frequency = word[1]["frequency"]
            docs_with_word = document_frequencies[word[0]]
            word[1]["score"] = tfidf.calculate(word_frequency, document_count,
                                               docs_with_word)
        page["tfidf_results"] = sorted(page["tfidf_frequencies"].items(),
                                       key=sortby, reverse=True)
    print("TF-IDF: %d" % (time.time() - start_time))

    #5. TextRank
    print("=== 5. TextRank")
    for page in processed_pages:
        textrank_results = textrank.extractKeyphrases(page["text"])
        page["textrank_results"] = sorted(textrank_results.items(),
                                          key=lambda x: x[1], reverse=True)
    print("TextRank: %d" % (time.time() - start_time))

    #6. Results -- print the top 5 phrases from each extractor per page.
    print("=== 6. Results")
    for page in processed_pages:
        print("-------------------------")
        print("URL: %s" % page["url"])
        print("RAKE:")
        for result in page["rake_results"][:5]:
            print(" * %s" % result[0])
        print("TF-IDF:")
        for result in page["tfidf_results"][:5]:
            print(" * %s" % result[0])
        print("TextRank:")
        for result in page["textrank_results"][:5]:
            print(" * %s" % result[0])
    end_time = time.time() - start_time
    print('Done. Elapsed: %d' % end_time)
def watching_stories(domain_list):
    """Watch stories published by competitor domains and persist them.

    For each domain, fetches the latest articles from the NewsWhip
    publisher API, enriches every article with data scraped from the
    article page itself (``domparser.element_picker``), resolves
    categories/interests against the database, generates TextRank key
    phrases from title and description, and saves the result through the
    story/fetch services.

    :param domain_list: targeted competitors domain names
    :return: None
    """
    fetch = fetch_time_line()
    db_category_list = __category_service.find_all_categories()
    db_interest_list = __interest_service.find_all_interests()
    for domain in domain_list:
        r = requests.get("https://api.newswhip.com/v1/publisher/" + domain +
                         "/1?key=" + newswhip_key)
        response = json.loads(r.text)
        logging.info("Domain: " + domain + " & No of Articles: " +
                     str(len(response['articles'])))
        for item in response['articles']:
            try:
                article_info = domparser.element_picker(
                    item['link'].encode('utf-8'))
                # Keep only articles where the scraper found at least one
                # of title / feature image / url.
                if article_info['title'] is not None or article_info[
                        'feature_image'] is not None or article_info[
                            'url'] is not None:
                    article = {
                        'title': '', 'url': '', 'description': '',
                        'keywords': '', 'feature_image': '', 'New_score': '',
                        'max_new_score': '', 'fb_like': '', 'tweet_count': '',
                        'publisher': '', "uuid": '', 'published': '',
                        'category': [], 'interest': [], 'fetch': '',
                        'created_keys': []
                    }
                    # Prefer API-provided fields; fall back to scraped ones.
                    if item['headline'] is None:
                        article['title'] = article_info['title']
                    else:
                        article['title'] = item['headline'].encode('utf-8')
                    if item['link'] is None:
                        article['url'] = article_info['url']
                    else:
                        article['url'] = item['link'].encode('utf-8')
                    if item['excerpt'] is None:
                        article['description'] = article_info['description']
                    else:
                        article['description'] = item['excerpt']
                    if item['keywords'] is None:
                        article['keywords'] = article_info['keywords']
                    else:
                        article['keywords'] = (item['keywords']).split(',')
                    if item['image_link'] is None:
                        article['feature_image'] = article_info[
                            'feature_image']
                    else:
                        article['feature_image'] = item['image_link']
                    # BUG FIX: the guards checked 'new_score'/'max_new_score'
                    # but read item['nw_score']/item['max_nw_score'] (the
                    # keys actually used), risking KeyError or always
                    # falling through to 0. Check the keys that are read.
                    if 'nw_score' in item:
                        article['New_score'] = item['nw_score']
                    else:
                        article['New_score'] = 0
                    if 'max_nw_score' in item:
                        article['max_new_score'] = item['max_nw_score']
                    else:
                        article['max_new_score'] = 0
                    if 'total_engagement_count' in item['fb_data']:
                        article['fb_like'] = item['fb_data'][
                            'total_engagement_count']
                    else:
                        article['fb_like'] = 0
                    if 'tw_count' in item['tw_data']:
                        article['tweet_count'] = item['tw_data']['tw_count']
                    else:
                        # BUG FIX: originally wrote 0 back into the API
                        # payload (item['tw_data']['tw_count'] = 0) and left
                        # article['tweet_count'] as the empty string.
                        article['tweet_count'] = 0
                    if 'publisher' in item['source']:
                        article['publisher'] = item['source']['publisher']
                    else:
                        article['publisher'] = "None"
                    if 'uuid' in item:
                        article['uuid'] = item['uuid']
                    else:
                        article['uuid'] = 'None'
                    if 'publication_timestamp' in item:
                        # Timestamp arrives in milliseconds.
                        article['published'] = time.strftime(
                            '%Y-%m-%d %H:%M',
                            time.localtime(
                                item['publication_timestamp'] / 1000.0))
                    else:
                        article['published'] = "None"
                    article['fetch'] = current_epoch_time(datetime.now())
                    # Normalize scraped categories: split comma-joined
                    # entries, lower-case, de-duplicate (order preserved).
                    dummy_category = []
                    for cat_entry in article_info['category']:
                        for itr in cat_entry.split(','):
                            if itr not in dummy_category:
                                dummy_category.append(itr.lower())
                    article_info['category'] = dummy_category
                    if not any(category['category'] in
                               article_info['category']
                               for category in db_category_list):
                        # None of the scraped categories is a known DB
                        # category: try matching them as interests instead.
                        for category_item in article_info['category']:
                            for interest in db_interest_list:
                                if category_item == interest['interest']:
                                    if category_item not in article[
                                            'interest']:
                                        article['interest'].append(
                                            category_item)
                        if len(article['interest']) <= 0:
                            article['category'] = article_info['category']
                        else:
                            # Map each matched interest back to its parent
                            # category.
                            article['category'] = []
                            for int_item in article['interest']:
                                current_interest = filter(
                                    lambda member: int_item == member[
                                        'interest'], db_interest_list)
                                if len(current_interest) == 1:
                                    current_category = filter(
                                        lambda member: current_interest[0][
                                            'category_id'] == member['_id'],
                                        db_category_list)
                                    if len(current_category) == 1:
                                        article['category'].append(
                                            current_category[0]['category'])
                    else:
                        if article['keywords'] is not None:
                            (article['interest'],
                             return_category_ids) = checking_interest(
                                 article['keywords'])
                        article['category'] = article_info['category']
                    # Build key phrases from scraped keywords, title and
                    # description.
                    key_phrases_list = []
                    raw_key_phrases_list = []
                    interest_category_id = []
                    if article_info['keywords']:
                        keywords_key_phrases = (''.join(
                            map(str, ((article_info['keywords'][0]).decode(
                                'ascii', 'ignore')).lower()))).split(",")
                        key_phrases_list += keywords_key_phrases
                        raw_key_phrases_list += keywords_key_phrases
                    if article_info['title']:
                        title_key_phrases = extractKeyphrases(
                            article_info['title'].decode('ascii', 'ignore'))
                        key_phrases_list += list(title_key_phrases)
                        raw_key_phrases_list.append(
                            str(article_info['title'].decode(
                                'ascii', 'ignore')))
                    if article_info['description']:
                        description_key_phrases = extractKeyphrases(
                            article_info['description'].decode(
                                'ascii', 'ignore'))
                        key_phrases_list += list(description_key_phrases)
                        raw_key_phrases_list.append(
                            str(article_info['description'].decode(
                                'ascii', 'ignore')))
                    d = Counter(key_phrases_list)
                    # Noise words / publisher boilerplate to drop.
                    keys_to_remove = [
                        '', ' ', '%', 'an', 'a', ',', 'ii', 'r', 'so', 'is',
                        'in', 'the', 'nbt', 'us', 'them', 's', '|',
                        'eisamay', 'navbharat', '-navbharat',
                        'navbharat times', 'samay', 'india'
                    ]
                    refactor_key_list = []
                    for key in list(d.keys()):
                        if (key.strip()).lower() not in keys_to_remove and (
                                key.strip()).lower() not in refactor_key_list:
                            refactor_key_list.append((key.strip()).lower())
                    article['created_keys'] = refactor_key_list
                    if article['created_keys'] is not None:
                        (created_interest, interest_category_id
                         ) = checking_interest(raw_key_phrases_list)
                        if created_interest is not None:
                            article['interest'] += created_interest
                        if interest_category_id is not None:
                            # Pick the most frequent category id among the
                            # matched interests (ties: last wins, as in the
                            # original >= comparison).
                            # BUG FIX: `cat_dict.keys()[index]` is
                            # Python-2-only (dict_keys is not subscriptable
                            # on Python 3); iterate items() instead.
                            cat_dict = Counter(interest_category_id)
                            top_order_category = ''
                            top = 0
                            for cat_key, cat_count in cat_dict.items():
                                if cat_count >= top:
                                    top_order_category = cat_key
                                    top = cat_count
                            if top_order_category:
                                supposed_category = \
                                    __category_service.find_category(
                                        top_order_category)
                                article['category'].append(
                                    supposed_category['category'])
                    if article['interest']:
                        article['status'] = True
                    else:
                        article['status'] = False
                    __story_service.save_story(article)
                    __fetch_service.save_fetch(fetch)
            except Exception as ex:
                # BUG FIX: `"Runtime Error: " + ex` raised TypeError inside
                # the handler (str + Exception); use lazy %-formatting.
                logging.info("Runtime Error: %s", ex)
#构建图计算 textrank nx_graph = nx.from_scipy_sparse_matrix(similarity_graph) scores = nx.pagerank(nx_graph) return sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True) if __name__ == '__main__': document = """To Sherlock Holmes she is always the woman. I have seldom heard him mention her under any other name. In his eyes she eclipses and predominates the whole of her sex. It was not that he felt any emotion akin to love for Irene Adler. All emotions, and that one particularly, were abhorrent to his cold, precise but admirably balanced mind. He was, I take it, the most perfect reasoning and observing machine that the world has seen, but as a lover he would have placed himself in a false position. He never spoke of the softer passions, save with a gibe and a sneer. They were admirable things for the observer-excellent for drawing the veil from men’s motives and actions. But for the trained reasoner to admit such intrusions into his own delicate and finely adjusted temperament was to introduce a distracting factor which might throw a doubt upon all his mental results. Grit in a sensitive instrument, or a crack in one of his own high-power lenses, would not be more disturbing than a strong emotion in a nature such as his. And yet there was but one woman to him, and that woman was the late Irene Adler, of dubious and questionable memory. """ TextRank(document) print(textrank.extractKeyphrases(document))
def execute(pages):
    """Execute RAKE, TF-IDF and TextRank on a local text file and print
    the top scoring phrases with their scores.

    :param pages: list whose first element is the path of the text file to
        analyse. NOTE(review): one processed entry is appended per element
        of `pages`, but every entry uses pages[0] and the same text, so all
        entries are identical duplicates -- preserved as-is, confirm intent.
    """
    start_time = time.time()

    #2: Collect raw text from file
    print("=== 2. Collect Raw Text from file")
    # BUG FIX: the file was opened but never closed; use a context manager.
    # join() also avoids the quadratic `text += line` pattern.
    with open(pages[0], "r") as f:
        text = "".join(line.lower() for line in f)
    processed_pages = []
    for page in pages:
        processed_pages.append({"url": pages[0], "text": text})
    print("Collected: %d" % (time.time() - start_time))

    #3: RAKE keywords (max_words_length=1: single-word keywords only)
    print("=== 3. RAKE")
    rake = RAKE.Rake(RAKE_STOPLIST, min_char_length=2, max_words_length=1)
    for page in processed_pages:
        page["rake_results"] = rake.run(page["text"])
    print("RAKE: %d" % (time.time() - start_time))

    #4: TF-IDF keywords for processed text
    print("=== 4. TF-IDF")
    # Document frequency of each word across all entries (for IDF).
    document_frequencies = {}
    document_count = len(processed_pages)
    for page in processed_pages:
        page["tfidf_frequencies"] = tfidf.get_word_frequencies(page["text"])
        for word in page["tfidf_frequencies"]:
            document_frequencies.setdefault(word, 0)
            document_frequencies[word] += 1
    # Sort key: the per-word TF-IDF score computed in the loop below.
    sortby = lambda x: x[1]["score"]
    for page in processed_pages:
        for word in page["tfidf_frequencies"].items():
            word_frequency = word[1]["frequency"]
            docs_with_word = document_frequencies[word[0]]
            word[1]["score"] = tfidf.calculate(word_frequency, document_count,
                                               docs_with_word)
        page["tfidf_results"] = sorted(page["tfidf_frequencies"].items(),
                                       key=sortby, reverse=True)
    print("TF-IDF: %d" % (time.time() - start_time))

    #5. TextRank
    print("=== 5. TextRank")
    for page in processed_pages:
        textrank_results = textrank.extractKeyphrases(page["text"])
        page["textrank_results"] = sorted(textrank_results.items(),
                                          key=lambda x: x[1], reverse=True)
    print("TextRank: %d" % (time.time() - start_time))

    #6. Results -- top 5 phrases and scores from each extractor per entry.
    print("=== 6. Results")
    for page in processed_pages:
        print("-------------------------")
        print("URL: %s" % page["url"])
        print("RAKE:")
        for result in page["rake_results"][:5]:
            print(" * %s" % result[0], result[1])
        print("TF-IDF:")
        for result in page["tfidf_results"][:5]:
            print(" * %s" % result[0], result[1])
        print("TextRank:")
        for result in page["textrank_results"][:5]:
            print(" * %s" % result[0], result[1])
    end_time = time.time() - start_time
    print('Done. Elapsed: %d' % end_time)