def get_article_info(url):
    try:
        a = Article(url, fetch_images=False)
        a.download()
        a.parse()
        # Not calling a.nlp() to be more efficient; extract keywords directly.
        text_keyws = list(nlp.keywords(a.text).keys())
        title_keyws = list(nlp.keywords(a.title).keys())
        keyws = list(set(title_keyws + text_keyws))
        if 'published_time' in a.meta_data['article']:
            published_time = a.meta_data['article']['published_time']
        else:
            published_time = ''
        return {'keywords': keyws,
                'c_link': a.canonical_link,
                'published_time': published_time,
                'title': a.title}
    except Exception:
        # If construction, download, or parsing fails, `a` may be unusable,
        # so return an empty record instead of referencing it.
        return {'keywords': [], 'c_link': '', 'published_time': '', 'title': ''}
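# A minimal usage sketch, assuming the snippet above imports Article and nlp from
# newspaper and that English stopwords should be loaded first; the URL is a placeholder.
from newspaper import Article, nlp

nlp.load_stopwords('en')  # populate newspaper's stopword set used by nlp.keywords()
info = get_article_info('https://example.com/some-story')  # illustrative URL
print(info['title'], info['keywords'][:10])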
def _new_summarize(text='', max_sents=5):
    summaries = []
    sentences = split_sentences(text)
    keys = keywords(text)
    # Score sentences, and use the top 5 or max_sents sentences
    ranks = nlp.score(sentences, keys).most_common(max_sents)
    for rank in ranks:
        summaries.append(rank[0])
    summaries.sort(key=lambda summary: summary[0])
    return [summary[1] for summary in summaries]
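# For comparison, newspaper's own nlp.summarize also folds the article title into the
# sentence scores (see the evaluate_newspaper_summary snippet further down). A hedged
# sketch of calling the stock helper directly; the title and file name are placeholders.
from newspaper import nlp

nlp.load_stopwords('en')
sample_title = 'Raptors clinch playoff spot'            # placeholder title
sample_text = open('article.txt').read()                # placeholder article body
top_sentences = nlp.summarize(title=sample_title, text=sample_text, max_sents=5)
print('\n'.join(top_sentences))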
def ProcessArticle(urlStr, domain, htmlStr, cursor):
    config = Configuration()
    extractor = ContentExtractor(config)
    clean_doc = config.get_parser().fromstring(htmlStr)
    title = extractor.get_title(clean_doc)
    authors = extractor.get_authors(clean_doc)
    text = fulltext(htmlStr)
    text_keyws = list(nlp.keywords(text).keys())
    title_keyws = list(nlp.keywords(title).keys())
    keyws = list(set(title_keyws + text_keyws))
    summary_sents = nlp.summarize(title=title, text=text,
                                  max_sents=config.MAX_SUMMARY_SENT)
    summary = '\n'.join(summary_sents)
    if len(text) == 0:
        OnArticleProcessError(urlStr)
    else:
        StoreToDatabase(urlStr, domain, title, authors, text, keyws,
                        summary, cursor)
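# A hedged calling sketch, assuming the HTML is fetched with requests and the cursor
# comes from sqlite3; the database file and URL are placeholders, and
# OnArticleProcessError / StoreToDatabase are project-specific helpers not shown above.
import sqlite3
import requests

conn = sqlite3.connect('articles.db')                   # hypothetical database file
url = 'https://example.com/some-story'                  # illustrative URL
html = requests.get(url, timeout=10).text
ProcessArticle(url, 'example.com', html, conn.cursor())
conn.commit()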
def evaluate_newspaper_summary(self, title, text, sentences, language):
    # Get newspaper's nlp scores:
    # https://github.com/codelucas/newspaper/blob/master/newspaper/article.py#L372
    nlp.load_stopwords(language)
    # Replicates the scoring inside:
    #   nlp.summarize(title=article.title, text=article.text, max_sents=max_sents)
    # https://github.com/codelucas/newspaper/blob/master/newspaper/nlp.py#L40
    title_words = nlp.split_words(title)
    most_frequent = nlp.keywords(text)
    nlp_scores = self.normalize_scores(
        nlp.score(sentences, title_words, most_frequent))
    # Return a dictionary mapping tuple<sentence index, sentence text> to score,
    # e.g. {(0, 'A new poll suggests that the Toronto Raptors...'): 0.144, ...}
    return nlp_scores
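# normalize_scores is not shown in the snippet above. One plausible sketch (an
# assumption, not the original implementation) rescales the scores so they sum to one,
# which keeps different score sets on a comparable scale before they are combined.
def normalize_scores(self, scores):
    # Rescale the scores so they sum to 1; assumed implementation for illustration.
    total = sum(scores.values())
    if total == 0:
        return dict(scores)
    return {key: value / total for key, value in scores.items()}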
filename = 'teesta_setal_input.csv'
currentdirpath = os.getcwd()
file_path = os.path.join(os.getcwd(), filename)
with open(file_path, 'r+') as reports_file:
    reader = csv.reader(reports_file, delimiter='\t')
    for row in reader:
        article = row[0]
        summary_sents = nlp.summarize(text=article)
        summary = '\n'.join(summary_sents)
        row.append(summary)
        textkeys = list(keywords(article).keys())
        row.append(textkeys)
        with open(os.path.join(os.getcwd(), 'final_tee.csv'), 'a') as newfile:
            writer = csv.writer(newfile, delimiter='\t')
            try:
                writer.writerow(row)
            except Exception:
                pass
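# The other snippets in this collection call nlp.load_stopwords before using
# nlp.keywords or nlp.summarize; a minimal sketch of doing the same before the loop
# above (the 'en' language code is an assumption).
from newspaper import nlp
from newspaper.nlp import keywords

nlp.load_stopwords('en')  # populate the stopword set used by keywords() and summarize()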
def text_keywords():
    text_to_clean = request.args.get('text_to_clean')
    keywords = nlp.keywords(text_to_clean)
    return jsonify(keywords)
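# The snippet omits the route registration; a minimal sketch of wiring it into a Flask
# app. The app object, route path, and query parameter usage below are assumptions.
from flask import Flask, request, jsonify
from newspaper import nlp

app = Flask(__name__)
nlp.load_stopwords('en')
app.add_url_rule('/keywords', 'text_keywords', text_keywords)  # assumed route path

# Example request once the app is running:
#   GET /keywords?text_to_clean=Toronto+Raptors+clinch+a+playoff+spot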
article.top_image = '' if re.search(
    'favicon', article.top_image) else article.top_image

with open('../detector/tl_stopwords.txt', 'r') as f:
    TL_STOPWORDS = f.read().splitlines()
STOP_WORDS = ENGLISH_STOP_WORDS.union(TL_STOPWORDS)

cleaned_body = ' '.join([
    word for word in body.split() if word.lower() not in STOP_WORDS
])
cleaned_title = ' '.join([
    word for word in title.split() if word.lower() not in STOP_WORDS
])

text_keyws = list(keywords(cleaned_body).keys())
title_keyws = list(keywords(cleaned_title).keys())
keyws = list(set(title_keyws + text_keyws))
summary = summarize(title=article.title, text=body, max_sents=3)

# keywords = []
# for key, value in article.keywords.items():
#     keywords.append({
#         'word': key,
#         'score': value
#     })
# keywords = sorted(
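# This fragment relies on names defined elsewhere in its project (article, body, title
# come from a parsed article). A likely set of imports, stated as an assumption:
# ENGLISH_STOP_WORDS matches the name exported by scikit-learn, and keywords/summarize
# match the helpers in newspaper.nlp.
import re

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from newspaper.nlp import keywords, summarize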
def nlp_wrapper(text):
    """Keyword extraction wrapper."""
    nlp.load_stopwords('en')
    return ' '.join(list(nlp.keywords(text).keys()))
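# A small usage sketch for the wrapper above; the sample texts are placeholders.
articles = [
    'The Toronto Raptors clinched a playoff spot after a late-season surge.',
    'A new poll suggests voter turnout will rise sharply this year.',
]
keyword_strings = [nlp_wrapper(text) for text in articles]
print(keyword_strings)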
def summarize(self, html, percent_sentences):
    if (percent_sentences is None or percent_sentences > 100
            or percent_sentences < 0):
        percent_sentences = 15

    article = self.process_html(html)

    # Remove the title from the text, if it appears at the start of the text.
    if article.text.startswith(article.title):
        article.set_text(article.text[len(article.title):])

    sentences = nlp.split_sentences(article.text)
    log.debug(article.text)

    # Remove punctuation, numbers and special characters
    # (regex=True keeps the pattern treated as a regex on newer pandas versions).
    clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ", regex=True)
    clean_sentences = [s.lower() for s in clean_sentences]
    clean_sentences = [
        self._remove_stopwords(r.split()) for r in clean_sentences
    ]

    # Create sentence vectors by averaging word embeddings.
    sentence_vectors = []
    for i in clean_sentences:
        if len(i) != 0:
            v = sum([
                self.word_embeddings.get(w, np.zeros((300, )))
                for w in i.split()
            ]) / (len(i.split()) + 0.001)
        else:
            v = np.zeros((300, ))
        sentence_vectors.append(v)

    # Similarity matrix, initialized with pairwise cosine similarities.
    sim_mat = np.zeros([len(sentences), len(sentences)])
    for i in range(len(sentences)):
        for j in range(len(sentences)):
            if i != j:
                sim_mat[i][j] = cosine_similarity(
                    sentence_vectors[i].reshape(1, 300),
                    sentence_vectors[j].reshape(1, 300),
                )[0, 0]

    # Convert the matrix into a graph and run PageRank (TextRank scores).
    nx_graph = nx.from_numpy_array(sim_mat)
    textrank_scores = self.normalize_scores(nx.pagerank(nx_graph))

    # Get newspaper's nlp scores:
    # https://github.com/codelucas/newspaper/blob/master/newspaper/article.py#L372
    nlp.load_stopwords(article.config.get_language())
    # Replicates the scoring inside:
    #   nlp.summarize(title=article.title, text=article.text, max_sents=max_sents)
    # https://github.com/codelucas/newspaper/blob/master/newspaper/nlp.py#L40
    title_words = nlp.split_words(article.title)
    most_frequent = nlp.keywords(article.text)
    nlp_scores = self.normalize_scores(
        nlp.score(sentences, title_words, most_frequent))

    # Combine both score sets per sentence index.
    totalled_scores = Counter()
    for key, value in nlp_scores.items():
        totalled_scores[key[0]] += value
    for key, value in textrank_scores.items():
        totalled_scores[key] += value

    num_sentences = int(len(clean_sentences) * percent_sentences / 100)
    sentence_indices = list(
        map(lambda x: x[0], totalled_scores.most_common(num_sentences)))

    return list(map(lambda x: sentences[x], sentence_indices))
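# A hedged calling sketch, assuming the method above lives on a summarizer class with
# process_html, word embeddings, and the other helpers already wired up; the class
# name, URL, and percentage below are illustrative only.
import requests

html = requests.get('https://example.com/some-story', timeout=10).text  # illustrative URL
summarizer = ArticleSummarizer()  # hypothetical class exposing the summarize() method
top_sentences = summarizer.summarize(html, percent_sentences=20)
print('\n'.join(top_sentences))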