Example #1
def get_article_info(url):
    try:
        a = Article(url, fetch_images=False)
        a.download()
        a.parse()
        # Not doing a.nlp() to be more efficient.
        text_keyws = list(nlp.keywords(a.text).keys())
        title_keyws = list(nlp.keywords(a.title).keys())
        keyws = list(set(title_keyws + text_keyws))

        if 'published_time' in a.meta_data['article']:
            published_time = a.meta_data['article']['published_time']
        else:
            published_time = ''
        return {'keywords': keyws, 'c_link': a.canonical_link, 'published_time': published_time, 'title': a.title}
    except Exception:
        return {'keywords': [], 'c_link': a.canonical_link, 'published_time': '', 'title': ''}
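A minimal wiring sketch for the snippet above, assuming newspaper3k's Article class and nlp module; the URL is a placeholder, and load_stopwords('en') is called because nlp.keywords() filters against the loaded stop-word list:

# Sketch only: assumed imports and a sample call for get_article_info().
from newspaper import Article, nlp

nlp.load_stopwords('en')  # nlp.keywords() relies on this stop-word list

info = get_article_info('https://example.com/some-article')  # placeholder URL
print(info['title'], info['keywords'])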
Example #2
def _new_summarize(text='', max_sents=5):

    summaries = []
    sentences = split_sentences(text)
    keys = keywords(text)

    # Score sentences, and use the top 5 or max_sents sentences
    ranks = nlp.score(sentences, keys).most_common(max_sents)
    for rank in ranks:
        summaries.append(rank[0])
    summaries.sort(key=lambda summary: summary[0])
    return [summary[1] for summary in summaries]
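The sort/unpack logic above depends on the shape of the scores: in the upstream newspaper library, nlp.score() returns a collections.Counter keyed by (sentence position, sentence text) tuples. A small illustration of that shape (the sentences and scores are made up):

# Illustration of the data _new_summarize() works on; values are made up.
from collections import Counter

ranks = Counter({
    (0, 'First sentence.'): 0.9,
    (2, 'Third sentence.'): 0.7,
    (1, 'Second sentence.'): 0.4,
})
top = [pair for pair, _score in ranks.most_common(2)]  # keep (position, text) tuples
top.sort(key=lambda pair: pair[0])                     # restore document order
print([text for _position, text in top])               # ['First sentence.', 'Third sentence.']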
Example #3
def ProcessArticle(urlStr, domain, htmlStr, cursor):
    config = Configuration()
    extractor = ContentExtractor(config)
    clean_doc = config.get_parser().fromstring(htmlStr)
    title = extractor.get_title(clean_doc)
    authors = extractor.get_authors(clean_doc)
    text = fulltext(htmlStr)

    text_keyws = list(nlp.keywords(text).keys())
    title_keyws = list(nlp.keywords(title).keys())

    keyws = list(set(title_keyws + text_keyws))
    summary_sents = nlp.summarize(title=title,
                                  text=text,
                                  max_sents=config.MAX_SUMMARY_SENT)
    summary = '\n'.join(summary_sents)

    if len(text) == 0:
        OnArticleProcessError(urlStr)
    else:
        StoreToDatabase(urlStr, domain, title, authors, text, keyws, summary,
                        cursor)
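ProcessArticle above assumes several newspaper3k imports and two caller-supplied helpers (OnArticleProcessError and StoreToDatabase) that are not shown; a hedged sketch of the setup it likely expects:

# Assumed setup for ProcessArticle() (a sketch; the helpers are defined elsewhere).
from newspaper import fulltext, nlp
from newspaper.configuration import Configuration
from newspaper.extractors import ContentExtractor

nlp.load_stopwords('en')  # upstream Article.nlp() loads stop words before keywords/summarize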
Example #5
    def evaluate_newspaper_summary(self, title, text, sentences, language):
        # get newspaper's nlp scores
        # https://github.com/codelucas/newspaper/blob/master/newspaper/article.py#L372
        nlp.load_stopwords(language)

        # call to: nlp.summarize(title=article.title, text=article.text, max_sents=max_sents)
        # https://github.com/codelucas/newspaper/blob/master/newspaper/nlp.py#L40
        title_words = nlp.split_words(title)
        most_frequent = nlp.keywords(text)

        nlp_scores = self.normalize_scores(
            nlp.score(sentences, title_words, most_frequent))

        # Return a dictionary of tuple<sentence index, sentence text> to score
        # i.e. { (0, 'A new poll suggests that the Toronto Raptors...') : 0.144, ... }
        return nlp_scores
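normalize_scores is referenced here (and again in Example #11) but not shown; a hypothetical min-max rescaling into [0, 1] is one plausible shape for it (a sketch, not the project's actual helper):

# Hypothetical normalize_scores(): rescale raw scores into [0, 1] so that
# different scorers (e.g. nlp.score and PageRank) can be summed fairly.
def normalize_scores(self, scores):
    values = list(scores.values())
    lo, hi = min(values), max(values)
    if hi == lo:
        return {key: 1.0 for key in scores}
    return {key: (value - lo) / (hi - lo) for key, value in scores.items()}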
Example #7
data = []

filename = 'teesta_setal_input.csv'
currentdirpath = os.getcwd()
file_path = os.path.join(os.getcwd(), filename)

with open(file_path, 'r+') as reports_file:
    reader = csv.reader(reports_file, delimiter='\t')
    for row in reader:

        article = row[0]
        summary_sentences = nlp.summarize(text=article)

        summary = '\n'.join(summary_sentences)

        row.append(summary)

        # keywords() returns a dict of keyword -> score; keep only the words
        textkeys = list(keywords(article).keys())

        row.append(textkeys)

        with open((os.path.join(os.getcwd(), 'final_tee.csv')),
                  'a') as newfile:
            writer = csv.writer(newfile, delimiter='\t')
            try:
                writer.writerow(row)

            except Exception:
                pass
Example #8
def text_keywords():
    text_to_clean = request.args.get('text_to_clean')
    keywords = nlp.keywords(text_to_clean)
    return jsonify(keywords)
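The handler above assumes a Flask application with request and jsonify available and the route registered elsewhere; a minimal wiring sketch (the URL rule is illustrative):

# Minimal Flask wiring assumed by text_keywords(); the '/keywords' rule is illustrative.
from flask import Flask, jsonify, request
from newspaper import nlp

app = Flask(__name__)
nlp.load_stopwords('en')  # nlp.keywords() needs the stop-word list loaded

app.add_url_rule('/keywords', view_func=text_keywords)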
Example #9
                article.top_image = '' if re.search(
                    'favicon', article.top_image) else article.top_image

                with open('../detector/tl_stopwords.txt', 'r') as f:
                    TL_STOPWORDS = f.read().splitlines()

                STOP_WORDS = ENGLISH_STOP_WORDS.union(TL_STOPWORDS)
                cleaned_body = ' '.join([
                    word for word in body.split()
                    if word.lower() not in STOP_WORDS
                ])
                cleaned_title = ' '.join([
                    word for word in title.split()
                    if word.lower() not in STOP_WORDS
                ])
                text_keyws = list(keywords(cleaned_body).keys())
                title_keyws = list(keywords(cleaned_title).keys())
                keyws = list(set(title_keyws + text_keyws))

                summary = summarize(title=article.title,
                                    text=body,
                                    max_sents=3)

                # keywords = []
                # for key, value in article.keywords.items():
                #     keywords.append({
                #         'word': key,
                #         'score': value
                #     })

                # keywords = sorted(
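This fragment is cut off mid-statement in the source and omits its imports; an educated, hedged guess at the names it relies on (not confirmed by the source):

# Likely imports for the fragment above (an assumption, not taken from the source).
import re
from newspaper.nlp import keywords, summarize
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS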
Example #10
def nlp_wrapper(text):
    """Keyword extraction wrapper
    """
    nlp.load_stopwords('en')
    return ' '.join(list(nlp.keywords(text).keys()))
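A small usage sketch for nlp_wrapper (the sample text and expected output are illustrative):

# Usage sketch for nlp_wrapper(); the extracted keywords depend on newspaper's scoring.
text = ("The Toronto Raptors won the NBA championship in 2019, the first "
        "title in franchise history.")
print(nlp_wrapper(text))  # e.g. a space-separated string such as 'raptors championship ...'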
Example #11
    def summarize(self, html, percent_sentences):
        if (percent_sentences is None or percent_sentences > 100
                or percent_sentences < 0):
            percent_sentences = 15

        article = self.process_html(html)

        # remove title from the text, if it appears in the text
        if article.text.startswith(article.title):
            article.set_text(article.text[len(article.title):])

        sentences = nlp.split_sentences(article.text)
        log.debug(article.text)

        # remove punctuations, numbers and special characters
        clean_sentences = pd.Series(sentences).str.replace(
            "[^a-zA-Z]", " ", regex=True)
        clean_sentences = [s.lower() for s in clean_sentences]
        clean_sentences = [
            self._remove_stopwords(r.split()) for r in clean_sentences
        ]

        # create sentence vectors
        sentence_vectors = []
        for i in clean_sentences:
            if len(i) != 0:
                v = sum([
                    self.word_embeddings.get(w, np.zeros((300, )))
                    for w in i.split()
                ]) / (len(i.split()) + 0.001)
            else:
                v = np.zeros((300, ))
            sentence_vectors.append(v)

        # similarity matrix
        sim_mat = np.zeros([len(sentences), len(sentences)])

        # initialize matrix
        for i in range(len(sentences)):
            for j in range(len(sentences)):
                if i != j:
                    sim_mat[i][j] = cosine_similarity(
                        sentence_vectors[i].reshape(1, 300),
                        sentence_vectors[j].reshape(1, 300),
                    )[0, 0]

        # convert matrix into graph
        nx_graph = nx.from_numpy_array(sim_mat)
        textrank_scores = self.normalize_scores(nx.pagerank(nx_graph))

        # get newspaper's nlp scores
        # https://github.com/codelucas/newspaper/blob/master/newspaper/article.py#L372
        nlp.load_stopwords(article.config.get_language())

        # call to: nlp.summarize(title=article.title, text=article.text, max_sents=max_sents)
        # https://github.com/codelucas/newspaper/blob/master/newspaper/nlp.py#L40
        title_words = nlp.split_words(article.title)
        most_frequent = nlp.keywords(article.text)

        nlp_scores = self.normalize_scores(
            nlp.score(sentences, title_words, most_frequent))

        totalled_scores = Counter()
        for key, value in nlp_scores.items():
            totalled_scores[key[0]] += value

        for key, value in textrank_scores.items():
            totalled_scores[key] += value

        num_sentences = int(len(clean_sentences) * percent_sentences / 100)
        sentence_indices = list(
            map(lambda x: x[0], totalled_scores.most_common(num_sentences)))

        return list(map(lambda x: sentences[x], sentence_indices))
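Example #11 depends on several module-level names that are not shown: pandas (pd), numpy (np), networkx (nx), scikit-learn's cosine_similarity, collections.Counter, newspaper's nlp module, a logger, and 300-dimensional word embeddings in self.word_embeddings. A hedged setup sketch; the GloVe file path and logger are placeholders:

# Assumed imports and embedding loader for the summarizer above (a sketch;
# the GloVe path is a placeholder, not the project's actual configuration).
import logging
from collections import Counter

import networkx as nx
import numpy as np
import pandas as pd
from newspaper import nlp
from sklearn.metrics.pairwise import cosine_similarity

log = logging.getLogger(__name__)

def load_glove_embeddings(path='glove.6B.300d.txt'):
    """Load 300-d GloVe vectors into a dict of word -> np.ndarray of shape (300,)."""
    embeddings = {}
    with open(path, encoding='utf-8') as handle:
        for line in handle:
            parts = line.split()
            embeddings[parts[0]] = np.asarray(parts[1:], dtype='float32')
    return embeddings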