Example #1
def giveKeyword(text):
    import string
    from bs4 import BeautifulSoup
    from nltk.tokenize import RegexpTokenizer
    from RAKE import Rake

    # Strip HTML tags, keeping only the visible text.
    text = BeautifulSoup(text, "lxml").get_text()

    # Tokenize into word tokens (the \w+ pattern drops punctuation).
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)

    # Rejoin tokens with spaces; contractions attach to the preceding token.
    cleanText = "".join([
        " " + i if not i.startswith("'") and i not in string.punctuation else i
        for i in tokens
    ]).strip()

    rakeobj = Rake("SmartStoplist.txt")
    keywords = rakeobj.run(cleanText)

    # Take the top two highest-ranked phrases (if there even are two).
    output = ""
    for keyword in keywords[0:2]:
        output = output + keyword[0] + ","

    return output
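A minimal usage sketch, assuming SmartStoplist.txt sits next to the script (the HTML string is made up for illustration):

html = "<p>Compatibility of systems of linear constraints over natural numbers.</p>"
print(giveKeyword(html))  # e.g. "linear constraints,natural numbers," (note the trailing comma)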
Example #2
def rake_extract_concepts(text, exclude, number):
    from RAKE import Rake

    r = Rake('/usr/share/postgresql/10/tsearch_data/english.stop')
    concepts = r.run(text, minCharacters=2, maxWords=4, minFrequency=3)
    # Yield concepts from the top `number` candidates that score above 1
    # and are not in the exclusion set.
    count = 0
    for keyword, weight in concepts:
        if count >= number:
            break
        if weight > 1 and keyword not in exclude:
            yield keyword, weight
        count += 1
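Since rake_extract_concepts is a generator, results can be consumed lazily; a sketch, with article.txt standing in for any input file:

with open('article.txt') as f:
    text = f.read()
for keyword, weight in rake_extract_concepts(text, exclude={'abstract'}, number=10):
    print(keyword, weight)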
Example #3
def rake(text, top_n=10):
    from string import printable
    from RAKE import Rake, NLTKStopList

    # Drop non-printable characters.
    text = ''.join(char for char in text if char in printable)

    # NLTKStopList uses the English stopword list from NLTK.
    r = Rake(NLTKStopList())
    return [keyphrase for (keyphrase, score) in r.run(text)[:top_n]]
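A quick call, assuming the NLTK stopword corpus is installed (nltk.download('stopwords')):

print(rake("Compatibility of systems of linear constraints over the set of natural numbers", top_n=3))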
Example #4
from lxml.html import fromstring


def get_doc_keywords(html, articleDom):
    '''
    Search the meta keywords tag for any predefined keywords;
    otherwise use the RAKE library to extract keywords from the document content.
    Return the first five keywords.
    '''
    tree = fromstring(html)
    keywords = tree.xpath('//meta[@name="keywords"]/@content')
    if keywords:
        arr = keywords[0].split(',')[:5]  # xpath returns a list; take the first match
        return [x.strip(' ') for x in arr]
    else:
        # Use RAKE to extract keywords from the article content
        from RAKE import Rake
        node = fromstring(articleDom)
        text = node.text_content()
        extractor = Rake(
            "RAKE/stoplists/SmartStoplist.txt", 3, 3,
            5)  # min 3 chars, max 3 words, word appears min 5 times
        keywords = [x[0] for x in extractor.run(text)]
        keywords = keywords[:5]  # keep the top five
        return [x.strip(' ') for x in keywords]
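A sketch of the meta-tag branch (the HTML string is an assumption):

page = '<html><head><meta name="keywords" content="nlp, rake, keywords"></head></html>'
print(get_doc_keywords(page, page))  # -> ['nlp', 'rake', 'keywords']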
Example #5
import os
from base64 import b64decode

from RAKE import Rake

# Driver, STOPWORDS, PRINT, and wrapper come from the surrounding project.


class KWGet(Driver):
    def setup(self):
        path = os.path.join(os.path.dirname(__file__), STOPWORDS)
        self._rake = Rake(path)

    def _get_keywords(self, text):
        # Content arrives base64-encoded; decode to str before extraction.
        buf = b64decode(text).decode('utf-8')
        keywords = self._rake.run(buf)
        if PRINT:
            print('KWGet: keywords=%s' % str(keywords))
        return keywords

    @wrapper
    def put(self, *args, **kwargs):
        text = kwargs.get('content')
        if text:
            keywords = self._get_keywords(text)
            if keywords:
                return {'keywords': keywords}
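Callers are expected to base64-encode the content before handing it to put; a sketch using only the standard library:

from base64 import b64encode
payload = b64encode('keyword extraction with RAKE'.encode('utf-8'))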
Example #6
from RAKE import Rake
from textblob import TextBlob


class Analyzer:
    '''analyzes text for sentiment and important terms'''

    def __init__(self):
        self.filter = Rake('stoplist.txt')

    def sentiment(self, text):
        '''map sentiment polarity onto a ternary value'''
        polarity = TextBlob(text).sentiment.polarity
        if polarity > 0.2:
            sentiment = "1"
        elif polarity < -0.2:
            sentiment = "-1"
        else:
            sentiment = "0"
        return sentiment

    def terms(self, text):
        '''get the most important term from the text (the top-ranked RAKE phrase)'''
        results = self.filter.run(text) if text else []
        return results[0][0] if results else ''
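A usage sketch (the sentences and the stoplist.txt location are assumptions):

analyzer = Analyzer()
print(analyzer.sentiment('I really enjoy this library'))  # positive polarity maps to "1"
print(analyzer.terms('keyword extraction works well on long documents'))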
Example #7
import os
import sys

# NOTE: the start of this snippet is truncated in the source; the
# sys.path.append below is a plausible reconstruction that makes the
# local NEL modules importable.
sys.path.append(
    os.path.join(
        os.path.dirname(
            os.path.dirname(os.path.dirname(os.path.realpath(__file__)))),
        'NEL'))

from elasticsearch import Elasticsearch
import json
import time
import math
import stringmatching_npm as stringmatch
from dateutil import parser as dateparser
from datetime import datetime
from collections import Counter
from RAKE import Rake
import requests
from markdown_extract import extractText

rk = Rake('SmartStoplist.txt')


def get_text_content(pkg):
    desc = pkg.get('desc', '')
    keywords = pkg.get('keywords', [])
    readme = pkg.get('readme', '') or ''
    if not isinstance(readme, str):
        # No readme found
        readme = ''
    readmeText = extractText(readme)
    parsedKeywords = rk.run(readmeText)
    results = []
    for kw in keywords:
Example #8
import spacy
from RAKE import Rake
from nltk.stem.porter import PorterStemmer

NLP = spacy.load('en_core_web_sm')
RAKE = Rake('stopwords.txt')
PSTEMMER = PorterStemmer()


def find_entities(sentence):
    doc = NLP(sentence)
    return [entity.text for entity in doc.ents]


def extract_keywords(sentence):
    # Flatten each RAKE phrase into individual words.
    return [w for s, _ in RAKE.run(sentence) for w in s.split()]


def stemmer(word):
    return PSTEMMER.stem(word)


def test():
    with open('test_sentences.txt') as f:
        sentences = f.read().split('\n')
    for sentence in sentences:
        print(sentence)
        print(find_entities(sentence))
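The helpers compose naturally; a sketch with a made-up sentence:

for word in extract_keywords('deep learning improves keyword extraction'):
    print(word, '->', stemmer(word))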
Example #9
import RAKE
from RAKE import Rake, SmartStopList


def test_instantiate():
    # Both import styles expose the same class and compiled stop-word pattern.
    rake = Rake(SmartStopList())
    assert rake._Rake__stop_words_pattern
    rake = RAKE.Rake(RAKE.SmartStopList())
    assert rake._Rake__stop_words_pattern
Example #10
def get_keyword(text):
    from RAKE import Rake

    if not text:
        return ""
    rake = Rake("SmartStoplist.txt")
    keywords = rake.run(text)
    # Guard against texts that yield no keywords at all.
    return keywords[0][0] if keywords else ""
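Called like this (the input string is an assumption):

print(get_keyword('natural language processing with RAKE'))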
Example #11
def generate_titles(file_name,
                    random=False,
                    use_rake=False,
                    use_summa_text_rank=False,
                    use_text_rank=False):
    logger.info("Opening file")
    text_file = open(file_name)
    logger.info("Reading file")
    raw_text = text_file.read().lower()
    # Remove non-ASCII characters.
    raw_text = raw_text.encode('ascii', 'ignore').decode('ascii')

    # Convert raw text to word tokens.
    logger.info("Tokenizing")
    tokens = nltk.word_tokenize(
        raw_text.translate(str.maketrans('', '', string.punctuation)))

    # Remove stopwords.
    logger.info("Removing stopwords")
    stop_words = set(stopwords.words('english'))
    # NOTE: we need to include some more stopwords, as 'english' doesn't contain
    #       some stopwords related to journal articles (e.g., "et" and "al" in "et al.")
    stop_words.update(ADDITIONAL_STOPWORDS)
    filtered_text = [word for word in tokens if word not in stop_words]

    # Create a Corpus object for the input text.
    logger.info("Creating corpus object")
    input_text = Corpus(raw_text, tokens, filtered_text)
    input_text.stop_words = stop_words

    logger.info("Filtered words to use")
    logger.info("\t %s" % input_text.filtered_tokens[:5])

    # NOTE: stopwords are removed before POS tags are assigned; this could
    #       potentially degrade POS tagging performance - may want to
    #       switch this order.
    logger.info("Getting POS tags")
    input_text.pos_tags = pos_tagger(input_text)
    logger.info("\t %s" % input_text.pos_tags[:5])

    logger.info("Finding all used parts of speech.")
    input_text.used_pos = set([tag[1] for tag in input_text.pos_tags])
    logger.info(input_text.used_pos)

    logger.info("Getting stemmed words")
    input_text.stemmed_words = stem_tokens(input_text)
    logger.info("\t %s" % input_text.stemmed_words[:5])

    # split the stemmed words into ~equal-sized groups
    logger.info("Splitting the stemmed words into groups")
    #logger.info("There are %s words in this group" % len(input_text.stemmed_words))
    num_splits = 2
    input_text.splits = split_tokens(input_text, num_splits)
    #for s in input_text.splits:
    #    logger.info("%s %s\n\n" % (s,len(s)))

    logger.info("Getting word frequency and proximity")
    cutoff = 0.125
    if len(input_text.filtered_tokens) < 250:
        cutoff = 0.35
    input_text.word_freq_proximity = stems_frequency_proximity(
        input_text, cutoff)
    #logger.info("\t %s" % (input_text.word_freq_proximity[u'becom'],))

    logger.info("Mapping filtered words and their stemmed forms")
    input_text.filtered_word_and_bases, input_text.filtered_bases_and_words = stems_and_bases(
        input_text)
    #logger.info("\t %s" % input_text.filtered_word_and_bases[u'becom'])

    logger.info("Mapping POS tags and words")
    input_text.pos_tag_and_words = pos_tags_and_words(input_text)
    #logger.info("\t %s" % input_text.pos_tag_and_words['NNS'][:5])

    logger.info("------ End Processing ------\n\n")

    ##########################

    if use_rake:
        logger.info("------ Begin Rake ------")
        """More information at: https://github.com/fabianvf/python-rake"""

        r = Rake(RAKE.SmartStopList())
        sorted_keywords = r.run(input_text.raw_text)
        logger.info("Sorted keywords: %s" % sorted_keywords[:5])
        logger.info("------ End Rake ------\n\n")

    if use_summa_text_rank:
        logger.info("------ Begin SummaTextRank ------")
        """More information at https://github.com/summanlp/textrank"""
        logger.info("Sentence(s) summary: %s" % summarizer.summarize(raw_text))
        logger.info("Keywords: %s" % keywords.keywords(raw_text))

        logger.info("------ End SummaTextRank ------\n\n")

    if use_text_rank:
        logger.info("------ Begin TextRank ------")
        """More information at https://github.com/davidadamojr/TextRank"""

        logger.info("Sentence(s) summary: %s " %
                    textrank.extract_sentences(raw_text))
        logger.info("Keywords: %s" % textrank.extract_key_phrases(raw_text))

        logger.info("------ End TextRank ------\n\n")

    ##########################

    logger.info("------ Begin Weighting ------")

    logger.info("Calculating word weights")
    input_text.word_weights = get_word_weights(input_text, random)

    logger.info("Printing word weights")
    weight_thresh = -1
    print_words_with_weight_above(weight_thresh, input_text.word_weights,
                                  input_text)

    logger.info("------ End Weighting ------\n\n")

    ##########################

    logger.info("------ Begin Building ------")

    titles = build_titles(input_text)

    logger.info("------ End Building ------\n\n")

    ##########################

    logger.info("Closing file")
    text_file.close()

    ##########################

    logger.info("------ Begin Ranking ------")

    # NOTE: the scores denote the title rankings relative to one another:
    #       1 denotes the title with the highest rank and 0 denotes the
    #       title with the lowest rank (determined by a combination of
    #       summed word weights and average word weight)
    titles_ranked = order_titles(titles, input_text)

    logger.info("------ End Ranking ------\n\n")

    ##########################

    return titles_ranked
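A sketch of invoking the pipeline, assuming paper.txt exists and the module-level helpers (Corpus, pos_tagger, build_titles, and so on) are importable:

titles = generate_titles('paper.txt', use_rake=True)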
Example #12
import csv
import email
import os

from RAKE import Rake

base = os.path.join(os.getcwd(), 'Data_set')
topic_identifier_instance = Rake('Stop_list.txt')

# Score every message in the corpus and write one CSV of (topic, score) rows per file.
for folder in os.listdir(os.path.join(base, 'Data_set')):
    fold = os.path.join(base, 'Data_set', folder)
    for file in os.listdir(fold):
        with open(os.path.join(fold, file)) as fp:
            mail = email.message_from_file(fp)
        message_string = mail.get_payload()
        score_table = topic_identifier_instance.run(message_string)
        score_table.insert(0, ('topic', 'word_score'))
        print(score_table)
        out_path = os.path.join(base, 'Result_data_set', folder, file + '.csv')
        with open(out_path, 'w+', newline='') as out:
            writer = csv.writer(out)
            for row in score_table:
                writer.writerow(row)