Exemplo n.º 1
0
def giveKeyword(text):
    """Strip markup from *text* and return its leading RAKE keywords.

    The input is parsed with BeautifulSoup (lxml parser) to obtain plain
    text, split into word-character tokens, re-assembled into a single
    space-separated string and passed to RAKE configured with
    ``SmartStoplist.txt``.

    Returns a string made of the first element of each of the first two
    entries returned by ``Rake.run`` (fewer if fewer are available), each
    followed by a comma, e.g. ``"first,second,"``. An empty string is
    returned when RAKE yields nothing.
    """
    from bs4 import BeautifulSoup
    from nltk.tokenize import RegexpTokenizer

    plain = BeautifulSoup(text, "lxml").get_text()
    tokens = RegexpTokenizer(r'\w+').tokenize(plain)

    # Tokens that start with an apostrophe or are found in
    # string.punctuation are glued to the previous token; every other
    # token is preceded by a single space.
    pieces = []
    for token in tokens:
        glued = token.startswith("'") or token in string.punctuation
        pieces.append(token if glued else " " + token)
    clean_text = "".join(pieces).strip()

    extractor = Rake("SmartStoplist.txt")
    ranked = extractor.run(clean_text)

    # Only the first two entries are used (if there even are two).
    return "".join(entry[0] + "," for entry in ranked[0:2])
Exemplo n.º 2
0
def rake_extract_concepts(text, exclude, number):
    """Yield ``(keyword, weight)`` pairs extracted from *text* with RAKE.

    RAKE is built from the PostgreSQL English stop-word file and run with
    ``minCharacters=2, maxWords=4, minFrequency=3``.

    Only the first *number* entries of the RAKE result are candidates. A
    candidate is yielded when its weight is greater than 1 and its keyword
    is not contained in *exclude*. Entries that are filtered out still use
    up one of the *number* slots, so fewer than *number* pairs may be
    produced.
    """
    extractor = Rake('/usr/share/postgresql/10/tsearch_data/english.stop')
    results = extractor.run(text, minCharacters=2, maxWords=4, minFrequency=3)
    for position, (keyword, weight) in enumerate(results):
        if not (weight > 1 and position < number):
            continue
        if keyword in exclude:
            continue
        yield keyword, weight
Exemplo n.º 3
0
def rake(text, top_n=10):
    """Return the first *top_n* key phrases RAKE extracts from *text*.

    Characters of *text* that are not found in ``printable`` are dropped
    before extraction. RAKE is configured with ``NLTKStopList()``. Only the
    phrase of each ``(phrase, score)`` result is returned; scores are
    discarded.
    """
    # Keep printable characters only.
    kept = [char for char in text if char in printable]
    cleaned = ''.join(kept)

    extractor = Rake(NLTKStopList())
    leading = extractor.run(cleaned)[:top_n]

    phrases = []
    for phrase, _score in leading:
        phrases.append(phrase)
    return phrases
Exemplo n.º 4
0
class KWGet(Driver):
    """Driver that extracts keywords from base64-encoded content via RAKE."""

    def setup(self):
        """Build the RAKE extractor from the stop-word file beside this module."""
        module_dir = os.path.dirname(__file__)
        self._rake = Rake(os.path.join(module_dir, STOPWORDS))

    def _get_keywords(self, text):
        """Base64-decode *text*, run RAKE on it and return RAKE's result.

        The result is also printed when the module-level ``PRINT`` flag is
        truthy.
        """
        decoded = b64decode(text)
        result = self._rake.run(decoded)
        if PRINT:
            print('KWGet: keywords=%s' % str(result))
        return result

    @wrapper
    def put(self, *args, **kwargs):
        """Return ``{'keywords': ...}`` for the ``content`` keyword argument.

        Returns ``None`` when ``content`` is missing/empty or when no
        keywords were extracted.
        """
        content = kwargs.get('content')
        if not content:
            return None
        found = self._get_keywords(content)
        if not found:
            return None
        return {'keywords': found}
Exemplo n.º 5
0
class KWGet(Driver):
    """Keyword-extraction driver backed by a RAKE instance."""

    def setup(self):
        """Create ``self._rake`` using the STOPWORDS file in this module's directory."""
        stopword_path = os.path.join(os.path.dirname(__file__), STOPWORDS)
        self._rake = Rake(stopword_path)

    def _get_keywords(self, text):
        """Decode base64 *text* and return whatever RAKE extracts from it.

        When ``PRINT`` is truthy the extracted keywords are echoed to stdout.
        """
        extracted = self._rake.run(b64decode(text))
        if PRINT:
            print('KWGet: keywords=%s' % str(extracted))
        return extracted

    @wrapper
    def put(self, *args, **kwargs):
        """Extract keywords from ``kwargs['content']``.

        Returns a dict ``{'keywords': <result>}`` when content is present
        and RAKE produced a non-empty result; otherwise returns ``None``.
        """
        payload = kwargs.get('content')
        extracted = self._get_keywords(payload) if payload else None
        if extracted:
            return {'keywords': extracted}
        return None
Exemplo n.º 6
0
def giveKeyword(text):
    """Return the leading RAKE keywords of an HTML/markup string.

    Steps: extract plain text with BeautifulSoup (lxml parser), tokenize it
    into word-character runs, rebuild a space-separated string, and run RAKE
    (``SmartStoplist.txt``) on it.

    The return value concatenates the first element of each of the first
    two RAKE entries, each followed by ``","``; it is ``""`` when RAKE
    returns no entries.
    """
    from bs4 import BeautifulSoup
    from nltk.tokenize import RegexpTokenizer

    stripped = BeautifulSoup(text, "lxml").get_text()
    words = RegexpTokenizer(r'\w+').tokenize(stripped)

    def _spaced(word):
        # Apostrophe-led tokens and tokens found in string.punctuation are
        # attached directly; anything else gets a leading space.
        if word.startswith("'") or word in string.punctuation:
            return word
        return " " + word

    clean_text = "".join(_spaced(word) for word in words).strip()

    results = Rake("SmartStoplist.txt").run(clean_text)

    # Use at most the first two entries.
    chunks = [result[0] + "," for result in results[0:2]]
    return "".join(chunks)
Exemplo n.º 7
0
class Analyzer:
    '''analyzes text for sentiment and important terms'''
    def __init__(self):
        # RAKE extractor configured with the local stop-word list.
        self.filter = Rake('stoplist.txt')

    def sentiment(self, text):
        '''transform sentiment into trinary value

        Returns "1" when TextBlob's polarity for *text* is above 0.2,
        "-1" when it is below -0.2, and "0" otherwise.
        '''
        polarity = TextBlob(text).sentiment.polarity
        if polarity > 0.2:
            return "1"
        if polarity < -0.2:
            return "-1"
        return "0"

    def terms(self, text):
        '''get most important terms from text

        Returns the first element of the first entry produced by the RAKE
        extractor, or '' when *text* is empty or the extractor yields no
        entries (previously the latter raised IndexError).
        '''
        if not text:
            return ''
        ranked = self.filter.run(text)
        if not ranked:
            # e.g. text consisting only of stop words: nothing to report.
            return ''
        return ranked[0][0]
Exemplo n.º 8
0
def get_doc_keywords(html, articleDom):
    '''
    Return up to five keywords for a document.

    Search the meta keyword tag(s) of *html* for predefined keywords;
    if none are found, use the RAKE library to extract keywords from the
    text content of *articleDom*.

    html       -- markup of the full page, parsed with fromstring()
    articleDom -- markup of the article body, parsed with fromstring()

    Returns a list of at most five keyword strings.
    '''
    tree = fromstring(html)
    # xpath() yields a list with one string per matching attribute, so each
    # entry has to be split on its own (calling .split on the list itself
    # raised AttributeError).
    meta_contents = tree.xpath('//meta[@name="keywords"]/@content')
    meta_keywords = []
    for content in meta_contents:
        for part in content.split(','):
            part = part.strip()
            if part:  # ignore blanks such as content="" or "a,,b"
                meta_keywords.append(part)
    if meta_keywords:
        return meta_keywords[:5]  # return first five keywords

    # Use RAKE to extract keywords from article content
    from RAKE import Rake
    node = fromstring(articleDom)
    text = node.text_content()
    extractor = Rake(
        "RAKE/stoplists/SmartStoplist.txt", 3, 3,
        5)  # min 3 chars, max 3 words, word appears min 5 times
    top_five = extractor.run(text)[:5]  # get top five
    return [entry[0].strip(' ') for entry in top_five]
Exemplo n.º 9
0
def get_keyword(text):
    """Return the first keyword RAKE extracts from *text*.

    Returns "" when *text* is empty/None or when RAKE produces no entries
    (the latter previously raised IndexError). Otherwise returns the first
    element of the first entry returned by ``Rake.run``.
    """
    # Check the input first so no stop-word file is loaded for nothing.
    if not text:
        return ""
    keywords = Rake("SmartStoplist.txt").run(text)
    if not keywords:
        return ""
    return keywords[0][0]
Exemplo n.º 10
0
def generate_titles(file_name,
                    random=False,
                    use_rake=False,
                    use_summa_text_rank=False,
                    use_text_rank=False):
    """Generate and rank candidate titles for the text stored in *file_name*.

    The file is read, lower-cased and reduced to ASCII, then tokenized and
    filtered against NLTK's English stop words plus ADDITIONAL_STOPWORDS.
    A Corpus object is populated with POS tags, stems, token splits,
    frequency/proximity data and word/stem/POS mappings. Word weights are
    computed and printed, titles are built and finally ordered.

    Args:
        file_name: path of the text file to process.
        random: forwarded unchanged to get_word_weights().
        use_rake: when true, run RAKE on the raw text and log the first
            five entries (the result is not used further).
        use_summa_text_rank: when true, log the summa summary and keywords
            of the raw text.
        use_text_rank: when true, log TextRank's extracted sentences and
            key phrases of the raw text.

    Returns:
        Whatever order_titles() returns for the built titles.

    NOTE: the text clean-up relies on Python 2 byte-string semantics
    (str.decode and the two-argument str.translate).
    """
    logger.info("Opening file")
    # Nothing after the read touches the file, so it is closed right away;
    # the context manager also releases the handle if reading fails.
    with open(file_name) as text_file:
        logger.info("Reading file")
        raw_text = text_file.read().lower()
    logger.info("Closing file")

    # Remove Unicode characters.
    raw_text = raw_text.decode('unicode_escape').encode('ascii', 'ignore')

    # Convert raw text to word tokens
    logger.info("Tokenizing")
    tokens = nltk.word_tokenize(raw_text.translate(None, string.punctuation))

    # Remove stopwords
    logger.info("Removing stopwords")
    stop_words = set(stopwords.words('english'))
    # NOTE: we need to include some more stopwords, as 'english' doesn't
    #       contain some stopwords related to journal articles
    #       (e.g., "et" and "al" in "et al.")
    stop_words.update(ADDITIONAL_STOPWORDS)
    filtered_text = [word for word in tokens if word not in stop_words]

    # Create Corpus object for input text
    logger.info("Creating corpus object")
    input_text = Corpus(raw_text, tokens, filtered_text)
    input_text.stop_words = stop_words

    logger.info("Filtered words to use")
    logger.info("\t %s" % input_text.filtered_tokens[:5])

    # NOTE: stopwords are removed before POS tags assigned, this could
    #       potentially degrade POS tagging performance - may want to
    #       switch this order
    logger.info("Getting POS tags")
    input_text.pos_tags = pos_tagger(input_text)
    logger.info("\t %s" % input_text.pos_tags[:5])

    logger.info("Finding all used parts of speech.")
    input_text.used_pos = set([tag[1] for tag in input_text.pos_tags])
    logger.info(input_text.used_pos)

    logger.info("Getting stemmed words")
    input_text.stemmed_words = stem_tokens(input_text)
    logger.info("\t %s" % input_text.stemmed_words[:5])

    # split the stemmed words into ~equal-sized groups
    logger.info("Splitting the stemmed words into groups")
    num_splits = 2
    input_text.splits = split_tokens(input_text, num_splits)

    logger.info("Getting word frequency and proximity")
    # A larger cutoff is used for short inputs (< 250 filtered tokens).
    cutoff = 0.125
    if len(input_text.filtered_tokens) < 250:
        cutoff = 0.35
    input_text.word_freq_proximity = stems_frequency_proximity(
        input_text, cutoff)

    logger.info("Mapping filtered words and their stemmed forms")
    input_text.filtered_word_and_bases, input_text.filtered_bases_and_words = stems_and_bases(
        input_text)

    logger.info("Mapping POS tags and words")
    input_text.pos_tag_and_words = pos_tags_and_words(input_text)

    logger.info("------ End Processing ------\n\n")

    ##########################

    if use_rake:
        logger.info("------ Begin Rake ------")
        # More information at: https://github.com/fabianvf/python-rake

        r = Rake(RAKE.SmartStopList())
        sorted_keywords = r.run(input_text.raw_text)
        logger.info("Sorted keywords: %s" % sorted_keywords[:5])
        logger.info("------ End Rake ------\n\n")

    if use_summa_text_rank:
        logger.info("------ Begin SummaTextRank ------")
        # More information at https://github.com/summanlp/textrank
        logger.info("Sentence(s) summary: %s" % summarizer.summarize(raw_text))
        logger.info("Keywords: %s" % keywords.keywords(raw_text))

        logger.info("------ End SummaTextRank ------\n\n")

    if use_text_rank:
        logger.info("------ Begin TextRank ------")
        # More information at https://github.com/davidadamojr/TextRank

        logger.info("Sentence(s) summary: %s " %
                    textrank.extract_sentences(raw_text))
        logger.info("Keywords: %s" % textrank.extract_key_phrases(raw_text))

        logger.info("------ End TextRank ------\n\n")

    ##########################

    logger.info("------ Begin Weighting ------")

    logger.info("Calculating word weights")
    input_text.word_weights = get_word_weights(input_text, random)

    logger.info("Printing word weights")
    weight_thresh = -1
    print_words_with_weight_above(weight_thresh, input_text.word_weights,
                                  input_text)

    logger.info("------ End Weighting ------\n\n")

    ##########################

    logger.info("------ Begin Building ------")

    titles = build_titles(input_text)

    logger.info("------ End Building ------\n\n")

    ##########################

    logger.info("------ Begin Ranking ------")

    # NOTE: the scores denote the title rankings relative to one another
    #       1 denotes the title with the highest rank and 0 denotes the
    #       title with the lowest rank (determined by a combination of
    #       summed word weights and average word weight)
    titles_ranked = order_titles(titles, input_text)

    logger.info("------ End Ranking ------\n\n")

    ##########################

    return titles_ranked
Exemplo n.º 11
0
import csv
from RAKE import Rake
import email
import os

# Batch topic extraction for a tree of e-mail files.
#
# Input mails are read from  <cwd>/Data_set/Data_set/<folder>/<file>
# and one CSV per mail is written to
#                            <cwd>/Data_set/Result_data_set/<folder>/<file>.csv
# The result sub-folders are not created here and must already exist.
base = os.path.join(os.getcwd(), 'Data_set')
topic_identifier_instance = Rake('Stop_list.txt')

input_root = os.path.join(base, 'Data_set')
output_root = os.path.join(base, 'Result_data_set')

for folder in os.listdir(input_root):
    fold = os.path.join(input_root, folder)
    for file_name in os.listdir(fold):
        # Parse the mail and release the file handle immediately.
        with open(os.path.join(fold, file_name)) as mail_file:
            mail = email.message_from_file(mail_file)

        # NOTE(review): get_payload() returns a list of Message objects for
        # multipart mails; the extractor presumably expects a string - confirm.
        message_string = mail.get_payload()
        score_table = topic_identifier_instance.run(message_string)
        score_table.insert(0, ('topic', 'word_score'))  # CSV header row
        print(score_table)

        result_path = os.path.join(output_root, folder, file_name + '.csv')
        # Closing the file guarantees the rows are flushed to disk.
        with open(result_path, 'w+') as result_file:
            csv.writer(result_file).writerows(score_table)