Exemplo n.º 1
0
    def extractKeywords(self):
        """Print the RAKE keywords of the file at ``self.name`` for two settings.

        The whole file is read and every line break is replaced by a space, so
        the last word of one line is not fused with the first word of the next
        (the previous code replaced line breaks with the empty string).

        RAKE is run twice with the same stoplist and the same first two
        numeric arguments (3, 3); only the last argument differs (1, then 2).
        Both results are printed; nothing is returned.
        """
        with open(self.name, 'r') as myfile:
            text = myfile.read().replace('\n', ' ')

        # The last Rake argument is presumably the minimum keyword frequency
        # (1 = appears at least once, 2 = at least twice) -- confirm against
        # the rake module in use.
        for label, min_frequency in (("keywords1 are ", 1),
                                     ("keywords2 are ", 2)):
            rake_object = rake.Rake("SmartStoplist.txt", 3, 3, min_frequency)
            keywords = rake_object.run(text)
            print(label, keywords)
Exemplo n.º 2
0
def run_rake_standalone(stop_word_list, text):
    """Run RAKE over ``text`` and return the result of ``Rake.run``.

    ``stop_word_list`` is handed to ``rake.Rake`` as its only argument; the
    original note says SmartStoplist is used and FoxStopList may be chosen
    instead.
    """
    return rake.Rake(stop_word_list).run(text)
Exemplo n.º 3
0
def search():
    """Print web paragraphs for every RAKE keyword found in the ``tbox1`` text.

    Steps:
      1. Read the text from ``tbox1`` and print it.
      2. Extract keywords with RAKE and print them and their count.
      3. For each keyword, run a web search for the keyword text, POST a small
         form payload to each returned URL, and print the contents of every
         ``<p>...</p>`` match found in the response.

    If the ``googlesearch`` package cannot be imported, a message is printed
    and the function returns instead of failing later on an unbound name.
    Nothing is returned.
    """
    text = tbox1.get()
    print(text)
    stoppath = "data/stoplists/SmartStoplist.txt"
    rake_object = rake.Rake(stoppath, 5, 3, 4)
    keywords = rake_object.run(text)
    print("Keywords:", keywords)
    print(len(keywords))

    if not keywords:
        return

    # Imported once, under an alias so it does not shadow this function's name.
    try:
        from googlesearch import search as google_search
    except ImportError:
        print("No module named 'google' found")
        return

    # The form payload is the same for every request, so build it once.
    form_data = urllib.parse.urlencode(
        {'s': 'basics', 'submit': 'search'}).encode('utf-8')

    for keyword in keywords:
        query = keyword[0]
        print(query)
        for url in google_search(query, tld="co.in", num=1, stop=1, pause=2):
            print(url)
            req = urllib.request.Request(url, form_data)
            # Close the connection as soon as the body has been read.
            with urllib.request.urlopen(req) as resp:
                respData = resp.read()
            # str() of the bytes body is kept as-is so the printed output
            # (including escaped characters) matches the previous behaviour.
            paragraphs = re.findall(r'<p>(.*?)</p>', str(respData))
            for eachP in paragraphs:
                print(eachP)
Exemplo n.º 4
0
    def keywords_from_tweet_list(self,
                                 tweet_list,
                                 num_characters=3,
                                 max_phrase=3,
                                 remove_repeats=False):
        """Run RAKE over a block of tweets and store the result on ``self.keywords``.

        Args:
            tweet_list (list) : a block of tweets; every element is iterated
                and all yielded items are joined with single spaces
                (presumably each tweet is a list of tokens -- confirm with
                the caller).
            num_characters (int) : minimum number of characters in a keyword.
                Default is 3.
            max_phrase (int) : maximum amount of words in a keyword phrase.
                Default is 3.
            remove_repeats (bool) : when true, the RAKE output is passed
                through ``self.__remove_shorter_repeats`` before returning.
                Default is False.

        Returns:
            list : list of tuples containing the keyword and its Rake score
        """
        tweet_count = len(tweet_list)
        text = ' '.join(itertools.chain.from_iterable(tweet_list))

        # The last Rake argument scales with the number of tweets.
        extractor = rake.Rake("SmartStoplist.txt", num_characters,
                              max_phrase, tweet_count / 1000)
        self.keywords = extractor.run(text)

        if not remove_repeats:
            return self.keywords

        self.keywords = self.__remove_shorter_repeats(self.keywords)
        return self.keywords
def get_best_params(test_doc, test_set):
    """Grid-search RAKE parameters for ``test_doc`` and return the best triple.

    Every combination of ``min_char_length`` in 3..7, ``max_words_length`` in
    3..5 and ``min_keyword_frequency`` in 1..6 is tried. For each combination
    the top three extracted keywords are compared with ``test_doc.keywords``:
    precision is ``correct / 3``, recall is ``correct / len(test_doc.keywords)``
    and the F-measure (scaled by ``100 / len(test_set)`` and rounded to two
    decimals) is used to rank the combination.

    :param test_doc: object exposing ``text`` and ``keywords``
    :param test_set: sized collection; only its length is used
    :return: ``[min_char_length, max_words_length, min_keyword_frequency]`` of
        the first combination with the highest score, or ``[]`` when no
        combination scores above zero
    :raises ZeroDivisionError: if ``test_set`` is empty
    """
    top_n = 3
    # Loop invariants: build the lookup set and the denominators only once.
    manual_keywords = set(test_doc.keywords)
    num_manual_keywords = len(test_doc.keywords)
    num_docs = float(len(test_set))

    best_fmeasure = 0
    best_vals = []

    for min_char_length in range(3, 8):
        for max_words_length in range(3, 6):
            for min_keyword_frequency in range(1, 7):
                rake_object = rake.Rake('SmartStoplist.txt', min_char_length,
                                        max_words_length, min_keyword_frequency)
                keywords = rake_object.run(test_doc.text)

                correct = 0
                try:
                    for keyword in keywords[:top_n]:
                        if keyword[0] in manual_keywords:
                            correct += 1
                except IndexError:
                    print('Problem with evaluating ', keywords)

                precision = correct / float(top_n)
                # A document without manual keywords has zero recall instead
                # of raising ZeroDivisionError.
                if num_manual_keywords:
                    recall = correct / float(num_manual_keywords)
                else:
                    recall = 0.0

                total_fmeasure = 0
                if precision > 0 and recall > 0:
                    total_fmeasure = 2 * precision * recall / (precision + recall)

                avg_fmeasure = round(total_fmeasure * 100 / num_docs, 2)

                # Strict comparison keeps the first of several equal scores.
                if avg_fmeasure > best_fmeasure:
                    best_fmeasure = avg_fmeasure
                    best_vals = [min_char_length, max_words_length,
                                 min_keyword_frequency]

    return best_vals
Exemplo n.º 6
0
 def rake_phrase(self):
     """Store, print and return the leading RAKE phrases of ``self.primary_text``.

     RAKE is built from ``self.stoppath``; the result is cut to the first
     ``self.number_of_keywords`` entries and kept in ``self.phrase_rake_list``.
     """
     extractor = rake.Rake(self.stoppath)
     self.phrase_rake_list = extractor.run(
         self.primary_text)[:self.number_of_keywords]
     print(f'phrases according to rake: {self.phrase_rake_list}')
     return self.phrase_rake_list
Exemplo n.º 7
0
def extractPostKeywords(title, post):
    """Return the words and underscore-joined suffixes of every RAKE phrase.

    RAKE is run on ``title + " " + post``. For each resulting phrase (the
    first element of each result tuple) the phrase is split on single spaces
    and, walking from the last word to the first, the output receives: the
    last word; then for each earlier word the word itself followed by that
    word joined with ``_`` to everything after it. For ``"a b c"`` this yields
    ``c, b, b_c, a, a_b_c``.

    :param title: post title (string)
    :param post: post body (string)
    :return: flat list of strings
    """
    # Arguments after the stoplist are passed as 3, 3, 1.
    miner = rake.Rake("SmartStoplist.txt", 3, 3, 1)
    scored_phrases = miner.run(title + " " + post)

    keywords = []
    for scored_phrase in scored_phrases:
        words = scored_phrase[0].split(' ')

        suffix = words[-1]
        keywords.append(suffix)

        # Every word before the last one, right to left.
        for word in words[-2::-1]:
            suffix = word + '_' + suffix
            keywords.extend((word, suffix))

    return keywords
    def get_final_keywords(self, test_doc, test_set):
        """
        Gets optimum parameters and initializes a rake object using them.
        Gets keywords and writes them to the .key file under the same name as
        test_doc inside ``self.test_dir``.

        Each output line is the keyword padded with dots to 30 characters
        followed by its score rounded to one decimal. Keywords of 30 or more
        characters are written without padding.

        :param test_doc: document to extract keywords from (``text`` and
            ``name`` are read)
        :param test_set: set of documents
        :return: None
        """
        # Gets optimum parameters for document.
        # NOTE(review): the first three items are indexed below -- confirm
        # get_best_params always returns at least three values.
        best_params = optimize_rake.get_best_params(test_doc, test_set)

        # Initializes rake object using optimized parameters
        rake_object_final = rake.Rake('SmartStoplist.txt',
                                      best_params[0],
                                      best_params[1],
                                      best_params[2])

        keywords = rake_object_final.run(test_doc.text)

        key_path = os.path.join(self.test_dir, test_doc.name + '.key')
        # The context manager guarantees the file is flushed and closed even
        # if writing fails part-way through.
        with open(key_path, 'w') as key_file:
            for keyword in keywords:
                key = keyword[0]
                score = str(round(keyword[1], 1))
                key_file.write(key.ljust(30, '.') + score + '\n')
Exemplo n.º 9
0
def extractkeywords(url):
    """Return newline-terminated multi-word RAKE keywords for a web page.

    The page HTML is fetched with ``gethtml``; title, meta description,
    scrubbed body text and URL-derived keywords are combined into one string,
    stripped of line breaks, tags and repeated spaces, and fed to RAKE.
    Only keywords of more than one word that differ from the brand (ignoring
    spaces) are kept.

    :param url: address of the page to analyse
    :return: the kept keywords, each followed by ``'\\n'``, as one string;
        ``None`` when the page could not be fetched or nothing was kept
    """
    import rake
    import operator
    import re

    html = gethtml(url)
    if not html:
        return None

    urlkws = urltokw(url)
    brandfilter = brand(url)
    title = scraper(html, '//title/text()')
    description = scraper(
        html,
        "//meta[translate(@name, 'ABCDEFGHJIKLMNOPQRSTUVWXYZ', 'abcdefghjiklmnopqrstuvwxyz')='description']/@content"
    )
    scrubbed = barebones(html)

    combined = "%s %s %s %s" % (title, description, scrubbed, urlkws)
    without_breaks = combined.replace('\n', ' ')
    without_tags = re.sub('<[^<]+?>', ' ', without_breaks)
    cleaned = re.sub(' +', ' ', without_tags)

    ranked = rake.Rake("/var/pipulate/SmartStoplist.txt", 3, 4, 2).run(cleaned)

    kept = [
        entry[0] + '\n'
        for entry in ranked
        if len(entry[0].split()) > 1
        and entry[0].replace(' ', '') != brandfilter.replace(' ', '')
    ]
    return ''.join(kept) or None
def RakeExtract(text, stoppath="SmartStoplist.txt"):
    """
    Score candidate keywords with the rake module's building blocks and
    return the top third.

    Works on both Python 2 and Python 3: ``dict.items()`` and floor division
    replace ``iteritems()`` and ``/``, which fail on Python 3.

    :param text: string type
    :param stoppath:  stopword list
    :return:    a list of tuples where the 1st index is the keyword and the
                second index the score, sorted by descending score and
                truncated to ``len // 3`` entries
    """
    # Splits the text into sentences
    sentenceList = rake.split_sentences(text)
    stopwordpattern = rake.build_stop_word_regex(stoppath)

    # Generate candidates, then score words and candidate phrases
    phraseList = rake.generate_candidate_keywords(sentenceList,
                                                  stopwordpattern)
    wordscores = rake.calculate_word_scores(phraseList)
    keywordcandidates = rake.generate_candidate_keyword_scores(
        phraseList, wordscores)

    sortedKeywords = sorted(keywordcandidates.items(),
                            key=operator.itemgetter(1),
                            reverse=True)

    # Keep the best-scoring third (integer division so the slice bound is an int).
    return sortedKeywords[:len(sortedKeywords) // 3]
Exemplo n.º 11
0
def get_keywords(text):
    """Return the first three entries RAKE produces for ``text``.

    The stoplist is ``<BASE_DIR>/job_match/stop_words.txt``.
    """
    stop_words_file = os.path.join(BASE_DIR, 'job_match', 'stop_words.txt')
    min_chars = 3      # at least this many characters in each keyword
    max_words = 5      # at most this many words in a phrase
    min_frequency = 2  # keyword must appear at least this many times

    extractor = rake.Rake(stop_words_file, min_chars, max_words, min_frequency)
    ranked = extractor.run(text)
    return ranked[:3]
Exemplo n.º 12
0
def get_keywords(paragraph):
    """Return a one-element list with the text of the first RAKE keyword.

    An empty list is returned when RAKE yields no keywords for ``paragraph``.
    """
    extractor = rake.Rake("SmartStoplist.txt", 3, 1, 1)
    for entry in extractor.run(paragraph):
        # Only the first result is wanted, so stop immediately.
        return [entry[0]]
    return []
Exemplo n.º 13
0
def main():
    """Run ``process`` on every line of standard input and print each result.

    One RAKE object is created up front and reused for all lines. If anything
    fails, ``FAIL!`` followed by the line being handled is written to stderr
    and the exception is re-raised.
    """
    # Pre-bind ``line`` so the handler cannot hit an unbound name when the
    # failure happens before the first line is read (e.g. in rake.Rake).
    line = ""
    try:
        rake_object = rake.Rake("SmartStoplist.txt", 5, 3, 1)
        for line in sys.stdin:
            print(process(rake_object, line.strip()))
    except Exception:
        print("FAIL! " + line, file=sys.stderr)
        raise
Exemplo n.º 14
0
def get_keyword_confidence(message):
    """Map each RAKE keyword of ``message`` scoring at least 3 to its score.

    Scores in the returned dict are rounded to one decimal place.
    """
    scored = rake.Rake("SmartStoplist.txt", 3, 3, 1).run(message)
    return {
        entry[0]: round(entry[1], 1)
        for entry in scored
        if entry[1] >= 3
    }
Exemplo n.º 15
0
 def rakes(raw_string):
     """Return a dict mapping every RAKE keyword of ``raw_string`` to its score."""
     extractor = rake.Rake(
         "/home/anubhav/Desktop/Maui Final/SmartStoplist.txt", 3, 2, 1)
     return {entry[0]: entry[1] for entry in extractor.run(raw_string)}
def autoTag():
    """Return up to ten shortened RAKE phrases for the submitted abstract as JSON.

    The abstract is read from ``request.values['abstract']``, run through
    RAKE and ``shortPhrases(keywords, 2)``. The resulting items are sorted by
    their second element in ascending order and the first elements of the
    last (highest) ten are returned under the ``keywords`` key.

    Compared with the previous version this runs on Python 3 as well
    (``print`` call, no subscripting of ``zip``) and returns an empty list
    instead of raising ``IndexError`` when there are no phrases.
    """
    abstract = request.values['abstract']
    rake_object = rake.Rake("SmartStoplist.txt")
    print(rake_object)
    keywords = rake_object.run(abstract)
    shortenedPhrasesList = shortPhrases(keywords, 2)

    ranked = sorted(shortenedPhrasesList, key=lambda arr: arr[1])
    # A negative-start slice already copes with fewer than ten items.
    top_phrases = [arr[0] for arr in ranked[-10:]]
    return jsonify(keywords=top_phrases)
Exemplo n.º 17
0
def findSubject(post):
    """Return the RAKE keywords found in ``post``.

    RAKE is initialised with the ``SmartStoplist.txt`` stopwords file and the
    values 2, 2, 2 for its remaining arguments, then run on the given text.
    """
    return rake.Rake("SmartStoplist.txt", 2, 2, 2).run(post)
Exemplo n.º 18
0
	def getKeywords(self,content):
		"""Return the RAKE keywords of ``content`` whose score is in the top half.

		The distinct scores are sorted in descending order and the first
		``round(0.5 * number_of_distinct_scores)`` of them are kept; every
		keyword carrying one of those scores is returned, in RAKE's order.
		"""
		rake_object = rake.Rake("SmartStoplist.txt",3,2)
		keywords = rake_object.run(content)
		keywordWeights = sorted({word[1] for word in keywords}, reverse=True)
		# int() keeps the slice bound an integer on Python 2, where round()
		# returns a float.
		requiredCount = int(round(0.5*len(keywordWeights)))

		# A set makes the per-keyword membership test constant time.
		requiredWeights = set(keywordWeights[:requiredCount])

		return [word[0] for word in keywords if word[1] in requiredWeights]
def process(user):
    """Render ``postfinal.html`` for a Stack Overflow user and a submitted question.

    The question is read from ``request.form['question']`` and run through
    RAKE; the keywords are printed and passed to the template together with
    the user object, the site object, the site's recent questions and the
    original ``user`` argument (as ``uname``).

    :param user: identifier handed to ``site.user``
    :return: the rendered template
    """
    # NOTE(review): hard-coded credential -- consider loading it from
    # configuration or the environment instead of source code.
    API_KEY = ")e55ob6fBvCtSTibWPyP*A(("
    site = stackexchange.Site(stackexchange.StackOverflow, API_KEY,
                              impose_throttling=True)
    uname = user
    user = site.user(user)
    question = request.form['question']
    rake_object = rake.Rake("SmartStoplist.txt", 3, 5, 1)
    keywords = rake_object.run(question)
    # Single formatted argument: same output as the old Python 2 print
    # statement, and valid on Python 3.
    print("keywords:  %s" % (keywords,))
    recent = site.recent_questions()
    return render_template('postfinal.html', user=user, site=site,
                           keywords=keywords, question=question,
                           recent=recent, uname=uname)
Exemplo n.º 20
0
def execute_rake(text):
    """Return RAKE keywords of ``text`` with scores normalised by the top score.

    :param text: text to analyse
    :return: list of ``(keyword, score / max_score)`` tuples sorted by
        descending normalised score; an empty list when RAKE finds nothing
        (previously ``max()`` raised ``ValueError`` in that case)
    """
    stoppath = "StemStoplist.txt"
    rake_object = rake.Rake(stoppath, max_words_length=3)

    keywords = rake_object.run(text)
    if not keywords:
        return []

    max_value = max(item[1] for item in keywords)
    normalized_keywords = [(word[0], word[1] / max_value) for word in keywords]
    normalized_keywords.sort(key=lambda x: x[1], reverse=True)

    return normalized_keywords
Exemplo n.º 21
0
 def rake_keywords(self):
     """Get keywords according to rake.

     RAKE is run on ``self.primary_text``; the result is stored in
     ``self.keywords_rake_list``, truncated to ``self.number_of_keywords``
     entries when that limit is usable, then printed and returned.
     """
     stoppath = "data/stoplists/SmartStoplist.txt"
     rake_object = rake.Rake(stoppath, 3, 3, 4)
     self.keywords_rake_list = rake_object.run(self.primary_text)
     try:
         self.keywords_rake_list = self.keywords_rake_list[:self.
                                                           number_of_keywords]
     except (AttributeError, TypeError):
         # Best effort: keep the full list when the limit is missing or is
         # not a valid slice bound. Narrowed from a bare ``except`` so that
         # unrelated errors and interrupts are no longer swallowed.
         pass
     print("Keywords according to rake:", self.keywords_rake_list)
     return self.keywords_rake_list
Exemplo n.º 22
0
def extractKeyWords(description):
    """Return the keyword text of every RAKE result for ``description``.

    The ``expStopList.txt`` stoplist is used. Scores are dropped; only the
    first element of each result tuple is kept, in RAKE's order. An unused
    hard-coded sample string was removed from the function.
    """
    stoppath = "expStopList.txt"
    rake_object = rake.Rake(stoppath, 3, 3, 1)
    results = rake_object.run(description)
    return [result[0] for result in results]
Exemplo n.º 23
0
def keyword_tokenize(text):
    '''
    Split ``text`` into RAKE candidate phrases.

    INPUT: String
    OUTPUT: the candidate keyword list produced by
            ``rake.generate_candidate_keywords``

    The Rake object and the word scores that used to be computed here were
    never used and have been dropped.
    NOTE(review): this assumes those rake calls have no side effects the
    caller relies on -- confirm against the rake module in use.
    '''
    stoppath = 'RAKE-tutorial/SmartStoplist.txt'
    sentenceList = rake.split_sentences(text)
    stopwordpattern = rake.build_stop_word_regex(stoppath)
    return rake.generate_candidate_keywords(sentenceList, stopwordpattern)
Exemplo n.º 24
0
def main():
    """Print the RAKE keywords of the first data file matching ``sys.argv[1]``.

    Files in ``dataDir`` are scanned in ``os.listdir`` order; the first one
    whose name contains the command-line argument is read, analysed and its
    keywords printed. Nothing happens when no file matches.
    """
    fn = sys.argv[1]
    for filename in os.listdir(dataDir):
        if fn not in filename:
            continue

        filepath = os.path.join(dataDir, filename)
        rake_object = rake.Rake(stoppath, 5, 2, 2)
        # The context manager closes the file, which was previously left open.
        with io.open(filepath, 'r') as sample_file:
            text = sample_file.read()
        keywords = rake_object.run(text)
        print(keywords)
        break
Exemplo n.º 25
0
def getKeywords(text):
    """Return the keyword strings RAKE extracts from ``text``.

    Each RAKE result is a tuple; only its first element is kept, converted
    with ``str``. The order of RAKE's output is preserved.
    """
    extractor = rake.Rake("SmartStoplist.txt", 2, 3, 4)
    return [str(entry[0]) for entry in extractor.run(text)]
Exemplo n.º 26
0
def make_vocab(data_file):
    """Write the ``vocab`` and ``keyword`` files for the stories listed in ``data_file``.

    For every story name returned by ``read_text_file(data_file)`` the
    tokenized story is looked up in the CNN directory first and the Daily
    Mail directory second. Article and abstract tokens are counted into the
    vocabulary; the word-extract lines of all stories are concatenated and
    run through RAKE to produce the keyword list.

    Output, both under ``finished_files_dir``:
      * ``vocab``   -- ``<word> <count>`` lines for the ``VOCAB_SIZE`` most
        common tokens
      * ``keyword`` -- up to ``VOCAB_SIZE`` RAKE keywords, one per line,
        skipping those starting with ``@entity``

    :param data_file: file listing the story names to process
    :raises IOError: if a listed story exists in neither directory
        (previously the path of the preceding story was silently reused)
    """
    vocab_counter = collections.Counter()
    # Collected in a list and joined once to avoid quadratic concatenation.
    extract_parts = []

    data_list = read_text_file(data_file)

    for s in data_list:
        if os.path.isfile(os.path.join(cnn_tokenized_stories_dir, s)):
            story_file = os.path.join(cnn_tokenized_stories_dir, s)
        elif os.path.isfile(os.path.join(dm_tokenized_stories_dir, s)):
            story_file = os.path.join(dm_tokenized_stories_dir, s)
        else:
            raise IOError(
                "Tokenized story file %s not found in %s or %s" %
                (s, cnn_tokenized_stories_dir, dm_tokenized_stories_dir))

        article_lines, abstract, word_extract_lines = get_art_abs(story_file)

        article = ' '.join(article_lines)
        word_extract = ' '.join(word_extract_lines)

        # NOTE(review): the replacement strings are bytes literals; this
        # presumably relies on Python 2 str/bytes equivalence -- confirm
        # before running on Python 3.
        article = _DIGIT_RE.sub(b" 0", article)
        word_extract = _DIGIT_RE.sub(b" 0 ", word_extract)

        extract_parts.append(word_extract)

        tokens = article.split(' ') + abstract.split(' ')
        tokens = [t.strip() for t in tokens]  # strip
        tokens = [t for t in tokens if t != ""]  # remove empty
        vocab_counter.update(tokens)

    # Same text as repeated ``extract + ' ' + word_extract``: every part is
    # preceded by one space.
    extract = ''.join(' ' + part for part in extract_parts)

    print("Writing vocab file...")
    with open(os.path.join(finished_files_dir, "vocab"), 'w') as writer:
        for word, count in vocab_counter.most_common(VOCAB_SIZE):
            writer.write(word + ' ' + str(count) + '\n')
    print("Finished writing vocab file")

    rake_object = rake.Rake("RAKE-tutorial/SmartStoplist.txt", 1, 1, 1)
    rake_keywords = rake_object.run(extract)
    keys = [k[0] for k in rake_keywords if not k[0].startswith('@entity')]
    keys = keys[:VOCAB_SIZE]

    print("Writing keyword file...")
    with open(os.path.join(finished_files_dir, "keyword"), 'w') as writer:
        for word in keys:
            writer.write(word + '\n')
    print("Finished writing keyword file")
Exemplo n.º 27
0
def findKeywords(company_news):
    """Return the text of the first ten RAKE keywords found in ``company_news``.

    RAKE is created with only the stoplist path, so its remaining settings
    are the rake module's defaults.

    The previous version also read a hard-coded sample document (without
    closing it), ran RAKE on it, and rebuilt candidate scores by hand, but
    every one of those results was discarded before returning. That dead code
    is removed, so a missing sample file no longer breaks the function.
    NOTE(review): this assumes the removed rake helper calls have no side
    effects the caller relies on -- confirm against the rake module in use.

    :param company_news: text to analyse
    :return: list of at most ten keyword strings, in RAKE's order
    """
    stoppath = "../Data/data/stoplists/SmartStoplist.txt"
    rake_obj = rake.Rake(stoppath)
    return [entry[0] for entry in rake_obj.run(company_news)[0:10]]
Exemplo n.º 28
0
def generateAnswerKeywords(correctAnswer):
    """Return the individual words of every RAKE keyword in ``correctAnswer``.

    Each RAKE result's phrase (first tuple element) is split on whitespace
    and the words are collected in order into one flat list.

    The stoplist path now uses a forward slash: the old ``"\\S"`` sequence
    was an invalid string escape and the backslash only worked as a path
    separator on Windows, while ``/`` is accepted on Windows and POSIX alike.
    """
    rake_object = rake.Rake("RAKE-tutorial/SmartStoplist.txt", 3, 3, 1)
    answerKeywordsTupleList = rake_object.run(correctAnswer)

    answerKeywords = []
    for t in answerKeywordsTupleList:
        # extend() appends in place instead of rebuilding the list each time.
        answerKeywords.extend(t[0].split())
    return answerKeywords
Exemplo n.º 29
0
def rake_call(final_string, page_no):
    """Run RAKE on ``final_string`` with a frequency threshold chosen by page count.

    The last Rake argument is 2 for up to 10 pages, 3 for up to 30, 4 for up
    to 100 and 8 otherwise; the two arguments before it are both 5. The RAKE
    output is passed through ``post_process`` and that result is returned.
    """
    min_chars = 5
    max_words = 5

    # (inclusive page limit, frequency) pairs checked in ascending order.
    for page_limit, frequency in ((10, 2), (30, 3), (100, 4)):
        if page_no <= page_limit:
            min_freq = frequency
            break
    else:
        min_freq = 8

    extractor = rake.Rake(stoppath, min_chars, max_words, min_freq)
    return post_process(extractor.run(final_string))
Exemplo n.º 30
0
def get_keyphrases():
    """Render key-phrase results for one column of an uploaded survey spreadsheet.

    All settings come from ``request.form``: the Excel file name, the column
    holding the answers (``question``), an optional ``group_by`` column and
    the numeric RAKE settings. When ``group_by`` is non-empty, RAKE is run
    once per distinct value of that column; otherwise it is run once over the
    whole column under the label ``'Results:'``. The answers of each run are
    concatenated with a leading space before every answer.

    The template ``keyphrase_result.html`` receives ``context``: a list of
    ``(label, keywords_score, keywords_counts, stem_counts)`` tuples.
    """
    stoppath = 'SmartStoplist.txt'
    form = request.form
    filename = form['name']
    surveys = pd.read_excel(filename, header=0)
    col_name = form['question']
    group_by = form['group_by']

    min_char_length = int(form['min_char_length'])
    min_words_length = int(form['min_words_length'])
    max_words_length = int(form['max_words_length'])
    min_keyword_frequency = int(form['min_keyword_frequency'])
    trade_off = float(form['trade_off'])
    top_n = int(form['top_n'])

    rake_object = rake.Rake(stoppath, min_char_length, min_words_length,
                            max_words_length, min_keyword_frequency, 1, 3, 2)

    # One (label, frame) pair per RAKE run; produced lazily for the grouped
    # case so each subset is built right before it is analysed.
    if group_by != '':
        labelled_frames = (
            (group, surveys.loc[surveys[group_by] == group])
            for group in surveys[group_by].unique()
        )
    else:
        labelled_frames = [('Results:', surveys)]

    grouped_results = []
    for label, frame in labelled_frames:
        col = frame[col_name]
        text = ''.join(" " + col[i] for i in col.index)

        keywords_score, keywords_counts, stem_counts = rake_object.run(
            text, trade_off, top_n)
        grouped_results.append(
            (label, keywords_score, keywords_counts, stem_counts))

    return render_template("keyphrase_result.html", context=grouped_results)