Example #1
    def hello(self, sentence):
        '''
        tokenized_sentence = word_tokenize(sentence)
        punctuation = re.compile(r'[-.?!,":;()|0-9]')
        tokenized_sentence = list(filter(None, tokenized_sentence))
        tokenized_sentence = [punctuation.sub("", word) for word in tokenized_sentence]
        extracted = []
        for w in tokenized_sentence:
            if (w.lower() not in stopwords.words('english') and w != ""):
                extracted.append(w)
        tagged_sent = pos_tag(extracted)
        interest_types = ["NN", "NNP", "NNS", "VBG", "VB"]
        for tagged in tagged_sent:
            word_type = tagged[1]
            if word_type in interest_types:
                if (tagged[0] not in extracted and tagged[0] != ""):
                    extracted.append(tagged[0])
        importantwords = ', '.join(extracted)
        '''
        extracted = []

        rake = Rake("SmartStoplist.txt")

        keywords = rake.run(sentence)

        return json.dumps(
            [dict(name=keyword[0], weight=keyword[1]) for keyword in keywords])
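Most examples on this page assume a python-rake style API in which Rake takes a stop-word file path and run() returns a list of (phrase, score) tuples. A minimal self-contained sketch of the pattern above (the import, stop list, and sentence are illustrative):

import json
from rake import Rake  # assumed module layout; the examples here use several RAKE forks

rake = Rake("SmartStoplist.txt")
keywords = rake.run("RAKE splits text at stop words and punctuation to score phrases")
# each entry is a (phrase, score) tuple, highest-scoring phrases first
print(json.dumps([dict(name=k[0], weight=k[1]) for k in keywords]))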
Example #3
 def generate_rake_keywords(self):
     if os.path.exists(self.keyword_file) and os.path.exists(
             self.rake_score_file):
         print('Already have the files [' + self.keyword_file + ', ' +
               self.rake_score_file + '], directly loading them.')
         self.load()
         return
     self.dataset.load()
     if self.path is None:
         self.path = self.dataset.default_path
     ger_stop_words = stopwords.words('german')
     stop_words = stopwords.words('english')
     stop_words.extend(ger_stop_words)
     stop_words.extend(['via', 'using', 'fr'])
     r = Rake(stop_words)
     r.extract_keywords_from_sentences(self.dataset.titles)
     path = os.path.join(self.path, 'keyword.dat')
     print('generate keywords', end='', flush=True)
     with open(path, 'wb') as f:
         i = 0
         for title in self.dataset.titles:
             i += 1
             if i % 100000 == 0:
                 print('.', end='', flush=True)
             phrases = r.generate_phrases(title)
             phrases = [' '.join(phrase) for phrase in phrases]
             self.keywords.append(phrases)
         pickle.dump(self.keywords, f)
     self.rake_scores = r.phrase_score
     path = os.path.join(self.path, 'rake_score.dat')
     with open(path, 'wb') as f:
         pickle.dump(self.rake_scores, f)
     print('done')
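generate_rake_keywords() defers to self.load() when both files already exist; that method is not shown here, but a counterpart matching the pickle.dump calls above would look roughly like this (hypothetical sketch):

import pickle

def load(self):
    # hypothetical counterpart to generate_rake_keywords(): read back the
    # pickled keyword lists and the RAKE phrase scores written above
    with open(self.keyword_file, 'rb') as f:
        self.keywords = pickle.load(f)
    with open(self.rake_score_file, 'rb') as f:
        self.rake_scores = pickle.load(f)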
Example #4
 def get_phrases(sents,search_text,res_ind):
     '''
     :param sents: list of sentences for search
     :param search_text: search text
     :res_ind: indices of best matching sents
     :return: phrases from query and top results
     '''
     full_text=' . '.join([sents[i] for i in res_ind])
     full_text = full_text +' . '+search_text
     rake = Rake()
     keys = rake.run(full_text)
     print(keys)
     query_phrases=[]
     query_words=word_tokenize(search_text)
     for phr,score in keys:
         words=word_tokenize(phr)
         flag_present=1
         for word in words:
             if word not in query_words:
                 flag_present=0
         if flag_present==1:
             query_phrases.append((phr,score))
     print(query_phrases)
     ###change the phrase to all possible synonyms, find the phrase with maximum match
     ###look for the nearest answer type to that phrase
     return keys
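A hedged usage sketch for get_phrases; the sentences, query, and indices are made up, and res_ind would normally come from a retrieval step:

sents = ["RAKE splits text at stop words.",
         "Keyword extraction works on a single document.",
         "The weather was pleasant."]
# pretend a retrieval step ranked sentences 0 and 1 as the best matches
phrases = get_phrases(sents, "how does keyword extraction split text", [0, 1])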
Example #5
def get_keywords(text):
    """
    Gets main keywords using RAKE Algorithm

    """
    rake = Rake("SmartStoplist.txt")
    keywords = rake.run(text)
    return [k[0] for k in keywords if len(k[0].split(" ")) <= 2 and k[1] > 1]
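The comprehension keeps only phrases of at most two words whose score exceeds 1; with illustrative scores:

keywords = [("keyword extraction", 4.0), ("text", 1.0),
            ("rapid automatic keyword extraction", 8.5)]
[k[0] for k in keywords if len(k[0].split(" ")) <= 2 and k[1] > 1]
# -> ['keyword extraction']: the single word scores too low, the long phrase has > 2 words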
Example #6
def get_keywords(text, stopwords="SmartStoplist.txt"):
    #commented out text below uses the rake-tutorial code, which I like better, but is less recently updated
    #https://github.com/zelandiya/RAKE-tutorial
    #phrase_max_words = 3
    #min_word_chars = 5
    #min_kw_repeat_rate = 4
    #rake = Rake(stopwords, min_word_chars, phrase_max_words, min_kw_repeat_rate)
    rake = Rake(stopwords)
    keywords = rake.run(text)
    return keywords
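For comparison, the RAKE-tutorial variant mentioned in the comments takes its thresholds in the constructor; a sketch assuming that repo's documented API:

import rake  # rake.py from https://github.com/zelandiya/RAKE-tutorial

# stop list, min 5 chars per word, at most 3 words per phrase, phrase seen at least 4 times
rake_object = rake.Rake("SmartStoplist.txt", 5, 3, 4)
keywords = rake_object.run(text)  # [(phrase, score), ...] sorted by descending score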
Example #8
def get_rake_kp(file_name, topk):
    json_file = open(file_name, 'r', encoding='utf-8')
    rake_kp = []
    for line in json_file.readlines():
        json_data = json.loads(line)
        cur_content = json_data['title'].strip().lower(
        ) + ' ' + json_data['abstract'].strip().lower()
        content_list = nltk.word_tokenize(cur_content)
        rake = Rake()
        keywords_dict = rake.run(cur_content)  # assumes a Rake fork whose run() returns a phrase -> score mapping
        keywords_list = list(keywords_dict.keys())[:topk]
        kp_list = get_kp(content_list, keywords_list)
        rake_kp.append(kp_list)
    json_file.close()
    return rake_kp
Example #9
def get_sentence_keyword_score(document, num_sentences):
    rake = Rake()
    keywords = rake.get_keywords(document)
    ranked_keywords = rake.generate_keyword_rank(keywords)
    sufficient_keywords_length = int(math.ceil(len(ranked_keywords) / 4.0))
    sufficient_keywords = ranked_keywords[:sufficient_keywords_length]
    total_keyword_score = 0.0
    # value of a keyword is its relative score value divided by the score of all keywords
    sentence_keyword_score = [0.0] * num_sentences
    for keyword in sufficient_keywords:
        total_keyword_score += keyword['score']
    for keyword in sufficient_keywords:
        sentence_keyword_score[
            keyword['sentence_num']] += keyword['score'] / total_keyword_score
    return sentence_keyword_score
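A small worked example of the normalization above, assuming each keyword dict carries 'score' and 'sentence_num' keys as the code implies (values illustrative):

sufficient_keywords = [{'score': 6.0, 'sentence_num': 0},
                       {'score': 3.0, 'sentence_num': 1},
                       {'score': 1.0, 'sentence_num': 1}]
# total_keyword_score = 10.0, so sentence 0 ends up with 6/10 = 0.6
# and sentence 1 with (3 + 1)/10 = 0.4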
Example #10
def extract_sentiment_nltk(reviews):
    rake = Rake("SmartStoplist.txt")
    for item_id, review in reviews:
        pos_keyword_list, neg_keyword_list = extract_keywords_with_sentiment(review, rake)
        print "pos: {}".format(pos_keyword_list)
        print "neg: {}".format(neg_keyword_list)
        print "\n"
Example #11
def extract_titles_rake(reviews):

    rake = Rake("SmartStoplist.txt")
    for item_id, review in reviews:
        print "Review: {}".format(review)
        title = extract_title_rake(review, rake)
        print title
        print "\n"
Example #12
def abstract_analyze(pdf, abstract):
    match_word_file = "Matchlist.txt"
    match = load_match_words(match_word_file)
    stop_words_path = "SmartStoplist.txt"
    r = Rake(stop_words_path)
    temp = r.run(abstract)
    matched = []
    for item in temp:
        if (item[1] >= 3):  # keep only phrases with a score of at least 3
            matched.append(item)
    flag = False
    for item in matched:
        if (item[0] in match):
            list3.append(pdf)
            flag = True
            break
    if (flag == False):
        list4.append(pdf)
Example #13
def keyword_title(title_corpus):

    ## here we need NLTK stopwords and punkt, which will be stored in /usr/share/nltk_data
    # uncomment to download

    #nltk.download('stopwords')
    nltk.download('punkt')

    title_dict = {}
    for t in title_corpus:
        key = (t[3], t[4])
        if key in title_dict:
            title_dict[key].append(t[1])
        else:
            title_dict[key] = []
            title_dict[key].append(t[1])

    # extract keywords with year span
    title_years = {}
    for k, v in title_dict.items():
        key = (k[0], )  # year index
        if key in title_years.keys():
            title_years[key].append(v)
        else:
            title_years[key] = []
            title_years[key].append(v)

    for k, v in title_years.items():
        r = Rake()
        vs = [item.rstrip('\n') for sublist in v for item in sublist]
        # a list of strings where each string is a sentence
        #r.extract_keywords_from_sentences(vs)
        #print('The keywords for year:{}'.format(str(k[0])))
        #print(r.get_ranked_phrases_with_scores()[0:10])

        title_txt = '.'.join(vs)
        title_txt = title_txt.strip('\n')
        r.extract_keywords_from_text(title_txt)
        print('The keywords for year:{}'.format(str(k[0])))
        # to get keyword phrases ranked from highest to lowest with scores
        print(r.get_ranked_phrases_with_scores()[0:10])
Example #14
def index():
        if request.method == "POST":

                job_description = request.form["description"]
                job_title = request.form["title"]
                
                rake = Rake("all_stop_words.txt")
                keyword_tuples = rake.run(job_description)
                keyword_dict = turn_tuple_to_dict(keyword_tuples)
                
                important_sentences = summarize(job_title, job_description) 
                
                common_words = get_common_words(keyword_dict, important_sentences)

               
                return render_template("results.html", 
                                    keywords=keyword_dict, 
                                    summaries=important_sentences,
                                    common_words = common_words)

        return render_template('index.html')
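turn_tuple_to_dict() is defined elsewhere in the original project; since rake.run() returns (phrase, score) tuples, it is presumably a thin mapping helper along these lines (hypothetical sketch):

def turn_tuple_to_dict(keyword_tuples):
    # hypothetical helper: map each RAKE (phrase, score) tuple to phrase -> score
    return {phrase: score for phrase, score in keyword_tuples}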
Example #15
def get_keyword(text):
    rake = Rake("SmartStoplist.txt")
    if text == "":
        return ""
    keywords = rake.run(text)
    return keywords[0][0] if keywords else ""
Example #16
from pandas import DataFrame
from rake_nltk import Rake  # assumed: the stopwords/punctuations/ranking_metric kwargs below match the rake-nltk API

#the stop-word list from nltk.corpus.stopwords.words('english'), extended with a few words specific to the source text
stopwords=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't",
 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't",'across',
 'needs','called','together','creates','tells','yet','1996','shows','following','discussed']

#punctuation marks based on string.punctuation, extended with sequences that appear in the text
punctuations=['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>','."',';}', '(...);',
 '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '<>', '[]', '()', '/*', '*/', '("', '")', ');', '//', '...','!"','•',"''",'",','""','','[])','".',
 '<<','>>','<<<','!)','(/*','*)','().','();','==']


file = open("jbn.txt","r") #text taken as data
text = file.read()
r = Rake(punctuations=punctuations,stopwords=stopwords,ranking_metric=2)
r.extract_keywords_from_text(text)

#if words are to be stored in a text file
# file1 = open("keywords.txt","w")
# for i in range(len(r.rank_list)//2): #half of the list of phrases found are considered to be as keywords 
# 	file1.write(str(r.rank_list[i][0])+"  "+r.rank_list[i][1])
# 	file1.write("\n")
# file1.close()

#if words are to be stored in an excel sheet

keywords = []
weights = []
for i in range(len(r.rank_list)//2): #the top half of the ranked phrases are treated as keywords
	keywords.append(r.rank_list[i][1])
	weights.append(r.rank_list[i][0])
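The snippet breaks off here; the DataFrame import at the top suggests the loop's results were then written out, roughly like this (hypothetical continuation, file name made up):

DataFrame({"keyword": keywords, "weight": weights}).to_excel("keywords.xlsx", index=False)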
Example #17
def get_key_phrases(document, stop_list):
    r = Rake(stop_list)
    keywords = r.run(document.lower())

    phrase_list = [word[0] for word in keywords if len(word[0].split(" ")) < 4]
    return phrase_list
Example #18
    'lol': ['league of legends', 'lol', 'riot'],
    'dota2': [
        'dota', 'dota2', 'defense of the ancients',
        'defense of the ancients 2', 'the international', 'ti7'
    ],
    'csgo': [
        'csgo', 'counter-strike', 'counter strike', 'cs-go',
        'counter-strike:global offensive'
    ],
    'overwatch': ['overwatch'],
    'wow': ['wow', 'world of warcraft'],
    'hots': ['hots', 'heroes of the storm'],
    'sc': ['starcraft 2', 'starcraft', 'sc2']
}

rake = Rake("SmartStoplist.txt", max_words_length=MAX_WORD_LENGTH)


def similar(a, b):
    if type(a) is list:
        similar_keywords = 0
        for a_val in a:
            for b_val in b:
                if Levenshtein.ratio(a_val, b_val) > KEYWORD_SIMILARITY_RATIO:
                    similar_keywords += 1
        return similar_keywords >= SIMILAR_KEYWORD_REQ
    return Levenshtein.ratio(a, b) > KEYWORD_SIMILARITY_RATIO
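A quick illustration of similar(); KEYWORD_SIMILARITY_RATIO and SIMILAR_KEYWORD_REQ are defined earlier in the original file, so the values below are assumptions:

# assuming KEYWORD_SIMILARITY_RATIO = 0.8 and SIMILAR_KEYWORD_REQ = 2:
similar("overwatch", "overwatch")                # True: Levenshtein.ratio is 1.0
similar(["dota", "the international"],
        ["dota2", "the international"])          # True: two pairs clear the 0.8 ratio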


def remove_tags(text):
    TAG_RE = re.compile(r'<[^>]+>')
    return TAG_RE.sub('', text)  # strip anything that looks like an HTML tag
Example #19
def extract_product(html_content, url):
    #String Buffer
    string_buffer = ""
    errs = list()

    #Read the page and parse it to extract product information
    parser = BeautifulSoup(html_content, "html.parser")

    #Check if the page is a product, if not skip page.
    truth, asin = check_page(parser)
    if not truth:
        errs.append("Not product")
        return (False, errs)

    #New Product as a object
    product = Product()
    #New Keyword rank
    keyword = Rake(SmartStopList.words())

    #Find URL
    product.SetUrl(url)

    #Find Brand: Note: Some products have an image for the brand
    truth, string_buffer = search_table(
        parser, {"id": "productDetails_techSpec_section_1"}, "Brand Name")
    if truth:
        product.SetBrand(string_buffer)
    else:
        string_buffer = parser.find("a", attrs={"id": "brand"})
        if string_buffer != None:
            product.SetBrand(string_buffer.get_text().strip())
        else:
            errs.append("Could not find Brand")

    #Find Title
    string_buffer = parser.find("span", attrs={"id": "productTitle"})
    if string_buffer != None:
        product.SetTitle(string_buffer.get_text().strip())
    else:
        errs.append("Could not find Title")
        return (False, errs)

    #Find Image
    string_buffer = parser.find("img", attrs={"id": "landingImage"})
    if string_buffer != None:
        string_buffer = string_buffer.get("data-old-hires")
        if len(string_buffer) < 2:
            string_buffer = parser.find("img", attrs={
                "id": "landingImage"
            }).get("data-a-dynamic-image")
            m = re.search('https://(.+?).jpg', string_buffer)
            if m:
                string_buffer = m.group(1)
                string_buffer = "https://{}.jpg".format(string_buffer)
        #print ("Img Url: "+string_buffer)
        product.SetImage(string_buffer)
    else:
        errs.append("Could not find Image")

    #Find Small Blob
    #TODO: Need to perform keyword analysis
    string_buffer = parser.find("div", attrs={"id": "feature-bullets"})
    if string_buffer != None:
        string_buffer = string_buffer.find("ul")
        try:
            string_buffer = string_buffer.find_all("li")
            if string_buffer != None:
                string_buffer_2 = ""
                for span in string_buffer:
                    string_buffer_3 = span.find("span")
                    if string_buffer_3 != None:
                        string_buffer_3 = string_buffer_3.get_text()
                        try:
                            string_buffer_2 = "{} {}".format(
                                string_buffer_2, string_buffer_3.strip())
                        except:
                            pass
                saved_buffer = string_buffer_2.strip()
                #Calculating Key Words
                keywords_1 = keyword.run(saved_buffer)
                product.SetSmallBlog(keywords_1)
        except:
            errs.append("Error finding li")
    else:
        errs.append("Could not find small section keywords")

    #Find Large Blob
    #TODO: Need to perform keyword analysis
    string_buffer = parser.find("div", attrs={"id": "productDescription"})
    if string_buffer != None:
        string_buffer = string_buffer.find("p")
    if string_buffer != None:
        string_buffer = string_buffer.get_text()
        saved_buffer = string_buffer.strip()
        #Calculating Key Words
        keywords_2 = keyword.run(saved_buffer)
        product.SetLargeBlob(keywords_2)
    else:
        errs.append("Could not find large section keywords")

    #Find ASIN
    product.SetSourceID(asin)

    #TODO: Perform price save!

    #Append the product to large list of products
    if product.FormCompleted():
        return (product, errs)
    else:
        return (False, errs)
Example #20
import datetime

from rake import Rake
from nltk.corpus import stopwords

starttime = datetime.datetime.now()
ger_stop_words = stopwords.words('german')
stop_words = stopwords.words('english')
stop_words.extend(ger_stop_words)
stop_words.extend(['via', 'using', 'fr'])
r = Rake(stop_words)
with open(r'dblp_index/title.dat', encoding='utf-8') as f_title:
    titles = []
    for line in f_title:
        titles.append(line)
r.extract_keywords_from_sentences(titles)
print('generate keywords', end='', flush=True)
with open(r'dblp_index/keywords.dat', 'a', encoding='utf-8') as keywords:
    with open(r'dblp_index/title.dat', encoding='utf-8') as titles:
        i = 0
        for line in titles:
            i += 1
            if i % 10000 == 0:
                print('.', end='', flush=True)
            phrases = r._generate_phrases(line)
            phrases_scores = []
            for phrase in phrases:
                true_phrase = '_'.join(phrase)
                score = r.phrase_score[true_phrase]
                phrases_scores.append(true_phrase + ":" + str(score))
            keywords.write(','.join(phrases_scores) + '\n')
print('done')
Example #21
from rake import Rake

rake = Rake()
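# the sample below is Japanese: a passage about the Sugiyama Kofun, a keyhole-shaped
# burial mound in Nara; the extracted keywords are therefore Japanese as well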
text = "杉山古墳(すぎやまこふん)は、奈良県奈良市大安寺にある古墳。形状は前方後円墳。大安寺古墳群を構成する古墳の1つ。国の史跡に指定されている(史跡「大安寺旧境内 附 石橋瓦窯跡」のうち)。"
print(rake.get_keywords(text, 3))
"""
Output Keyword List

['ぎやまこふん', '前方後円墳', '大安寺']
"""
Example #22
 def __getMainWords__(self, userInput):
     rake = Rake("SmartStoplist.txt")
     keywords = rake.run(userInput)
     return keywords
Example #23
#coding: utf-8

#util
from read_conf import config
import csv
from optparse import OptionParser
import cPickle as pickle
import os
import re
from itertools import combinations

#rake
from rake import Rake
rake = Rake()

#nltk
import nltk
from nltk.util import clean_html
from nltk.util import clean_url

#nlp
from nlp import nlp
mnlp = nlp()

tag_re = re.compile(r"<p>(.+?)</p>", re.DOTALL)

dp = config("../conf/dp.conf")


#this block's job is de-duplication:
#first read the titles, then compare them against the test titles to check for duplicates
Example #24
import sys
import json
import csv_io
import sets

import parser

reload(sys)
sys.setdefaultencoding("utf-8")

from textblob import TextBlob
from collections import Counter
from rake import Rake

results = {}

rake = Rake("SmartStoplist.txt")

users = parser.getUsers()

with open("data/edinburgh_restaurant_reviews.json") as f:
    data = json.loads(f.readline())

pos_polarity = 0
neg_polarity = 0
for business_id in data:
    results[business_id] = {}
    for review in data[business_id]:
        b = TextBlob(review["text"])
        if b.sentiment.polarity >= 0:
            pos_polarity += b.sentiment.polarity
        else:
            neg_polarity += b.sentiment.polarity
Example #26
chapter_tree += [{
    'sno': r'[\d]*',
    'level': -1,
    'title': 'Exercises',
    'pno': chapter_tree[-1]['pno'] + 1
}]  # to ensure the last section is processed as per my current logic

try:
    os.stat('resources/' + pdfname)
except:
    os.mkdir('resources/' + pdfname)
finally:
    os.chdir('resources/' + pdfname)

unigram_rake = Rake('../stopwords.txt', 3, 1, 3)
bigram_rake = Rake('../stopwords.txt', 3, 2, 3)
trigram_rake = Rake('../stopwords.txt', 3, 3, 2)
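# the four arguments above are (stop-word file, min chars per word, max words per
# phrase, min keyword frequency) -- apparently a RAKE-tutorial-style constructor,
# hence the separate unigram/bigram/trigram instances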

keywords = set()
if split_mode == CHAPTER_MODE:
    chapter_tree = list(filter(lambda x: x['level'] == 1, chapter_tree))

pages = pages[40:]

skip = []

preprocessed_sections = []
for i, (cur_topic,
        next_topic) in enumerate(zip(chapter_tree[:-1], chapter_tree[1:])):
    if next_topic['level'] != -1:
Example #27
    dirname = pdfname

try:
    os.stat('../resources/' + dirname)
except:
    os.mkdir('../resources/' + dirname)
finally:
    os.chdir('../resources/' + dirname)

with open('__Sections.csv', 'w') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Section No.', 'Level', 'Section', 'Page No.'])
    writer.writerows([[c['sno'], c['level'], c['title'], c['pno']]
                      for c in chapter_tree])

unigram_rake = Rake('../stopwords.txt', 3, 1, 3)
bigram_rake = Rake('../stopwords.txt', 3, 2, 3)
trigram_rake = Rake('../stopwords.txt', 3, 3, 2)

keywords = set()
if split_mode == CHAPTER_MODE:
    chapter_tree = list(
        filter(lambda x: int(x['level']) in [1, -1], chapter_tree))
else:
    chapter_tree = list(
        filter(lambda x: int(x['level']) in [1, 2, -1], chapter_tree))

preprocessed_sections = []
for i, (cur_topic,
        next_topic) in enumerate(zip(chapter_tree[:-1], chapter_tree[1:])):
    if next_topic['level'] != -1:
Example #28
def getRakeKeywords(doc):
    r = Rake(path.join(cur_dir, 'SmartStoplist.txt'))
    candidates = r.run(open(doc).read().replace('\n', ' '))
    return candidates[:300]
Example #29
    word_type = tagged[1]
    if word_type in interest_types:
        if (tagged[0] not in extracted and tagged[0] != ""):
            extracted.append(tagged[0])

importantwords = ', '.join(extracted)

# print (importantwords)

fdist = FreqDist(extracted)

# print (fdist)

# print (fdist.most_common(50))

rake = Rake("SmartStoplist.txt")

keywords = rake.run(sentence)

# print (keywords)

for keyword in keywords:
    word = keyword[0]
    # print (word)

response = requests.get('http://en.wikipedia.org/wiki/Led_Zeppelin')

soup = BeautifulSoup(response.text, "html.parser")

content = soup.find(id='mw-content-text')
Example #30
from ensemble import apply_filters

# from ensemble import ensemble_predictions
num_cores = multiprocessing.cpu_count()
chunk_size = 11000
core_id = 4
offset = 5230

preprocessed_lines = open("resources/data/social_text_tokenized_pos",
                          "r").readlines()
rake_data_lines = open("resources/data/social_text_tokenized", "r") \
    .readlines()
lemma_lines = open("resources/data/social_text_tokenized_lemma",
                   "r").readlines()

rake = Rake("resources/CombinedStopList")
#rake_out_f = open("resources/data/social_text_tokenized_rake_filtered_" + str(core_id + 1), "a")
topic_rank_out_f = open(
    "resources/data/social_text_tokenized_pos_topicrank_filtered_" +
    str(core_id + 1), "a")
kpminer_out_f = open(
    "resources/data/social_text_tokenized_pos_kpminer_filtered_" +
    str(core_id + 1), "a")

input_file_path = "resources/sample_jd_preprocessed"
meta_file_path = "resources/sample_jd_meta_data"

freq_unigrams = get_freq_words()

new_preprocessed_lines = preprocessed_lines[
    chunk_size * core_id +
Example #31
 def __init__(self):
     self.model = Rake()
Example #32
import pandas as pd
import numpy as np
import pdb
from nltk import RegexpTokenizer
import re
from Tokenizers import SynonymTokenizer
from Tokenizers import SynonymStemTokenizer
from Tokenizers import StemTokenizer
from nltk import word_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet as wn
from rake import Rake
from nltk.stem import WordNetLemmatizer
from nltk.tag.stanford import NERTagger

rr = Rake()
class AnswerProcessor(object):
    def __init__(self,query,answers,score=None):
        '''answers are a list of strings, query is a string, score is a list of scores for each answer '''
        self.query=query
        self.answers=answers
        self.score = score
        #self.question_types = {'who':'PERSON','whom':'PERSON','whose':'PERSON','where':'LOCATION'\
        #	,'when':('DURATION','DATE'),'how+adj/adv':'NUMBER','how long':'DURATION','how many':'NUMBER','how much':'NUMBER'}
        self.question_types = {'who':'PERSON','whom':'PERSON','whose':'PERSON','where':'LOCATION'\
        	,'when':'CD','how+adj/adv':'CD','how long':'CD','how many':'CD','how much':'CD'}
                ###for what, next noun will be the thing we are searching for
        self.question_type=None
        self.query_tag=None
        self.answers_tag=None
    def stringProcessing(self,only_query=1):
        ''' query is a string, answers is a list of strings. returns tuples with tags, with a list covering '''