Example #1
    def test_repeated_keywords(self):
        text = self._get_text_from_test_data("testrepeatedkeywords.txt")

        kwds = keywords(text)
        self.assertTrue(len(kwds.splitlines()))

        kwds_u = keywords(utils.to_unicode(text))
        self.assertTrue(len(kwds_u.splitlines()))

        kwds_lst = keywords(text, split=True)
        self.assertTrue(len(kwds_lst))
Example #2
    def test_keywords_runs(self):
        text = self._get_text_from_test_data("mihalcea_tarau.txt")

        kwds = keywords(text)
        self.assertTrue(len(kwds.splitlines()))

        kwds_u = keywords(utils.to_unicode(text))
        self.assertTrue(len(kwds_u.splitlines()))

        kwds_lst = keywords(text, split=True)
        self.assertTrue(len(kwds_lst))
Example #3
    def test_keywords_ratio(self):
        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
            text = f.read()

        # Check ratio parameter is well behaved.  Because length is taken on tokenized clean text
        # we just check that ratio 20% is twice as long as ratio 10%
        # Values of 10% and 20% were carefully selected for this test to avoid
        # numerical instabilities when several keywords have almost the same score
        selected_docs_12 = keywords(text, ratio=0.1, split=True)
        selected_docs_21 = keywords(text, ratio=0.2, split=True)

        self.assertAlmostEqual(float(len(selected_docs_21)) / len(selected_docs_12), float(21) / 12, places=1)
Example #4
    def test_keywords_runs(self):
        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt")) as f:
            text = f.read()

        kwds = keywords(text)
        self.assertTrue(len(kwds.splitlines()))

        kwds_u = keywords(utils.to_unicode(text))
        self.assertTrue(len(kwds_u.splitlines()))

        kwds_lst = keywords(text, split=True)
        self.assertTrue(len(kwds_lst))
Example #5
def results():
    # get data
    URLS = ['https://www.binance.com/en', 'http://www.supermap.com']
    ATTRIBUTES = ['description', 'keywords', 'Description', 'Keywords']
    collected_data = []
    res = []
    data = request.form['command']
    # ..............................................
    URLS = [data]
    for url in URLS:
        entry = {'url': url}
        try:
            r = requests.get(url)
        except Exception as e:
            res = 'Could not load page {}. Reason: {}'.format(url, str(e))
            print('Could not load page {}. Reason: {}'.format(url, str(e)))
            return render_template('results.html', predictions=res)
        if r.status_code == 200:
            soup = BeautifulSoup(r.content, 'html.parser')
            meta_list = soup.find_all("meta")
            for meta in meta_list:
                if 'name' in meta.attrs.keys() and meta.attrs['name'].strip(
                ).lower() in ['description', 'keywords']:
                    name = meta.attrs['name']
                    entry[name.lower()] = meta.attrs['content']
            # if len(entry) == 3:
            collected_data.append(entry)
            # else:
            #     print('Could not find all required attributes for URL {}'.format(url))
            #     res = 'Could not find all required attributes for URL {}'.format(url)
            #     return render_template('results.html',predictions=res)
        else:
            print('Could not load page {}.Reason: {}'.format(
                url, r.status_code))
            res = 'Could not load page {}.Reason: {}'.format(
                url, r.status_code)
            return render_template('results.html', predictions=res)
    print('Collected meta attributes (TODO - push to DB):')
    for entry in collected_data:
        print(entry)
        print("Summary ")

        # Textrank method
        print(keywords(str(entry)).split('\n'))
        print('\n')
        # KeyBERT method
        from keybert import KeyBERT
        model = KeyBERT('distilbert-base-nli-mean-tokens')
        print(
            model.extract_keywords(str(entry),
                                   keyphrase_ngram_range=(1, 2),
                                   stop_words=None))
        print('\n')
        res = model.extract_keywords(str(entry),
                                     keyphrase_ngram_range=(1, 2),
                                     stop_words=None)

    return res
Example #6
    def extract_keywords(self, text):
        """
        Extract keywords from the given text based on gensim word scoring.
        Ratio is set to 0.1 so that only the highest-scoring keywords are returned.

        Returns a list of keywords.
        """
        return keywords(text, split=True, ratio=0.1)
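For context, a minimal standalone sketch of the same call outside the class; it assumes gensim < 4.0 (the gensim.summarization module was removed in 4.0), and the sample text is made up for illustration:

from gensim.summarization import keywords

sample_text = (
    "Challenges in natural language processing frequently involve "
    "speech recognition, natural language understanding and natural "
    "language generation, and keyword extraction helps surface the "
    "most relevant terms in such documents."
)

# split=True returns a Python list instead of a newline-joined string;
# ratio=0.1 keeps roughly the top 10% of scored words (very short inputs may yield few or none)
print(keywords(sample_text, split=True, ratio=0.1))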
Example #7
def SummarizerAndkeyworder(text):
    # convert text into string format (explicit)
    text = str(text)
    print('\n\n#Summary:\n\n')
    summary = summarize(text, ratio=0.1)
    print(summary)
    print('\n\n#keywords\n\n')
    print(keywords(text, ratio=0.1))
Example #8
 def textrank(self):
     string = " "
     stop_words = set(stopwords.words('english'))
     for sent in self.sents:
         for token in sent:
             if token not in stop_words:
                 string += " " + token
     return keywords(string).split('\n')
Example #9
def keyWords_Labels_Matching(Country, gallery_id):
    DocList, Data = Load_GalLery_Textual_Data(Country, gallery_id)
    S1, Data1 = Load_GoogleVision_Labels(Country, gallery_id)
        
    data_lemmatized = [w for doc in PrepareData(DocList) for w in doc]
    
    print (data_lemmatized)
    
    fullStr = ' '.join(data_lemmatized)
    
    #labels = [Preprocessing(x['label']) for x in S1[0]]
    #labels.append(Preprocessing(S1[1]))

    labels = [w for label in PrepareData(S1) for w in label]
        
    setA = list(set(labels))
    
    setB = keywords(fullStr).split('\n')

    setB = [w for docs in PrepareData(setB) for w in docs]
  
    overlap = 0
    
    for l in setA:
        for w in setB:
            if fuzz.ratio(l, w) >= 75:
               overlap += 1
               
    universe = []
    
    uni = list(set(setA) | set(setB))
        
    for i in range(len(uni)):
        if uni[i] not in universe:
           universe.append(uni[i]) 
        for j in range(i+1,len(uni)):
            if fuzz.ratio(uni[i], uni[j]) >= 75 and uni[j] not in universe:
               universe.append(uni[j])
               
    universe = len(universe)
    
    labels = round(float(overlap) / len(setA) * 100., 2)
    comments = round(float(overlap) / len(setB) * 100., 2)
    overall = round(float(overlap) / float(universe) * 100., 2)
        
    #print ('overlap = ',overlap)
    #print ('universe = ',universe)
    
    #print ('\nLabels = ',len(setA))
    #print ('Comments = ',len(setB))

    #print ('overlap(Labels,Comments)/Labels = ',labels)
    #print ('overlap(Labels,Comments)/Comments = ',comments)
    
    print ('overlap(Labels,Comments)/Universe(Labels,Comments) = ',overall)
    
    
    return labels,comments,overall,setA,setB
Example #10
def crwallNews():
    req = requests.get('https://www.reuters.com/news/world')
    req.encoding = 'utf-8'
    title = []
    title_kor = []
    keyword = []
    keyword_kor = []
    summary = []
    summary_kor = []
    upload_day = []
    href = []
    html = req.text
    soup = BeautifulSoup(html, 'html.parser')
    posts = soup.select('.story-content a ')

    r = Rake()

    for post in posts:
        if 'href' in post.attrs:
            plain_title = post.get_text().replace("\t", "").replace("\n", "")
            plain_href = 'https://www.reuters.com/news/world' + str(
                post.attrs['href'])

            # crawl the article body: fetch the page, then parse its HTML
            article = requests.get(plain_href)
            bsObject = BeautifulSoup(article.text, "html.parser")
            body = bsObject.find_all('p', 'ArticleBody-para-TD_9x')
            bodyText = []  # article body paragraphs
            for paragraph in body:
                bodyText.append(paragraph)
            bodyText = str(bodyText)

            bodyText = re.sub('<.+?>', '', bodyText, 0, re.I | re.S)  # strip tags
            # keyword extraction
            r.extract_keywords_from_text(bodyText)  # extract keywords from the body
            words = r.get_ranked_phrases()
            keyword.append(''.join(keywords(
                ' '.join(words[0:3])).split('\n')))  # keywords drawn from the top 3 phrases

            # TODO: summary.append(...) -- generate and store the article summary here
            href.append(plain_href)
            title.append(plain_title)
            title_kor.append(''.join(eng2kr(post.get_text())))
            upload_day.append(datetime.datetime.utcnow())

    latest = pd.DataFrame({
        "href": href,
        "title": title,
        "title_kor": title_kor,
        "upload_day": upload_day
    })

    latest = latest.fillna(0)
    latest = latest[latest['title'].isin(findMongo()) == False]
    print(latest)
    latest.reset_index(inplace=True)
    data_dict = latest.to_dict("records")
    print(data_dict)
    return data_dict
Example #11
def get_keywords(content):
    try:
        keys = keywords(content,
                        words=10,
                        split='\n',
                        pos_filter=('NN', 'NNS', 'NNP', 'NNPS', 'VB', 'VBD',
                                    'VBG', 'VBN'),
                        lemmatize=True)
    except ZeroDivisionError:
        keys = []
    except IndexError as e:
        keys = keywords(content,
                        ratio=1,
                        split='\n',
                        pos_filter=('NN', 'NNS', 'NNP', 'NNPS', 'VB', 'VBD',
                                    'VBG', 'VBN'),
                        lemmatize=True)
    return keys
Example #12
def return_keywords(texts):
    xkeywords = []
    try:
        # keywords() can fail on empty or very short input, hence the fallback
        values = keywords(text=preprocess(texts), split='\n', scores=True)
        for x in values[:10]:
            xkeywords.append(x[0])
        return xkeywords
    except Exception:
        return "no content"
Example #13
    def post():
        posted_data = request.get_json()
        text = posted_data['text']
        
        text = (keywords(text))

        return jsonify({
            'Keywords': text
        })
Example #14
def txtsummarise():

    txt = text.get('1.0', END)

    text1.delete('1.0', END)
    text1.insert(END, summarize(txt, ratio=0.3))

    text2.delete('1.0', END)
    text2.insert(END, keywords(txt, ratio=0.3))
Example #15
def many_keywords_w2v(text):	
	'''Iterates over the words in one question: extracts the keywords, their scores, and their word vectors.'''
	keyword_list = keywords(text, ratio=0.2, words=None, split=False, scores=True, pos_filter=None, lemmatize=True, deacc=True)
	keyword_list_w2v = []
	for keyword ,score in keyword_list:
		word_vector = one_keyword_w2v(keyword)
		if type(word_vector) != int:
			keyword_list_w2v.append([keyword, score, word_vector])
	return keyword_list_w2v
Example #16
def extract_keyphrase_list(text_string, ratio=0.5, min_phrase_length=1):
    keyword_list = keywords(text_string, ratio=ratio, split=True, scores=True, lemmatize=True)
    filtered_keyword_list = []
    for keyword_tuple in keyword_list:
        keyword = keyword_tuple[0].lower().strip()
        if len(keyword.split(' ')) >= min_phrase_length:
            filtered_keyword_list.append(keyword)
    return filtered_keyword_list
Example #17
def getKeywords(text):

    nltk.download('wordnet')
    tokenized = nltk.tokenize.word_tokenize(text)
    stemmer = SnowballStemmer("english", ignore_stopwords=False)
    rawkws = summary.keywords(text)
    keyws = rawkws.split()

    return keyws
Example #18
def keywords_reshape(searchText, data_dict):
	"""
	Return a list of overarching topics
	"""
	running_kwds=[]
	for k,v in data_dict.items():
	    running_kwds = running_kwds + keywords(v, split=True)

	return list(set(running_kwds))
Example #19
 def get_keywords_textrank(text):
     text_keys = keywords(text,
                          ratio=1,
                          lemmatize=True,
                          scores=True,
                          split=True,
                          pos_filter=())
     text_keys = [tup[0] for tup in text_keys[:10]]
     return text_keys
Example #20
 def gen_keyword_list(self, para_list, ratio=1):
     keyword_list = []
     keyword_ratio = ratio
     print('generating default keyword...')
     for para in tqdm(para_list):
         line = re.sub("<unk>\s?", '', para)
         keyword_list.append(keywords(line, ratio=keyword_ratio,
                                      split=True))
     return keyword_list
Example #21
    def test_keywords_ratio(self):
        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

        with utils.open(os.path.join(pre_path, "mihalcea_tarau.txt"),
                        mode="r") as f:
            text = f.read()

        # Check ratio parameter is well behaved.  Because length is taken on tokenized clean text
        # we just check that ratio 20% is twice as long as ratio 10%
        # Values of 10% and 20% were carefully selected for this test to avoid
        # numerical instabilities when several keywords have almost the same score
        selected_docs_12 = keywords(text, ratio=0.1, split=True)
        selected_docs_21 = keywords(text, ratio=0.2, split=True)

        self.assertAlmostEqual(float(len(selected_docs_21)) /
                               len(selected_docs_12),
                               float(21) / 12,
                               places=1)
Example #22
    def _prepare_keywords(self, filepath=None):
        if filepath is None:
            filepath = MSG_FILE
        text = ''
        for line in get_lines(filepath):
            if '?' in line:
                text += line

        self.kwds = set(keywords(text).split())
Example #23
def get_speech():
    global out
    driver.get("https://www.moneycontrol.com/annual-report/" +
               company_name+"/directors-report/"+cd+"#"+cd)
    
    # director_speech
    director_speech = driver.find_element_by_xpath(
        '//div[@class="report_data"]').text
    d = re.match('.*\\n', director_speech).group()
    # ds = director_speech.rstrip("\n")
    # ds=re.sub('\n',' ',director_speech)
    ds = re.sub(d, ' ', director_speech)

    driver.get("https://www.moneycontrol.com/annual-report/" +
               company_name+"/chairmans-speech/"+cd+"#"+cd)
    chairman_speech = driver.find_element_by_xpath(
        '//div[@class="report_data"]').text
    c = re.match('.*\\n', chairman_speech).group()
    # cs=re.sub('\n',' ',chairman_speech)
    cs = re.sub(c, " ", chairman_speech)

    ds_keyword_list = keywords(ds, words=20, split=True, lemmatize=True)
    cs_keyword_list = keywords(cs, words=20, split=True, lemmatize=True)
    # keywords from whole chairman's speech
    # print(keyword_list)

    ds_keyword_tags = dict(nltk.pos_tag(ds_keyword_list))
    cs_keyword_tags = dict(nltk.pos_tag(cs_keyword_list))

    ds_keywords_final = [
        word for word in ds_keyword_tags.keys() if ds_keyword_tags[word] == 'NN']
    cs_keywords_final = [
        word for word in cs_keyword_tags.keys() if cs_keyword_tags[word] == 'NN']
    # also need to remove company name if there in the list

    # print(keywords_final[:5])
    # summarization
    ds_summ = summarize(ds, word_count=100)
    cs_summ = summarize(cs, word_count=100)

    out = json.dumps([{'summary': cs_summ, 'keywords': cs_keywords_final[:5], 'fullCont':cs}, {
                     'summary': ds_summ, 'keywords': ds_keywords_final[:5], 'fullCont':ds}])
    return out
Example #24
def crwallNews():
    req = requests.get('https://www.reuters.com/news/world')
    req.encoding = 'utf-8'
    title = []
    title_kor = []
    keyword = []
    keyword_kor = []
    summary = []
    summary_kor = []
    upload_day = []
    r = Rake()
    href = []
    plain_text_list = []  # one article body per post, so every DataFrame row gets its own text
    html = req.text
    soup = BeautifulSoup(html, 'html.parser')
    posts = soup.select('.story-content a ')
    for i in posts:
        if 'href' in i.attrs:
            plain_title = i.get_text().replace("\t", "").replace("\n", "")
            plain_href = 'https://www.reuters.com/news/world' + str(
                i.attrs['href'])
            plain_text = crwallbody(plain_href)
            plain_text_list.append(plain_text)
            # summary
            r.extract_keywords_from_text(plain_text)
            summary_temp = ','.join(r.get_ranked_phrases()[:3])
            summary.append(summary_temp)
            summary_kor.append(''.join(eng2kr(summary_temp)))
            # keyword
            keyword_temp = ','.join(keywords(plain_text).split('\n')[:3])
            keyword.append(
                keyword_temp.replace('reuters',
                                     '').replace('news',
                                                 '').replace('provider', ''))
            keyword_kor.append(''.join(eng2kr(keyword_temp)))
            href.append(plain_href)
            title.append(plain_title)
            title_kor.append(''.join(eng2kr(i.get_text())))
            upload_day.append(datetime.datetime.utcnow())

    latest = pd.DataFrame({
        'href': href,
        'title': title,
        'title_kor': title_kor,
        'summary': summary,
        'keyword': keyword,
        'keyword_kor': keyword_kor,
        'summary_kor': summary_kor,
        'upload_day': upload_day,
        'plain_text': plain_text_list
    })
    latest = latest.fillna(0)
    latest = latest[latest['title'].isin(findMongo()) == False]
    print(latest)
    latest.reset_index(inplace=True)
    data_dict = latest.to_dict("records")
    print(data_dict)
    return data_dict
Example #25
    def test_text_keywords_words(self):
        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
            text = f.read()

        # request 15 keywords; the keyphrase-merging step means the returned count may differ slightly
        generated_keywords = keywords(text, words=15, split=True)

        self.assertEqual(len(generated_keywords), 16)
Example #26
    def test_text_summarization_raises_exception_on_short_input_text(self):
        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

        with utils.smart_open(os.path.join(pre_path, "testsummarization_unrelated.txt"), mode="r") as f:
            text = f.read()

        # Keeps the first 8 sentences to make the text shorter.
        text = "\n".join(text.split('\n')[:8])

        self.assertTrue(keywords(text) is not None)
Example #27
 def get_keywords(self, text, ratio):
     # In this method we use the gensim library's keyword-extraction mechanism
     # to find the most important non-stop-word terms in the text.
     cleaned_text = self.cleanText(text)
     text_keywords = keywords(cleaned_text, ratio=ratio).split("\n")
     valid_keywords = []
     for keyword in text_keywords:
         if keyword not in self.stop_words:
             valid_keywords.append(keyword)
     return valid_keywords
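The same idea without the class wrapper, as a hedged sketch: it assumes gensim < 4.0 and that NLTK's English stop-word list has been downloaded (both assumptions, not part of the snippet above):

from gensim.summarization import keywords
from nltk.corpus import stopwords  # requires nltk.download('stopwords')


def get_keywords_standalone(text, ratio=0.2):
    # extract candidate keywords with gensim's TextRank, then drop stop words
    stop_words = set(stopwords.words('english'))
    candidates = keywords(text, ratio=ratio).split("\n")
    return [kw for kw in candidates if kw and kw not in stop_words]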
Example #28
def analyze(message):
    keyword = ''
    summary = ''
    alert = ''
    Polarity = ''
    Subjectivity = ''
    if len(message) <= 2:
        alert = 'enter text with more than 2 characters'
        print(alert)
    elif len(message) < 100:
        overview = TextBlob(message)
        Polarity = round(overview.sentiment.polarity, 2)
        Polarity = str(Polarity)
        Subjectivity = round(overview.sentiment.subjectivity, 2)
        Subjectivity = str(Subjectivity)
    elif len(message) < 250:
        Keywords = keywords(message)
        Keywords = str(Keywords)
        new = Keywords.split('\n')
        for word in new:
            keyword = keyword + word + ', '
        overview = TextBlob(message)
        Polarity = round(overview.sentiment.polarity, 2)
        Polarity = str(Polarity)
        Subjectivity = round(overview.sentiment.subjectivity, 2)
        Subjectivity = str(Subjectivity)
    else:
        Keywords = keywords(message)
        Keywords = str(Keywords)
        new = Keywords.split('\n')
        for word in new:
            keyword = keyword + word + ', '
        summary = summarize(message)
        overview = TextBlob(message)
        Polarity = round(overview.sentiment.polarity, 2)
        Polarity = str(Polarity)
        Subjectivity = round(overview.sentiment.subjectivity, 2)
        Subjectivity = str(Subjectivity)

    lst = []
    lst.extend([Polarity, Subjectivity, summary, keyword])

    return (lst)
Example #29
def keywords_from_msgs(messages: List[Message], method='gensim') -> List[str]:

    text = whole_text(messages)

    if method == 'RAKE':
        return topic_extraction_rake(text)
    elif method == 'gensim':
        # split=True so the result matches the List[str] return annotation
        return keywords(text, split=True)
    elif method == 'mglda':
        raise NotImplementedError("Not Yet")
Example #30
def gensim_textrank_keywords(x_train,
                             x_test=None,
                             list_of_cols=[],
                             new_col_name="_extracted_keywords",
                             **algo_kwargs):
    """
    Uses Gensim Text Rank summarize to extract keywords.

    Note this uses a variant of Text Rank.
    
    Parameters
    ----------
    x_train : DataFrame
        Dataset

    x_test : DataFrame
        Testing dataset, by default None

    list_of_cols : list, optional
        Column name(s) of text data that you want to summarize

    new_col_name : str, optional
        New column name to be created when applying this technique, by default `_extracted_keywords`
    
    Returns
    -------
    Dataframe, *Dataframe
        Transformed dataframe with the new column.

    Returns 2 Dataframes if x_test is provided. 
    """

    for col in list_of_cols:
        x_train.loc[:, col + new_col_name] = [
            keywords(x, **algo_kwargs) for x in x_train[col]
        ]

        if x_test is not None:
            x_test.loc[:, col + new_col_name] = [
                keywords(x, **algo_kwargs) for x in x_test[col]
            ]

    return x_train, x_test
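A small usage sketch for the helper above; the DataFrame, its 'text' column, and the extra keyword arguments are made up for illustration, and gensim < 4.0 is assumed:

import pandas as pd

train = pd.DataFrame({'text': [
    "Natural language processing covers speech recognition, natural "
    "language understanding and natural language generation of text.",
    "Keyword extraction automatically pulls the most relevant words "
    "and phrases out of a longer document for indexing and search.",
]})

# adds a 'text_extracted_keywords' column; extra kwargs go straight to gensim's keywords()
# (very short texts may return few or no keywords)
train, _ = gensim_textrank_keywords(train, list_of_cols=['text'], words=3)
print(train['text_extracted_keywords'])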
Example #31
    def reumir(self, text):
        res = summarize(text, ratio=0.1)

        print res

        res2 = summarize(text, word_count=100)

        print res2

        print(keywords(text, ratio=0.1))
Example #32
def getKeywords(text):
    """
    Get keywords of text with the count of the number of times they appear
    """
    # TODO Add plural stripping (convert plural words to singular to help reduce number of dimensions)
    kwordsCount = {}
    kwords = keywords(text).strip().split('\n')
    for word in kwords:
        kwordsCount[word] = text.count(word)
    return kwordsCount
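One way to tackle the TODO above is to collapse plurals with a lemmatizer before counting; a sketch of that idea, assuming NLTK's WordNet data is available (the helper name is made up):

from nltk.stem import WordNetLemmatizer  # requires nltk.download('wordnet')


def getKeywordCountsSingular(text):
    # reduce each extracted keyword to its singular (noun) form before counting
    lemmatizer = WordNetLemmatizer()
    kwordsCount = {}
    for word in keywords(text).strip().split('\n'):
        singular = lemmatizer.lemmatize(word)  # default POS is noun
        kwordsCount[singular] = kwordsCount.get(singular, 0) + text.count(word)
    return kwordsCount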
Example #33
def gensim_keywords():
    text = "Recently, I registered with the GP across the road from my flat. I’ve lived there for a year. I had put off registering beforehand due to a severe allergy to bureaucracy.  Last year, I wanted to go see a specialist. I snored at my private healthcare provider’s response that I first needed to have a consultation with my doctor… Safe to say it never happened. ‘Access’, in healthcare, tends to mean availability of care, and often comes down to the affordability of the care, and the size of the audience that have access to it. My anecdotes are obviously examples of minor friction, rather than any ‘real’ issues with access to healthcare, but in the ‘on demand’ world we live in, where easy sign-up and zero-friction on-boarding is king, access also means getting the right medical ‘product’ at the time you need it. This is the advancement in healthcare that I’m most excited about. Products are now being delivered directly to the patient – which is preferable to leaving them at the mercy of the doctor’s calendar." \
           "The new meaning of ‘over the counter’ In grocery stores fifty years ago, there used to be a clear divide between the consumers and the goods. Groceries could be accessed only through the shop clerk, who was the gatekeeper. It was useful to have someone from whom to advice, but over time, it became clear that it was a far better to be allowed to make their own decisions. The same switch is now happening in healthcare. Due to the improved access to information in the ‘WebMD’ era, people often have an idea of what is wrong with them when they are ill. Also, they are generally health-conscious, and therefore keen to ensure they remain healthy today’ so they don’t become a patient tomorrow. Due to this change in behavior and improved access to information, there is now room in the market for consumer-grade healthcare products and digital tools to ensure these products are available to large audiences, in new ways. Those who successfully build consumer products and brands in healthcare will win big in the next few years. Medical data is ripe for change. Our data is currently hidden in disparate patient records, but is becoming our own again thanks to platforms like PatientsKnowBest (full disclosure, Balderton is invested). Digital tools are also changing how we manage diseases. Behavior change programs are being turned into products, and scaled across a previously impossible large number of patients. This is particularly powerful when tackling healthcare problems that affect a nation of people, and emanate from poor lifestyle."

    print('Keywords: ')

    pos_filter = [
        'JJ', 'CC', 'CD', 'DT', 'JJ', 'EX', 'RB', 'WRB', 'WP$', 'WP', 'VB'
    ]
    print(keywords(text, ratio=0.1, pos_filter=pos_filter, split=True))
Example #34
def get_summary(text):
    text = str(text)
    all_text = ''.join(text).replace(']', '').replace('[', '').replace("'", '')
    all_text = re.sub(r'\[[0-9]*\]', ' ', all_text)

    key_words = keywords(all_text, lemmatize=True, words=20).split('\n')
    summary = summarize(all_text, ratio=0.01)
    first_summary_sentence = summary.split('.')[0]

    return key_words, summary, first_summary_sentence
Example #35
    def _get_tags_from_cluster_summaries(self, cluster_summaries):
        summaries = []
        for cluster_id, cluster_summary in cluster_summaries:
            if cluster_summary == self.ERROR_NOT_ENOUGH_POSTS_FOR_TAGS:
                continue

            summaries.append(cluster_summary)

        summaries = [' '.join([word for word in text.lower().split() if word not in self.corpus.stop_words]) for text in summaries]

        return list(enumerate(keywords('. '.join(summaries), split=True, words=15)))
Example #36
    def test_text_keywords_pos(self):
        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
            text = f.read()

        # calculate keywords using only certain parts of speech
        generated_keywords_nnvbjj = keywords(text, pos_filter=['NN', 'VB', 'JJ'], ratio=0.3, split=True)

        # To be compared to the reference.
        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.kwpos.txt"), mode="r") as f:
            kw = f.read().strip().split("\n")

        self.assertEqual({str(x) for x in generated_keywords_nnvbjj}, {str(x) for x in kw})
Example #37
    def test_text_keywords(self):
        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
            text = f.read()

        # calculate keywords
        generated_keywords = keywords(text, split=True)

        # To be compared to the reference.
        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.kw.txt"), mode="r") as f:
            kw = f.read().strip().split("\n")

        self.assertEqual({str(x) for x in generated_keywords}, {str(x) for x in kw})
Example #38
 def test_text_keywords_with_small_graph(self):
     # regression test, we get graph 2x2 on this text
     text = 'IT: Utilities A look at five utilities to make your PCs more, efficient, effective, and efficacious'
     kwds = keywords(text, words=1, split=True)
     self.assertTrue(len(kwds))
Example #39
                doc_id, text_body = doc_id_text_generator.next()
            except StopIteration:
                print 'not enough docs found, breaking'
                break
            concat_txt = ' '.join([concat_txt, text_body[:args.single_doc_len]])
            breakout += 1

        print 'used %i concatenated docs for this topic' % breakout
        print 'actual character length of concatenated docs: %i' % len(concat_txt)

        # make sure you have something
        if len(concat_txt) == 0:
            print 'got nothing for this topic'
            continue

        # TODO: make these args
        generate_keywords = True
        generate_sentences = True

        if generate_keywords:
            print '\ngenerating keywords\n------------------------------\n'
            summary = keywords(concat_txt, ratio=args.summary_ratio, split=True, lemmatize=True)
            print ', '.join(summary)
        if generate_sentences:
            print '\ngenerating sentences\n------------------------------\n'
            summary = summarize(concat_txt, split=True, ratio=args.summary_ratio)
            for sentence in summary:
                print ' * ' + sentence

        # it's sentence or keyword depending on --sentence flag
Example #40
def kw(asp, pol):
	with open(path + asp + "/" + pol, 'r') as myfile:
		data = myfile.read()  # .replace('\n', '.')
		kw = keywords(data)
		print "\n\n" + asp + "\t\t:" + pol + "\n_______________________\n" + kw
Example #41
 def test_text_keywords_without_graph_edges(self):
     # regression test, we get graph with no edges on this text
     text = 'Sitio construcción. Estaremos línea.'
     kwds = keywords(text, deacc=False, scores=True)
     self.assertFalse(len(kwds))
Example #42
from gensim.summarization import summarize
from gensim.summarization import keywords
import pandas as pd 


if __name__ == "__main__":
	df = pd.read_csv('newtestament.txt', delimiter="|", skiprows=0, names=['Book', 'Chapter', 'Verse', 'Original Text'])
	text = df[df['Book'] == 'Mat']['Original Text'].values

	alltext = ''
	for verse in text:
	    alltext += verse

	print summarize(alltext, ratio=0.005)
	print keywords(alltext, ratio=0.01)