Example #1
def stream_audio(room):
    if 'file' in request.files:
        _file = request.files['file']
        _file.save('./test.wav')
        # decode the upload as MP4 audio and re-export it as a real WAV for speech recognition
        audio_file = AudioSegment.from_file("test.wav", format="mp4")
        audio_file.export("test.wav", format="wav")
        try:
            with sr.AudioFile('./test.wav') as f:
                global curr_sent
                audio = r.record(f)
                text = r.recognize_google(audio)
                tokens = word_tokenize(text)
                global words
                words += tokens
                curr_sent += text + ' '
                print('text:', text)
                if text:
                    fb_root.child('room').child(room).child('transcribe').push(
                        text)
                if len(words) >= 30:
                    fb_root.child('room').child(room).child('cards').push(
                        keywords.keywords(' '.join(words)).split('\n'))
                    print('Keywords:',
                          str(keywords.keywords(' '.join(words)).split('\n')))
                    print('Sents:', '.'.join(sents))
                    #print('Summary:', summarizer.summarize(' '.join(words), words=15))
                    words = []
        except Exception as e:
            #global sents
            sents.append(curr_sent)
            curr_sent = ''
            print(e)
            return 'Error'
        return 'Got file'
    return 'No file'
Example #2
 def test_text_as_bytes_raises_exception(self):
     # Test the keyword extraction for a text that is not a unicode object
     # (Python 3 str).
     text = get_text_from_test_data("spanish.txt")
     bytes = text.encode(encoding="utf-8")
     with self.assertRaises(ValueError):
         keywords(bytes, language="spanish")
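The test above asserts that passing raw bytes raises a ValueError: keywords expects an already decoded str. Below is a minimal sketch of the accepted call, assuming the get_text_from_test_data helper and the Spanish fixture from the test suite shown above.

from summa.keywords import keywords

text = get_text_from_test_data("spanish.txt")   # test-suite helper; returns a decoded str
print(keywords(text, language="spanish", words=10))

# raw bytes must be decoded before they are passed in
raw = text.encode("utf-8")
print(keywords(raw.decode("utf-8"), language="spanish", words=10))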
Example #4
def get_info(path):  # read the input PDF, summarize it, and print its keywords
    with open(path, 'rb') as f:
        pdf = PdfFileReader(f)
        number_of_pages = pdf.getNumPages()
        text = convert_pdf_to_txt(path, pages=range(0,number_of_pages))        
        LANGUAGE = "english"
        SENTENCES_COUNT = 10
        #url = "https://en.wikipedia.org/wiki/Automatic_summarization"
        #parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
        # or for plain strings
        parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE)) #PlaintextParser.from_file for files
        stemmer = Stemmer(LANGUAGE)

        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)

        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            print(sentence)

        print(keywords.keywords(text))                  #original keywords
        keywordlist1=keywords.keywords(text,split=True)  #list of keywords
        keywordstring=" ".join(keywordlist1)             #string of keywords
        tokens= nltk.word_tokenize(keywordstring)

        print("\n",{stem(t) for t in tokens},"\n")            # gives set of unique kywords    
Example #5
def key_sentences_extract(path):
    """
    提取语料库中的关键句子
    :param path:语料库路径
    :return:
    """
    sens = ""
    sens_list = []
    text_time_dict = {}  # text -> timestamp dictionary
    with codecs.open(path, mode='r') as f:
        for line in f:
            line = line.strip('\n')
            line = line.split('\t')
            text = line[2].split(' ')
            times = line[0][:-3]

            # lemmatize each word with WordNet (wn.morphy)
            new_sens = []
            for word in text:
                w = wn.morphy(word)
                if w is None:
                    w = word
                sens += " " + w
                new_sens.append(w)
            new_text = " ".join(new_sens)
            text_time_dict[new_text] = time.mktime(
                time.strptime(times, "%Y %m %d %H:%M"))
            sens += '.'
            sens_list.append(new_sens)
    print "***************keywords****************"
    print keywords.keywords(sens)
    # print text_time_dict
    key_sens = summarizer.summarize(sens, ratio=0.3)
    key_sens_list = key_sens.split('\n')
    key_sens_token_list = [
        s.strip('.').split(' ') for s in key_sens_list
    ]
    res = []
    for item in sens_list:
        score = 0.0
        if item not in key_sens_token_list:  # skip sentences that were themselves selected as key sentences
            for sen in key_sens_token_list:
                score += similarity_jaccard(item, sen)  # Jaccard similarity
                # score += similarity_textrank(item, sen)   # TextRank similarity
            score /= len(key_sens_token_list)
            res.append([item, score])

    res = sorted(res, key=lambda x: x[1], reverse=True)
    # for item in res:
    #     print " ".join(item[0]), item[1]

    new_time = []
    for item in res[:int(len(res) * 0.8)]:
        key = " ".join(item[0])
        tt = text_time_dict[key]
        # print key, tt, item[1]
        new_time.append(tt)
    new_time.sort()
    return new_time
Example #6
    def test_keywords_ratio(self):
        text = get_text_from_test_data("mihalcea_tarau.txt")

        # Check ratio parameter is well behaved.
        # Because length is taken on tokenized clean text we just check that
        # ratio 40% is twice as long as ratio 20%
        selected_docs_20 = keywords(text, ratio=0.2, split=True)
        selected_docs_40 = keywords(text, ratio=0.4, split=True)

        self.assertAlmostEqual(float(len(selected_docs_40)) / len(selected_docs_20), 0.4 / 0.2, places=1)
Example #7
def convert_pdf_to_txt(path, pages=None):
    if not pages:
        pagenums = set()
    else:
        pagenums = set(pages)
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    infile = open(path, 'rb')
    for page in PDFPage.get_pages(infile, pagenums):
        interpreter.process_page(page)
    infile.close()
    converter.close()
    text = output.getvalue()
    output.close()
    sample = text

    stop_words = set(stopwords.words('english'))#stopwords

    word_tokens = word_tokenize(sample)

    filtered_sentence = [w for w in word_tokens if w not in stop_words]
    new_sentence = ' '.join(filtered_sentence)
    print(keywords.keywords(text))#original keywords
    txt=summarizer.summarize(text,ratio=0.2)#summary of original summary(SAY NEW)
    print(txt)
    print(summarizer.summarize(txt,ratio=0.2))
    summarylist=summarizer.summarize(text,split=True)#list of new
    summarystring=" ".join(summarylist)#string of NEW
    
    b=keywords.keywords(summarystring,split=True)#keywords obtained from string of NEW

    keywordlist1=keywords.keywords(text,split=True)#list of keywords
    keywordstring=" ".join(keywordlist1)#string of keywords
    keywordlist2=keywords.keywords(keywordstring,split=True)
    tokens= nltk.word_tokenize(keywordstring)
    print({stem(t) for t in tokens})      # set of unique keyword stems
    print(len({stem(t) for t in tokens})) # number of unique keyword stems


    return text
Example #8
def getKeywordsByTextRank(text_list, num_keywords_to_generate_array):

    keywords_by_TextRank = [[]] * len(text_list)

    for doc_num in range(len(text_list)):
        print doc_num
        tokenized_and_stemmed = ' '.join(
            tokenizeAndStemStrings(text_list[doc_num]))
        keywords_by_TextRank[doc_num] = keywords.keywords(
            tokenized_and_stemmed).split(
                '\n')[0:num_keywords_to_generate_array[doc_num]]

    return keywords_by_TextRank
Example #9
def Get_keyword(sentence):
    pre_sent = Preprocessing(sentence)  # preprocessing

    key_word = keywords.keywords(pre_sent, ratio=0.3)  # extract the top 30% of keywords (string)
    key_word = ' '.join(key_word.splitlines())  # strip the newlines produced by keywords()

    key_word_noun = Get_noun_list(key_word)  # keep only the nouns among the keywords (list)
    #key_word_noun = key_word.split(' ') # temporary code

    # drop single-character words (filter instead of deleting from the list while iterating over it)
    key_word_noun = [noun for noun in key_word_noun if len(noun) >= 2]
    # print("key_word_noun : ", key_word_noun)

    # count how often each noun occurs among the keyword nouns
    word_cnt = {}
    for noun in key_word_noun:
        if noun in word_cnt:
            word_cnt[noun] += 1
        else:
            word_cnt[noun] = 1

    # sort by frequency, highest first
    sorted_word_cnt = sorted(word_cnt.items(),
                             reverse=True,
                             key=lambda item: item[1])  # list of (noun, count) pairs
    # print("sorted : ", sorted_word_cnt)

    num = min(5, len(sorted_word_cnt))  # keep at most 5 keywords
    final_key_word = [sorted_word_cnt[i][0] for i in range(num)]
    # print("final: ", final_key_word)

    return final_key_word
Example #10
 def get_termscoreList_from_text(extractedText):
     if method == 'textrank':
         return keywords(extractedText, scores=False)
     r.extract_keywords_from_text(extractedText)
     score_term_List = r.get_ranked_phrases_with_scores()
     return [(t[1], t[0]) for t in score_term_List]
Example #11
def indexing():
    genDoc()
    ct = 0
    for fi in File.objects.filter(isIndexed=False):
        try:
            setContent(fi)
        except Exception:
            continue
        d = fi.doc
        if not d.desc: d.desc=summarize(fi.content,words=50)
        d.keywords = keywords(fi.content,words=5)
        setDate(d)
        d.loc = getLoc(fi.content)
        words = process(fi.content)
        for word, num in words.items():
            idx = None
            li = Wordindex.objects.filter(word=word)
            if li.exists():
                idx = li[0]
            else:
                idx = Wordindex.objects.create(word=word,index={})
            saved = idx.index # Warning:  'idx.index[d.id] = num'  is wrong
            saved[d.id] = num
            idx.index = saved
            idx.save()
        d.save()
        fi.isIndexed = True
        fi.save()
        if ct%20==0:print(ct)
        ct+=1
    return ct
Example #12
def get_top_words(text):
    start = time()
    temp = keywords.keywords(text, ratio=RATIO)
    res = ' '.join(word for word in temp.split() if len(word) >= MIN_LETTERS)
    end = time()
    print 'Top words were counted in {diff} seconds'.format(diff=(end - start))
    return str(res)
Example #13
	def extractKeywords(self, texts, num=5, metricsCount=False, partMatchesCounted=False):
		texts['textrank_keywords'] = ''

		if (metricsCount == True):
			texts['textrank_metrics'] = ''
			
			#sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
			#print([word[0] for word in sorted_words]
			#values = [word[0] for word in sorted_words[:num]]
			
		for index, row in tqdm(texts.iterrows(), total=texts.shape[0]):
			values = textrank.keywords(row.text, language=self.language, additional_stopwords=self.stopwords)
			values = values.split('\n')
			length = len(values)

			values = ','.join(random.sample(values, min(num, length)))
			texts.at[index, 'textrank_keywords'] = values

			if (metricsCount == True):
				if self.textprocessor.useLemmas == True:
					groundtruth = self.textprocessor.get_normal_form_text(row['Keywords']).split(',')
				else:
					groundtruth = row['Keywords'].split(',')
				
				metrics = evaluate_keywords(values.split(','),groundtruth,partMatchesCounted)
				texts.at[index, 'textrank_metrics'] = ','.join([str(m) for m in metrics])
Example #14
def get_key(sentence):
    key_sent = summarize(sentence, ratio=0.1)   # ratio: fraction of all sentences to extract as key sentences
    key_word = keywords.keywords(sentence, words=15)  # extract 15 keywords

    key_sent_noun = morpheme(key_sent)  # keep only the nouns from the key sentences
    key_word_noun = morpheme(key_word)  # keep only the nouns from the keywords
    #print("key_sent_noun : ", key_sent_noun)
    #print("key_word_noun : ", key_word_noun)


    # count how often nouns from the key sentences also appear among the keyword nouns
    word_cnt = {}
    for noun in key_sent_noun:
        if noun in key_word_noun:
            if noun in word_cnt:
                word_cnt[noun] += 1
            else:
                word_cnt[noun] = 1

    # sort by frequency, highest first
    sorted_word_cnt = sorted(word_cnt.items(), reverse=True, key=lambda item: item[1])  # list of (noun, count) pairs
    #print("sorted : ", sorted_word_cnt)

    num = min(5, len(sorted_word_cnt))      # keep at most 5 keywords
    final_key_word = [sorted_word_cnt[i][0] for i in range(num)]
    #print("final: ", final_key_word)

    final_key_sentence = key_sent.split('.')[0]     # keep a single key sentence

    return final_key_sentence, final_key_word
Example #15
def get_result_list(lines):
    from summa import summarizer
    from summa import keywords
    import nltk
    nltk.download('punkt', download_dir='./nltk_data')
    nltk.data.path.append("./nltk_data")
    result_list = []
    for line in lines:
        json_data = json.loads(line)
        text = json_data["text"]
        abstract = summarizer.summarize(text)
        keyword = keywords.keywords(text, split=True)
        name = json_data['company']
        tags_words = list(map(lambda x: x[1:], json_data['tags']))
        abstract_words = list(
            map(lambda x: x.lower(), nltk.tokenize.word_tokenize(abstract)))
        title_words = list(
            map(lambda x: x.lower(),
                nltk.tokenize.word_tokenize(json_data['title'])))
        if abstract != '' and name not in abstract_words and name not in title_words and name not in tags_words:
            continue
        json_data['abstract'] = abstract
        json_data['keywords'] = keyword
        result_list.append(json.dumps(json_data))
    return result_list
Example #16
    def post():
        posted_data = request.get_json()
        text = posted_data['text']

        text = (keywords.keywords(text).split('\n'))

        return jsonify({'Keywords': text})
Example #17
def keywords_eng():
    from summa import keywords
    import re

    # from nltk.stem import PorterStemmer

    import sys

    # sys.stdout = open('keyword_eng_result.txt','w')

    # with open("ocr_pdf_result.txt") as f:
    #     fullText = f.read()

    # print(repr(fullText))

    # extract keywords with TextRank
    globalVariable.keywordArr = keywords.keywords(
        globalVariable.fullText_f).split('\n')

    globalVariable.keywordArr = globalVariable.keywordArr[:5]

    # print("stem 전")
    # print(globalVariable.keywordArr)

    return
Example #18
    def summarize_by_ids(cls,
                         ids,
                         opinion_type,
                         ratio=0.3,
                         words=None,
                         keywords_words=None):

        if opinion_type:
            op_list = OpinionService.get_by_ids(ids)
        else:
            op_list = EntradaService.get_by_ids(ids)

        for op in op_list:
            op.resumen = summarizer.summarize(op.content,
                                              language='spanish',
                                              ratio=ratio,
                                              words=words)
            op.keywords = keywords.keywords(op.content,
                                            language='spanish',
                                            split=True,
                                            ratio=ratio,
                                            words=keywords_words)
            if op.keywords == "":
                op.keywords = "Opss!! No ha sido posible extraer palabras claves."

            if op.resumen == "":
                op.resumen = "El texto es breve o no se ha podido generar un resumen. Lo sentimos."

        OpinionService.save_opinions(op_list)

        return op_list
Example #19
def viewSumma(request):
  url_topull = request.GET.get('url', 'https://en.wikipedia.org/wiki/Machine_learning')

  scraped_data = urllib.request.urlopen(url_topull)  
  article = scraped_data.read()

  parsed_article = bs.BeautifulSoup(article,'lxml')

  paragraphs = parsed_article.find_all('p')

  article_text = ""

  for p in paragraphs:  
      article_text += p.text

  summary = summarizer.summarize(article_text,ratio=0.05)

  print("Data pull done")

  print("==================================SUMMARY===================================")
  print (summary)

  print("==================================KEYWORDS===================================")
  print (keywords.keywords(article_text,ratio=0.5))

  context = {
      'title': 'Summa',
      'summary': [summary],
  }
  return render(request, 'home.html', context)
Example #20
def keywordExtractEN(text):
    parsedList = keywords.keywords(text)
    keywordList = parsedList.replace(' ', '').split('\n')
    if (len(parsedList) < 1):
        return "No keywords detected."
    hashtags = map(lambda x: f"#{x}", keywordList)
    return ' '.join(hashtags)
Example #21
    def process_document(self, doc: Document,
                         text_process_params: TextProcessParams):
        doc.keywords.clear()
        doc.entities.clear()

        parse_sentences(doc)
        main_lang = doc.main_lang
        summa_lang = self._land_dic[main_lang]

        # Extract keywords
        text = doc.text
        kw_num = text_process_params.keywords_number
        if kw_num <= 0:
            kw_num = 10
        kwrds = keywords.keywords(text, words=kw_num, language=summa_lang)

        # Extract summary
        ratio = text_process_params.summary_size.calculate_ratio(
            len(doc.sentences))
        summary = summarizer.summarize(text, ratio=ratio, language=summa_lang)
        # print(kwrds)
        # print(summary)
        for kw in kwrds.splitlines():
            doc.keywords.append(Entity(kw, EntityKind.KEYWORD))
        for s in summary.splitlines():
            doc.summary.append(Entity(s, EntityKind.SUMMARY_SENTENCE))
Example #22
    def test_text_summarization_on_short_input_text_is_not_empty_string(self):
        text = get_text_from_test_data("unrelated.txt")

        # Keeps the first 8 sentences to make the text shorter.
        text = "\n".join(text.split('\n')[:8])

        self.assertNotEqual(keywords(text, split=True), "")
Example #23
def textrank_keys(text):
    keywords1 = list()
    for words in TextRank.keywords(text).split('\n'):
        keywords1.extend(words.split(" "))
    keywords1 = set(keywords1)

    return " ".join(list(keywords1))
Example #24
 def test_keywords_few_distinct_words_w_stoplist_split_is_empty_list(self):
     text = get_text_from_test_data("few_distinct_words.txt")
     additional_stopwords = ["here", "there"]
     self.assertEqual(
         keywords(text,
                  split=True,
                  additional_stopwords=additional_stopwords), [])
Example #26
def get_textrank_keywords(text, stopwords=[], score_limit=0.1):
    text = preprocess_text(text, stopwords)
    all_keywords = keywords.keywords(text, language="russian", scores=True)
    all_keywords = sorted(all_keywords, key=lambda x: x[1], reverse=True)
    top_keywords = list(filter(lambda x: x[1] >= score_limit, all_keywords))

    return top_keywords
Example #27
def get_info(path):
    with open(path, 'r') as f:
        text = f.read()
        LANGUAGE = "english"
        SENTENCES_COUNT = 10
        #url = "https://en.wikipedia.org/wiki/Automatic_summarization"
        #parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
        # or for plain strings
        parser = PlaintextParser.from_string(
            text, Tokenizer(LANGUAGE))  #PlaintextParser.from_file for files
        stemmer = Stemmer(LANGUAGE)

        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)

        print("SUMMARY")
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            print(sentence)

        #print(keywords.keywords(text))                  #original keywords
        keywordlist1 = keywords.keywords(text, split=True)  #list of keywords
        keywordstring = " ".join(keywordlist1)  #string of keywords
        tokens = nltk.word_tokenize(keywordstring)

        print("\n", "KEYWORDS")
        print("\n", {stem(t)
                     for t in tokens}, "\n")  # gives the set of unique keyword stems
Example #28
 def topic2file(self):
     with open(self.topic_path, "w") as out:
         with open(self.article_path) as f:
             for line in f:
                 line_splt = line.split('\t')
                 out.write(
                     keywords.keywords(line_splt[0]).replace("\n", " "))
                 out.write("\n")
Example #29
def generate_summa_keywords(text):
    _keywords = {}  # same type as the successful result (dict of keyword -> score)
    try:
        _keywords = dict(keywords.keywords(text, scores=True))
    except UnicodeDecodeError:
        print('UnicodeDecodeError: generating keywords')

    return _keywords
Example #30
def detail(request, pk, slug):
    topic = Topic.objects.get_public_or_404(pk, request.user)

    if topic.slug != slug:
        return HttpResponsePermanentRedirect(topic.get_absolute_url())

    utils.topic_viewed(request=request, topic=topic)

    comments = Comment.objects\
        .for_topic(topic=topic)\
        .with_likes(user=request.user)\
        .with_polls(user=request.user)\
        .order_by('date')


    sum_text = ""
    for comment in comments:
        sum_text+= comment.comment


   
    
    doc = nlp(sum_text)
    final=""

    for idno, sentence in enumerate(doc.sents):
        temp = str(sentence).strip()
        final = final + temp + " "
        # print 'Sentence {}:'.format(idno + 1), sentence 
    
    # print(final)
    


    summary = summa.summarizer.summarize(final,ratio=0.3)
    words = keywords.keywords(sum_text)
    words = words.split('\n')
    # print(summary)

    keys = []
    for i in words:
        keys.append(str(i))

    # print(keys)
    comments = paginate(
        comments,
        per_page=config.comments_per_page,
        page_number=request.GET.get('page', 1)
    )

    context = {
        'topic': topic,
        'comments': comments,
        'summary' : summary,
        'keywords' : keys,
    }

    return render(request, 'spirit/topic/detail.html', context)
Example #31
def index_active(request):
    categories = Category.objects\
        .visible()\
        .parents()

    topics = Topic.objects\
        .visible()\
        .global_()\
        .with_bookmarks(user=request.user)\
        .order_by('-is_globally_pinned', '-last_active')\
        .select_related('category')


    summs = []
    for i, topic in enumerate(topics):
        comments = Comment.objects\
            .for_topic(topic=topic)\
            .with_likes(user=request.user)\
            .with_polls(user=request.user)\
            .order_by('date')

        sum_text = ""
        for comment in comments:
            sum_text += comment.comment

        doc = nlp(sum_text)
        final = ""
        for idno, sentence in enumerate(doc.sents):
            temp = str(sentence).strip()
            final = final + temp + " "

        summary = summa.summarizer.summarize(final,ratio=0.3)
        words = keywords.keywords(sum_text)
        words = words.split('\n')
        summs.append(summary)
        topic.summary = summary
        # print(summary)

    for topic in topics:
        print(topic.summary)

    topics = yt_paginate(
        topics,
        per_page=config.topics_per_page,
        page_number=request.GET.get('page', 1)
    )

    context = {
        'categories': categories,
        'topics': topics,
    }

    return render(request, 'spirit/topic/active.html', context)
Example #32
def summa_kw(t):
    tokens = mystem.lemmatize(t.lower())
    tokens = [token for token in tokens if token not in russian_sw\
            and token != " " \
            and not token.isdigit() \
            and token.strip() not in punctuation]
    t = ' '.join(tokens)  # rejoin the lemmatized tokens into plain text (str(tokens) would keep list brackets and quotes)
    kw = keywords(t, language='russian', words=5, split=True)
    return kw
Example #33
    def test_keywords_consecutive_keywords(self):
        text = "Rabbit populations known to be plentiful, large, and diverse \
                in the area. \
                Adjacent to the site, a number number well over a thousand. \
                The number of these rabbit populations has diminished in recent \
                years, and perhaps we have become number to a number of their \
                numbers numbering fewer."

        # Should not raise an exception.
        self.assertIsNotNone(keywords(text, words=10))
Example #34
    def test_text_keywords(self):
        text = get_text_from_test_data("mihalcea_tarau.txt")

        # Calculate keywords
        generated_keywords = keywords(text, split=True)

        # To be compared to the reference.
        reference_keywords = get_text_from_test_data("mihalcea_tarau.kw.txt").split("\n")

        self.assertEqual({str(x) for x in generated_keywords}, {str(x) for x in reference_keywords})
Example #35
 def setAbstract(self, line):
     text = line[2:-1]
     if len(text) > 0:
         self.abstract = text.decode('cp866')
         if extractKeywords:
             try:
                 self.keywordsArticle = keywords.keywords(text).split('\n')[0:maxKeywordsFromArticle]
                 return 0
             except Exception as e:
                 print('Exception while extracting keywords from article id: ' +\
                     str(self.index) + ' ' + str(e))
                 print('Abstract: ' + self.abstract)
                 return -1
Example #36
def get_keywords(text, pub):
    numWords = 8
    try:
        keyWords = keywords.keywords(text, words=numWords).split('\n')
        threshold = 3
        # Discard words less than or equal to threshold characters
        words = [s.lower() for s in keyWords if len(s) > threshold]
        finalWords = []
        for word in words:      # Include words with spaces, example sequence alignment
            if " " in word:
                finalWords.append(word)
        words = list(set(words) - set(finalWords))
        pub.keyWords = finalWords + get_unique_words(words, threshold)
    except Exception as e:
        print "Line number " + get_linenumber()
        print e
Example #37
 def test_keywords_few_distinct_words_w_stoplist_split_is_empty_list(self):
     text = get_text_from_test_data("few_distinct_words.txt")
     additional_stopwords = ["here","there"]
     self.assertEqual(keywords(text, split=True, additional_stopwords=additional_stopwords), [])
Example #38
# ['Automatic summarization is the process of reducing a text document with a', 'Document summarization is another.']

## for sentences
from summa.preprocessing.textcleaner import clean_text_by_sentences
res = clean_text_by_sentences(text); print(res, type(res))  # <class 'list'>
# [Original unit: 'Automatic summarization is the process of reducing a text document with a' *-*-*-* Processed unit: 'automat summar process reduc text document', Original unit: 'computer program in order to create a summary that retains the most important points' *-*-*-* Processed unit: 'program order creat summari retain import point', Original unit: 'of the original document.' *-*-*-* Processed unit: 'origin document', Original unit: 'As the problem of information overload has grown, and as' *-*-*-* Processed unit: 'problem inform overload grown', Original unit: 'the quantity of data has increased, so has interest in automatic summarization.' *-*-*-* Processed unit: 'quantiti data increas automat summar', Original unit: 'Technologies that can make a coherent summary take into account variables such as' *-*-*-* Processed unit: 'technolog make coher summari account variabl', Original unit: 'length, writing style and syntax.' *-*-*-* Processed unit: 'length write style syntax', Original unit: 'An example of the use of summarization technology' *-*-*-* Processed unit: 'exampl use summar technolog', Original unit: 'is search engines such as Google.' *-*-*-* Processed unit: 'search engin googl', Original unit: 'Document summarization is another.' *-*-*-* Processed unit: 'document summar']
print([sentence.token for sentence in res])  # ['automat summar process reduc text document', 'program order creat summari retain import point', 'origin document', 'problem inform overload grown', 'quantiti data increas automat summar', 'technolog make coher summari account variabl', 'length write style syntax', 'exampl use summar technolog', 'search engin googl', 'document summar']
sentences = [sentence.text for sentence in res]; print(sentences)  # original text
# ['Automatic summarization is the process of reducing a text document with a', 'computer program in order to create a summary that retains the most important points', 'of the original document.', 'As the problem of information overload has grown, and as', 'the quantity of data has increased, so has interest in automatic summarization.', 'Technologies that can make a coherent summary take into account variables such as', 'length, writing style and syntax.', 'An example of the use of summarization technology', 'is search engines such as Google.', 'Document summarization is another.']
print([sentences.index(sen) for sen in sum_sens])  # [0, 9]
graph = summarizer.get_graph(text)
print(graph.nodes())  # ['automat summar process reduc text document', 'program order creat summari retain import point', 'origin document', 'problem inform overload grown', 'quantiti data increas automat summar', 'technolog make coher summari account variabl', 'length write style syntax', 'exampl use summar technolog', 'search engin googl', 'document summar']

## keywords
from summa import keywords
print(keywords.keywords(text))
# document
# summarization
# writing
# account

##################################################################
## Test on the Byte Cup 2018 dataset
# summarize(text, ratio=0.2, words=None, language='english', split=False, scores=False)
# ratio: define the length of the summary as a proportion of the text (also available in keywords)
# words: define the length of the summary by an approximate number of words (also available in keywords)

from summa import summarizer
text = """london ( ap ) former british celebrity publicist max clifford , the confidante of the stars who found himself swept up in britain 's historic sex abuses scandal , died sunday after collapsing in prison . he was 74.once one of the most powerful figures in the british entertainment world , clifford was convicted in 2014 of eight counts of indecent assault stemming from attacks on teenagers dating back more than 40 years . he was serving an 8-year prison sentence on the sex offenses from the 1970s and 1980s at littlehey prison in cambridgeshire when he died . clifford was once as well known as the celebrities he represented the go-to guy to get an actor out of trouble in a time of crisis . he advised many on damage control and represented tv mogul simon cowell , but he was later convicted of luring young girls into sex by offering them acting roles . clifford had denied all the charges and said his accusers were fantasists . clifford had been arrested as part of an investigation called operation yewtree , a wide-ranging inquiry into alleged sexual offenses spurred by the case of jimmy savile , a well-known british entertainer accused of the sexual abuse of hundreds of girls and women . he died in 2011.the savile allegations shocked britain and caused a major embarrassment for his employer , the bbc , which had been accused of failing to report on investigations into savile 's alleged crimes . before his own sex crimes charges , clifford had told the associated press that he was receiving calls from many celebrities and entertainers worried they would be caught up in the widening savile investigation . " they 're phoning me and saying , 'm ax , i 'm worried that i 'm going to be implicated . ' a lot of them can 't remember what they did last week , never mind 30 or 40 years ago , " he said . clifford 's daughter louise told the mail on sunday that her father had first collapsed thursday in his cell while trying to tidy it . he collapsed again the following day and was taken to a hospital , where suffered a heart attack . she told the newspaper he had been in a " bad way " in a critical care unit . " it was just too much , " she said ."""
print(summarizer.summarize(text))
# london ( ap ) former british celebrity publicist max clifford , the confidante of the stars who found himself swept up in britain 's historic sex abuses scandal , died sunday after collapsing in prison .
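As a quick illustration of the two sizing parameters documented above, here is a minimal sketch that reuses the text variable defined in this example; both summarizer.summarize and keywords.keywords accept either ratio or words, and split=True returns a list instead of a newline-joined string.

from summa import summarizer, keywords

# size the output as a proportion of the input text
print(summarizer.summarize(text, ratio=0.2))
print(keywords.keywords(text, ratio=0.2))

# or cap it by an approximate word count
print(summarizer.summarize(text, words=40))
print(keywords.keywords(text, words=5))

# return a list rather than a newline-joined string
print(keywords.keywords(text, words=5, split=True))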
Example #39
 def test_text_keywords_wstoplist(self):
     text = get_text_from_test_data("mihalcea_tarau.txt")
     additional_stoplist = get_text_from_test_data("mihalcea_tarau.sw.txt").strip().split(",")
     generated_keywords = keywords(text, split=True, additional_stopwords=additional_stoplist)
     reference_keywords = get_text_from_test_data("mihalcea_tarau.swkw.txt").split("\n")
     self.assertEqual({str(x) for x in generated_keywords}, {str(x) for x in reference_keywords})
Example #40
 def test_keywords_few_distinct_words_is_empty_string(self):
     text = get_text_from_test_data("few_distinct_words.txt")
     self.assertEqual(keywords(text), "")
Example #41
 def test_keywords_few_distinct_words_split_is_empty_list(self):
     text = get_text_from_test_data("few_distinct_words.txt")
     self.assertEqual(keywords(text, split=True), [])
Example #42
 def test_repeated_keywords_wstopwords(self):
     text = get_text_from_test_data("repeated_keywords.txt")
     additional_stoplist = ["sage","user"]
     kwds = keywords(text,additional_stopwords=additional_stoplist)
     self.assertTrue(len(kwds.splitlines()))
Example #43
    def test_repeated_keywords(self):
        text = get_text_from_test_data("repeated_keywords.txt")

        kwds = keywords(text)
        self.assertTrue(len(kwds.splitlines()))
Example #44
 def test_spanish_with_accents(self):
     # Test the keyword extraction with accented characters.
     text = get_text_from_test_data("spanish.txt")
     kwds = keywords(text, language="spanish", deaccent=False, split=True)
     # Verifies that at least one retrieved keyword keeps its accents.
     self.assertTrue(any(deaccent(keyword) != keyword for keyword in kwds))
Example #45
 def test_keywords_few_distinct_words_wempty_stoplist_is_empty_string(self):
     text = get_text_from_test_data("few_distinct_words.txt")
     self.assertEqual(keywords(text,additional_stopwords=[]), "")