Example #1
def wiki_summary(wikipage,
                 toc,
                 relevant_toc,
                 mode='multi',
                 wikisec=True,
                 word_count=150):
    wikisum = wikipage.summary
    wikisum_len = len(wikisum.split())
    if toc:  # if the table of contents is non-empty
        sum_fields = list(set(toc).difference(set(relevant_toc)))
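        # sum_fields: TOC sections that were not marked as relevant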
        if mode == 'multi':
            sum_sum = summarize_wiki_page(wikipage,
                                          sum_fields,
                                          wikisec=wikisec)
            sum_total = wikisum + sum_sum
        elif mode == 'single':
            text = []
            subset_dict = grab_wikisec_toc(wikipage, sum_fields)
            for key, content in subset_dict.items():
                text.append(content)
            try:
                sum_sum = wikisum + ''.join(text)
            except TypeError:
                sum_sum = wikisum
            sum_sum_len = len(sum_sum.split())
            if sum_sum_len < word_count:
                sum_total = sum_sum
            else:
                sum_total = summarize(sum_sum, word_count=word_count)
    else:
        if wikisum_len < word_count:
            sum_total = wikisum
        else:
            sum_total = summarize(wikisum, word_count=word_count)
    return sum_total
Example #2
def text_splitter(text):
    k = keywords(text, words=8, lemmatize=True).split('\n')
    kwords = ', '.join(k)
    text += '. --END'
    print(text)
    summary = summarize(text, 0.3)
    print(summary)
    return (summary, kwords)
Example #3
def frontpage():
    if request.method == 'GET':
        return render_template('homepage.html')
    elif request.method == 'POST':
        if request.form['paragraph_text']:
            text = request.form['paragraph_text']
            with open('run/src/static/ori_text.txt', 'w+') as f:
                f.write(text)
            try:
                summ1 = summarization(text)
                with open('run/src/static/textfile1.txt', 'w+') as f:
                    f.write(summ1)
                summ2 = summarize(text)
                with open('run/src/static/textfile2.txt', 'w+') as f:
                    f.write(summ2)
                return redirect('/result')
            except ValueError:
                return render_template('homepage.html')
        elif request.files['fileselect']:
            text_file = request.files['fileselect']
            filename = secure_filename(text_file.filename)
            text_file.save("run/src/static/ori_text.txt")
            with open('run/src/static/ori_text.txt', 'r') as f:
                content = f.read()
            summ3 = summarization(content)
            with open('run/src/static/textfile1.txt', 'w+') as f:
                f.write(summ3)
            summ4 = summarize(content)
            with open('run/src/static/textfile2.txt', 'w+') as f:
                f.write(summ4)
            return redirect('/result')
        else:
            return render_template('homepage.html')
Example #4
def textlize(request):
    result = ''
    form = TextInputForm(request.POST)
    if request.method == 'POST':
        if form.is_valid():
            raw_text = form.cleaned_data["text_in"]
            cleaned_text = get_only_text(raw_text)
            selected = form.cleaned_data.get('slize_size')
            if selected:
                size_dict = {
                    "slize_pointfive": 0.005,
                    "slize_one": 0.01,
                    "slize_five": 0.05,
                    "slize_ten": 0.10,
                    "slize_twenty": 0.20,
                    "slize_thirty": 0.30,
                    "slize_forty": 0.40,
                    "slize_fifty": 0.50
                }
                ratio = size_dict.get(selected)
                if ratio is not None:
                    result = summarize(cleaned_text, ratio=ratio)
                    return render(request, 'text/textlize.html', {
                        'form': form,
                        'result': result
                    })
            else:
                result = summarize(cleaned_text, ratio=0.25)
                return render(request, 'text/textlize.html', {
                    'form': form,
                    'result': result
                })
    else:
        form = TextInputForm()
    return render(request, 'text/textlize.html', {'form': form})
Example #5
def find_keywords(filepaths):
    print("Finding Keywords...")
    file_keywords = []
    files = []
    documents = []
    for fp in filepaths:
        ext = os.path.splitext(fp)[-1].lower()
        if ext == ".pdf":
            # keywords_set = clean_keywords(read_pdf(fp))
            text = ''
            pdfFileObj = open(fp, 'rb')
            pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
            num = pdfReader.numPages
            for i in range(num):
                pageObj = pdfReader.getPage(i)
                text += pageObj.extractText()
            pdfFileObj.close()
            if text != '':
                document = summarize(text)
            else:
                # fall back to the metadata description when no text could be extracted
                meta_path = os.path.join(os.path.dirname(fp), 'metadata.csv')
                des = pd.read_csv(meta_path, encoding='unicode_escape')
                try:
                    description = des['Description'][0]
                except KeyError:
                    description = des['Title'][0]
                document = description
            keywords_set = clean_keywords(keywords_from_summary(document))
            files.append(fp)
        elif ext == '.rtf':
            files.append(fp)
            with open(fp, 'r') as file:
                text = file.read()
                document_t = rtf_to_text(text).replace('\n',
                                                       ' ').replace('\t', ' ')
                keywords_set = clean_keywords(
                    keywords_from_summary(summarize(document_t)))
                document = document_t
        elif ext == '.docx':
            text = getText(fp)
            document = text
            keywords_set = clean_keywords(
                keywords_from_summary(summarize(text)))
            files.append(fp)
        else:
            files.append(fp)
            meta_path = os.path.join(os.path.dirname(fp), 'metadata.csv')
            des = pd.read_csv(meta_path, encoding='unicode_escape')
            try:
                description = des['Description'][0]
            except KeyError:
                description = des['Title'][0]
            document = description
            keywords_set = clean_keywords(keywords_from_summary(description))
        file_keywords.append(keywords_set)
        documents.append(document)
    # print(documents)
    return file_keywords, documents
Example #6
def get_summary(url: str) -> str:
    if 'wikipedia' in urllib.parse.urlparse(url).netloc:
        sentences = get_wiki_text(url)
    else:
        sentences = get_article_text(url)
    if len(sentences.split()) > 2000:
        return summarize(sentences, word_count=1000)
    else:
        return summarize(sentences, ratio=0.5)
Example #7
def bm25(input_seq):
    inp = ''.join(input_seq)
    ten = summarize(inp, ratio=0.1, split=True)
    thirty = summarize(inp, ratio=0.3, split=True)
    forty = summarize(inp, ratio=0.4, split=True)
    fifty = summarize(inp, ratio=0.5, split=True)
    
    return ten, thirty, forty, fifty
    
Example #8
    def reumir(self, text):
        res = summarize(text, ratio=0.1)

        print(res)

        res2 = summarize(text, word_count=100)

        print(res2)

        print(keywords(text, ratio=0.1))
Example #9
def _doc_summarizer_eng(table,
                        input_col,
                        hold_cols=None,
                        result_type='summarized_document',
                        new_col_name='summarized_document',
                        ratio=None,
                        num_sentence=1):

    doc_col = table[input_col].values
    len_doc_col = len(doc_col)

    if hold_cols is None:
        out_table = table.copy()
    else:
        out_table = table[hold_cols]

    table_list = []
    for i in range(len_doc_col):
        try:
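            # summarizing with ratio=1 keeps every extractable sentence, so its length gives the sentence count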
            len_doc = len(summarize(doc_col[i], ratio=1, split=True))
        except ValueError as e:
            if str(e) == "input must have more than one sentence":
                summarized_doc = doc_col[i]
                summarized_sents = [doc_col[i]]
                _num_sentence = 1
            else:
                raise
        else:
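            # derive the missing parameter so ratio and num_sentence stay consistent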
            if ratio is not None:
                _num_sentence = np.maximum(int(len_doc * ratio), 1)
                _ratio = ratio
            else:
                _num_sentence = np.minimum(len_doc, num_sentence)
                _ratio = (_num_sentence / len_doc if len_doc != 0 else 1)
            summarized_doc = summarize(doc_col[i], ratio=_ratio, split=False)
            summarized_sents = summarize(doc_col[i], ratio=_ratio, split=True)

        if result_type == 'summarized_document':
            summarized_col = [summarized_doc]
        else:
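            # build (doc_id, sentence) rows by prepending the 1-based document index to each kept sentence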
            summarized_col = np.insert(np.transpose(
                [summarized_sents[0:_num_sentence]]),
                                       0,
                                       i + 1,
                                       axis=1)
        table_list.append(summarized_col)
    result_table = np.concatenate(table_list, axis=0)

    if result_type == 'summarized_document':
        out_table[new_col_name] = result_table
    else:
        out_table = pd.DataFrame(result_table, columns=['doc_id', 'sentence'])
        out_table['doc_id'] = out_table['doc_id'].astype(int)

    return {'out_table': out_table}
Example #10
def get_speech():
    global out
    driver.get("https://www.moneycontrol.com/annual-report/" +
               company_name+"/directors-report/"+cd+"#"+cd)
    
    # director_speech
    director_speech = driver.find_element_by_xpath(
        '//div[@class="report_data"]').text
    d = re.match('.*\\n', director_speech).group()
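    # the first line is the report heading; it is stripped from the body below (note it is reused as a regex pattern)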
    # ds = director_speech.rstrip("\n")
    # ds=re.sub('\n',' ',director_speech)
    ds = re.sub(d, ' ', director_speech)

    driver.get("https://www.moneycontrol.com/annual-report/" +
               company_name+"/chairmans-speech/"+cd+"#"+cd)
    chairman_speech = driver.find_element_by_xpath(
        '//div[@class="report_data"]').text
    c = re.match('.*\\n', chairman_speech).group()
    # cs=re.sub('\n',' ',chairman_speech)
    cs = re.sub(c, " ", chairman_speech)

    ds_keyword_list = keywords(ds, words=20, split=True, lemmatize=True)
    cs_keyword_list = keywords(cs, words=20, split=True, lemmatize=True)
    # keywords from whole chairman's speech
    # print(keyword_list)

    ds_keyword_tags = dict(nltk.pos_tag(ds_keyword_list))
    cs_keyword_tags = dict(nltk.pos_tag(cs_keyword_list))

    ds_keywords_final = [
        word for word in ds_keyword_tags.keys() if ds_keyword_tags[word] == 'NN']
    cs_keywords_final = [
        word for word in cs_keyword_tags.keys() if cs_keyword_tags[word] == 'NN']
    # also need to remove company name if there in the list

    # print(keywords_final[:5])
    # summarization
    ds_summ = summarize(ds, word_count=100)
    cs_summ = summarize(cs, word_count=100)

    driver.get("https://www.moneycontrol.com/annual-report/"+company_name+"/directors-report/"+cd+"#"+cd)
    time.sleep(3)
    page = driver.page_source
    driver.quit()
    soup = BeautifulSoup(page, 'html.parser')
    container = soup.find_all('div', attrs={
    'class':'report_data'})
    soup_string = str(container)


    out = json.dumps([{'summary': cs_summ, 'keywords': cs_keywords_final[:5], 'fullCont':cs}, {
                     'summary': ds_summ, 'keywords': ds_keywords_final[:5], 'fullCont':soup_string}])
    return out
Example #11
def get_summary(text):
    num_of_words = len(text.split())
    print('[info] total size: ' + str(num_of_words))

    if num_of_words >= 5000:
        return summarize(text, 0.05)
    elif num_of_words >= 3000:
        return summarize(text, 0.1)
    elif num_of_words >= 1000:
        return summarize(text, 0.2)
    else:
        return summarize(text, 0.3)
Example #12
def summarizeText(text):
    """summarize text if needed"""
    text_summarized = []
    for txt in text:
        try:
            if (summarize(txt) == ''):
                text_summarized.append(txt)
            else:
                text_summarized.append(summarize(txt))
        except:  # summarize raises ValueError when the text is too short
            text_summarized.append(txt)

    return text_summarized
Example #13
def create_summarized_feature(x):
    str_local = ""
    try:
        if len(x.split()) > 200:
            str_local = summarize(x, word_count=200)
        else:
            str_local = x
    except ValueError:
        str_local_Error = ". ".join(rake_implement(x))
        str_local = summarize(str_local_Error, word_count=200)
        print("Can't summarize this text because the input has only one sentence; "
              "replacing it with the summarized Rake keywords instead.")
    return str_local
Example #14
File: text.py  Project: eddings/cord19-1
def summarize(text, word_count=120):
    if num_sentences(text) > 1:
        try:
            word_count_summary = summarizer.summarize(text,
                                                      word_count=word_count)
        except ValueError:
            return text
        if word_count_summary:
            return word_count_summary
        else:
            ratio_summary = summarizer.summarize(text, ratio=0.2)
            if ratio_summary:
                return ratio_summary
    return text
Example #15
File: app.py  Project: johnteja/nlp
def main():
    """ NLP APP WITH STREAMLIT"""
    st.title("NLP with streamlit")
    st.subheader("Natural Language Processing on the Go")

    # tokenization
    if st.checkbox("show tokens and lemma", False):
        st.markdown("Tokenize your Text")
        message = st.text_area("Enter your text", "Type Here")
        if st.button("Analyze"):
            nlp_result = text_analyzer(message)
            st.json(nlp_result)

    # named entity
    if st.checkbox("show Named Entities", False):
        st.markdown("Extract entities from your Text")
        message = st.text_area("Enter ur text", "Type Here")
        if st.button("Extract"):
            nlp_result = entity_analyzer(message)
            st.json(nlp_result)
    # sentiment analysis
    if st.checkbox("show Sentiment Analysis", False):
        st.markdown("Sentiment of your Text")
        message = st.text_area("Enter text", "Type Here")
        if st.button("Analyze"):
            blob = TextBlob(message)
            result_sentiment = blob.sentiment
            st.success(result_sentiment)

    # text summarization
    if st.checkbox("show Text Summarization", False):
        st.markdown("Summarize of your Text")
        message = st.text_area("Enter text", "Type Here")
        summary_options = st.selectbox("Choose your summarizer",
                                       ("gensim", "sumy"))
        if st.button("Summarize"):
            if summary_options == 'sumy':
                st.text("Using sumy...")
                summary_result = sumy_summarizer(message)
            elif summary_options == 'gensim':
                st.text("Using gensim summarizer")
                summary_result = summarize(message)
            else:
                st.warning("Using default summarizer")
                st.text("Using Gensim")
                summary_result = summarize(message)

            st.success(summary_result)
Example #16
def summarize_text(text, ratio=0.05):
    sentences = sent_tokenize(text)
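    # drop very long sentences (50 or more word tokens) before running the summarizer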
    sentences = list(filter(lambda x: len(word_tokenize(x)) < 50, sentences))
    text = '\n'.join(sentences)
    if len(sentences) > 1:
        return ' '.join(summarize(text, ratio=ratio, split=True))
    return ""
Example #17
def summary_summarize_task(self, pk):
    ''' Retrieve the CSV file from S3,
        read it into a dataframe,
        summarize each document,
        add each summary to a list,
        and add the lists to a dictionary.
    '''
    progress_recorder = ProgressRecorder(
        self)  # create progress recorder object
    doc = Summary_Documents.objects.get(
        pk=pk)  # get the document ref from the database
    documentName = str(doc.document)  # get the name of the doc
    aws_id = os.environ.get('AWS_ACCESS_KEY_ID')  # aws access
    aws_secret = os.environ.get('AWS_SECRET_ACCESS_KEY')  #aws access
    REGION = 'eu-west-1'
    client = boto3.client(
        's3',
        region_name=REGION,
        aws_access_key_id=aws_id,
        aws_secret_access_key=aws_secret
    )  # create the client to retrieve the file from storage
    bucket_name = "doc-sort-file-upload"
    object_key = documentName
    csv_obj = client.get_object(Bucket=bucket_name, Key=object_key)
    body = csv_obj['Body']
    csv_string = body.read().decode('utf-8')
    data = pd.read_csv(StringIO(csv_string))  # read csv into dataframe
    documents = data['content']
    docs_summarized = 0
    docs_not_summarized = 0
    total_docs = 0
    documents_len = []
    summary_len = []
    result = []  # new column to hold result integer (0,1,2)value
    count = 0
    for doc in documents:  # iterate through filtered list
        documents_len.append(len(doc))
        summary = summarize(doc, ratio=0.03)  # get summary
        if not summary:  # summarize can return an empty string when the ratio selects no sentences
            result.append("Document too short")
            docs_not_summarized += 1
        else:
            result.append(summary)
            docs_summarized += 1
        summary_len.append(len(summary))
        total_docs += 1
        progress_recorder.set_progress(count + 1,
                                       len(documents))  # update progress
        count += 1  # update count

    summary_dict = {}

    # Adding list as value
    summary_dict["Result"] = result
    summary_dict["Total_docs"] = total_docs
    summary_dict["Docs_summarized"] = docs_summarized
    summary_dict["Docs_not_summarized"] = docs_not_summarized
    summary_dict["Documents_len"] = documents_len
    summary_dict["Summary_len"] = summary_len

    return summary_dict
Example #18
def get_summary(url, ratio):

    news = Article(url, language='ko')
    news.download()
    news.parse()

    return summarize(news.text, ratio=ratio)
Example #19
def endpoint(event, context):
    body = json.loads(event['body'])
    text = body['text']

    # Strip out questions
    #new_text = [x for x in text.split('\n') if x[-1] != '?']

    # Remove single words
    #new_text = '\n'.join([x for x in new_text if len(x.split()) > 1])

    # Summarize
    summary = summarize(text, 0.5)

    # Strip out questions
    summary = [x for x in summary.split('\n') if x[-1] != '?']

    # Remove single words
    summary = '\n'.join([x for x in summary if len(x.split()) > 1])

    response = {
        "statusCode": 200,
        "headers": {
            'Access-Control-Allow-Origin': '*',
            'Access-Control-Allow-Credentials': True,
        },
        "body": json.dumps(summary)
    }

    return response
Example #20
def gensim_summarize():
    from gensim.summarization.summarizer import summarize
    data = dict(default_data)
    data['message'] = "Summarize long text - Usage: 'text' POST parameter"
    params = {}

    if request.method == 'GET':
        return jsonify(data)

    params = request.form  # postdata

    if not params:
        data['error'] = 'Missing parameters'
        return jsonify(data)

    if 'text' not in params:
        data['error'] = '[text] parameter not found'
        return jsonify(data)

    if 'word_count' not in params:
        word_count = None
    else:
        word_count = int(params['word_count'])

    data['summarize'] = summarize(text=params['text'], word_count=word_count)

    return jsonify(data)
Example #21
    def text2sentences(self, text):
        # summarize the text

        #print(summarize(text, ratio=0.3))
        sentences1 = summarize(text, ratio=0.1)

        return self.kkma.sentences(sentences1)
Example #22
    def scrape_news_article(self, response):
        if 'html' in response.url:
            headline = response.css('h3 > span.titletext::text').extract()
            date_publish = response.css('time::text').extract()
            date_publish = date_publish[0]
            article_text = response.css('.textblock p::text').extract()
            article_text = ''.join(article_text)
            hot_words = self.get_hotwords(article_text)
            top_key_words = [(kw[0] + ', ')
                             for kw in Counter(hot_words).most_common(7)]
            keywords = ''.join(top_key_words)
            author = ''
            subject = response.url.split('/')[3]
            summary = summarize(article_text)
            link = response.url

            headline = list(map(lambda x: x.strip(), headline))

            articleItem = NewsScrapingItem(headline=headline,
                                           date_publish=date_publish,
                                           article_text=article_text,
                                           author=author,
                                           subject=subject,
                                           keywords=keywords,
                                           summary=summary,
                                           link=link)

            yield articleItem
Example #23
def csummary(bot, update, args):
    url_link = ' '.join(args[1:])
    title, raw_content = extract_article(url_link)
    if (title == "0" and raw_content == "0"):
        bot.send_message(chat_id=update.message.chat_id,
                         text="Unable to access the URL")
    elif (raw_content == "0"):
        bot.send_message(chat_id=update.message.chat_id,
                         text="No content is read")
    elif raw_content is None:
        bot.send_message(
            chat_id=update.message.chat_id,
            text="Unable to retrieve content: perhaps it's too long")
    else:
        summary = summarize(raw_content, 0.5, int(args[0]))
        analysed_text = "Title: " + title + "\nSummary:\n" + summary
        original_len = count_words(raw_content)
        summarised_len = count_words(summary)
        if summarised_len >= original_len:
            bot.send_message(
                chat_id=update.message.chat_id,
                text=
                "Failed to summarize: perhaps the content is too short or the sentence chunks are too big"
            )
        else:
            #prevent overflooding
            #text_splitter(bot, update, analysed_text)
            bot.send_message(chat_id=update.message.chat_id,
                             text=analysed_text)
            data_text = "\nOriginal length: " + str(
                original_len) + " words\nSummary length: " + str(
                    summarised_len
                ) + " words\nProportion of original length: " + str(
                    summarised_len / original_len)
            bot.send_message(chat_id=update.message.chat_id, text=data_text)
Example #24
def bbc_dataset_rouge():
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'],
                                      use_stemmer=True)
    test_data = get_bbc_dataset_files()
    generic_baseline, generic_improved, generic_improved_redundancy_removal = None, None, None
    for v in test_data:
        text = v['full']
        summary = v['summary']
        scores = scorer.score(summarize(text, ratio=0.4), summary)
        generic_baseline = update_score(generic_baseline, scores)
        scores = scorer.score(
            improved_summarize(text, ratio=0.4, redundancy_removal=False),
            summary)
        generic_improved = update_score(generic_improved, scores)
        scores = scorer.score(
            improved_summarize(text, ratio=0.4, redundancy_removal=True),
            summary)
        generic_improved_redundancy_removal = update_score(
            generic_improved_redundancy_removal, scores)
    total_news = len(test_data)
    return {
        'generic_baseline':
        get_score_avg(generic_baseline, total_news),
        'generic_improved_redundancy_removal':
        get_score_avg(generic_improved_redundancy_removal, total_news),
        'generic_improved':
        get_score_avg(generic_improved, total_news)
    }
Example #25
def index(request):
    text = ""
    summarized_text = ""
    message = ""
    if request.method == 'POST':
        form = ImageUpload(request.POST, request.FILES)
        if form.is_valid():
            try:
                form.save()
                image = request.FILES['image']
                image = image.name
                path = settings.MEDIA_ROOT
                pathz = path + "/images/" + image

                text = pytesseract.image_to_string(Image.open(pathz))
                text = text.encode("ascii", "ignore")
                text = text.decode()

                # Summary (10% of the original content).
                summarized_text = summarize(text, ratio=0.1)
                os.remove(pathz)
            except:
                message = "Check that the filename has no spaces and that the image actually contains text."

    context = {
        'text': text,
        'summarized_text': summarized_text,
        'message': message
    }
    return render(request, 'formpage.html', context)
Example #26
def summarise(data, context):
    print(context)
    print(data.textPayload)
    event = json.loads(data.textPayload)
    bucket_name = event['bucket']  # json.loads returns a dict, so use key access
    file_name = event['name']
    if "speech-to-text.txt" not in file_name:
        return

    storage_client = storage.Client()

    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.get_blob(file_name)
    text = blob.download_as_string().decode('utf-8')  # download_as_string returns bytes

    body = summarize(text, 0.5)

    filepath_parts = file_name.split('/')
    summary_file = "/"
    summary_file.join(filepath_parts[0:len(filepath_parts - 2)])
    blob = bucket.blob(summary_file)
    blob.upload_from_string(
        body,
        content_type="text/plain"
    )

    response = {
        "statusCode": 200,
        "body": body
    }

    return json.dumps(response, indent=4)
Example #27
def gensim_summarize(text, ratio=0.2, word_count=None):
    """
    Input: a paragraph as a text string, plus either the proportion of sentences
    to keep (ratio) or a word limit (word_count) for the summary.
    Output: the summary as a text string.
    """

    return summarize(text, ratio, word_count)
Example #28
    def parse_url(self, url):

        from bs4 import BeautifulSoup
        from gensim.summarization.summarizer import summarize
        import requests

        headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
        }
        summaries = []

        # URL cleaning
        if url is not None:
            url = url.strip('\'"')

        try:

            page = requests.get(url, headers=headers, stream=True)

            soup = BeautifulSoup(page.content, "lxml")
            # print ('got soup')
            text = ' '.join(map(lambda p: p.text, soup.find_all('p')))
            # print ('got text')
            text_summary = summarize(text)
            # print ('got summary')
        except:
            text_summary = ''

        summaries.append(text_summary)

        return summaries
Example #29
def SKMN(input1, sentcount):
    summarizer = KeysentenceSummarizer(tokenize=komoran_tokenizer,
                                       similarity='textrank',
                                       min_sim=0.3)
    tr = TextRank()
    tr.loadSents(RawSentence(input1),
                 lambda sent: filter(lambda x: x not in stop_words,
                                     preprocessing(input1)))
    tr.build()
    sents = sent_tokenize(input1)
    keysent1 = summarizer.summarize(sents, topk=sentcount)
    keysent2 = sent_tokenize(summarize(input1, ratio=0.1))
    keysent3 = sent_tokenize(tr.summarize(sentcount))
    keysent4 = summarizer.summarize(sents, topk=sentcount * 2)
    Mixlist = mixlist(mixlist(mixlist(keysent1, keysent2), keysent3), keysent4)
    mixlen = len(Mixlist)
    countlist = [0] * mixlen
    countlist = mcountlist(Mixlist, keysent1, countlist)
    countlist = mcountlist(Mixlist, keysent2, countlist)
    countlist = mcountlist(Mixlist, keysent3, countlist)
    countlist = mcountlist(Mixlist, keysent4, countlist)
    maxnum = max(countlist)
    rcountlist = [0] * mixlen
    for i in range(mixlen):
        rcountlist[i] = maxnum - countlist[i] + 1
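    # fewer votes -> larger priority value, so the PriorityQueue below yields the most-voted sentences first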
    from queue import PriorityQueue
    que = PriorityQueue()
    for i in range(mixlen):
        que.put((rcountlist[i], Mixlist[i]))
    return '\n'.join([que.get()[1] for _ in range(sentcount)])
Example #30
    def scrape(self, response):
        articleItem = response.meta['item']
        articleItem['headline'] = response.css('h3::text').extract()
        articleItem['date_publish'] = response.css(
            "time::attr('title')").extract()
        articleItem['article_text'] = response.css(
            '.css-1jftgse p::text').extract()
        article_text = ''.join(articleItem['article_text'])
        articleItem['author'] = response.css(
            ".css-134vnn1 section:nth-child(3) li a span::text").extract()
        articleItem['author'] = ','.join(articleItem['author'])
        txt_blob = TextBlob(article_text)
        articleItem['sentiment'] = txt_blob.sentiment
        key_words = self.get_hotwords(article_text)
        top_key_words = [(kw[0] + ', ')
                         for kw in Counter(key_words).most_common(7)]
        articleItem['keywords'] = ''.join(top_key_words)
        articleItem['summary'] = summarize(article_text, ratio=0.2)
        articleItem['link'] = response.url

        if 'Sekundäre Navigation' in articleItem['headline']:
            articleItem['headline'].remove('Sekundäre Navigation')

        for i in articleItem['date_publish']:
            pattern = re.compile(r'\d{2}.\d{2}.\d{4}')
            result = re.search(pattern, i)
            articleItem['date_publish'] = result.group()

        yield articleItem