Example #1
from flask import request, render_template
from textteaser import TextTeaser


def summarize():
    title = request.form['title']
    text = request.form['text']
    # collapse newlines and runs of whitespace into single spaces
    text = " ".join(text.replace("\n", " ").split())
    tt = TextTeaser()
    sentences = tt.summarize(title, text, 5)
    return render_template('summarize.html',
                           title=title,
                           text=text,
                           summary=sentences)
Example #2
import json
from pprint import pprint

from textteaser import TextTeaser


def summary(event, context):
    tt = TextTeaser()  # instantiated but never used in this snippet

    # stackAPI_return:
    # concept
    # code
    # title
    # is_code

    data = json.load(open('data.json'))
    pprint(data)
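The handler above instantiates TextTeaser but never calls it. A minimal sketch of what it presumably intends, assuming data.json holds a list of records with title and text fields (a hypothetical schema):

import json
from textteaser import TextTeaser

def summary(event, context):
    tt = TextTeaser()
    with open('data.json') as f:
        data = json.load(f)  # assumed: [{"title": ..., "text": ...}, ...]
    summaries = {item['title']: tt.summarize(item['title'], item['text'])
                 for item in data}
    return {'statusCode': 200, 'body': json.dumps(summaries, default=str)}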
Example #3
from flask import request, jsonify
from textteaser import TextTeaser


def summarize():
    title = request.form.get('title')
    text = request.form.get('text')

    tt = TextTeaser()

    sentences = tt.summarize(title, text)
    summary = {"sentences": []}
    for sentence in sentences:
        print(sentence)
        summary["sentences"].append(sentence)

    return jsonify(summary)
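A quick way to exercise the endpoint above, assuming the handler is registered as POST /summarize on a Flask app named app (route path and app name are assumptions):

resp = app.test_client().post('/summarize',
                              data={'title': 'Some title', 'text': 'Some text'})
print(resp.get_json())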
Example #4
from textteaser import TextTeaser
# compile_messages, indent_tagged, and utils are project-local helpers


def summarize(user,
              room_name,
              msg_limit=None,
              days_limit=None,
              hours_limit=None,
              min_limit=None,
              title=None):
    if title is None:
        title = '%s Summary' % room_name
    text = compile_messages(user, room_name, msg_limit, days_limit,
                            hours_limit, min_limit)
    tt = TextTeaser()
    return indent_tagged(tt.summarize(title, text),
                         utils.get_emails_with_users(user, room_name).values())
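A hedged usage sketch (the user object and room name are placeholders):

# summarize the last 24 hours of the 'general' room
digest = summarize(current_user, 'general', hours_limit=24)
print(digest)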
Example #5
from flask import request, jsonify
from newspaper import Article
from textteaser import TextTeaser


def hello():
    url = request.args.get('url', '')
    print(url)

    # download and parse the article at the given URL
    article = Article(url)
    article.download()
    article.parse()

    title = article.title
    print(title)
    text = article.text
    print(text)

    tt = TextTeaser()

    sentences = tt.summarize(title, text)

    for sentence in sentences:
        print(sentence)

    return jsonify(sentences)
Example #6
import json

from flask import request
from textteaser import TextTeaser
# authfromSwellRT, mytagger, post2swellRT, app, and session are
# project-level names defined elsewhere in this module


def tags():

    if request.method == 'POST':

        global session

        if not session:
            session = authfromSwellRT()

        data = request.get_json()
        app.logger.info(data)

        # Initialisation for context
        wave_id = data['waveid']
        description = data['data']['text']
        name = data['data']['name']

        # Generating tags
        tags = json.dumps(mytagger(description, 10),
                          default=lambda x: str(x).strip('"\''))

        # Generating a summary of 4 sentences
        tt = TextTeaser()
        sentences = tt.summarize(name, description)
        summary = json.dumps(sentences[:4])

        # For logs
        app.logger.info(tags)
        app.logger.info(summary)

        post2swellRT(session, wave_id, tags, summary)

        return json.dumps(True)
    else:
        return json.dumps("Hello from Teem Tag",
                          default=lambda x: str(x).strip('"\''))
Example #7
import requests
from bs4 import BeautifulSoup
from readability import Document
from textteaser import TextTeaser


def summarize_url(url, arc90=False):
    # arc90 extracts the main article content without comments and other
    # page clutter; the same idea powers Safari's Reader view, Flipboard,
    # and Treesaver.
    # https://stackoverflow.com/questions/4672060/web-scraping-how-to-identify-main-content-on-a-webpage

    CHAR_LIMIT = 100000  # skip pages whose text is too long to summarize quickly
    # TODO: cache results so the same URL is not queried twice
    # (URLs could serve as primary keys)

    if not url:
        return

    r = requests.get(url)
    tt = TextTeaser()

    if arc90:
        doc = Document(r.text)
        title = doc.title()
        soup = BeautifulSoup(doc.summary(), "html.parser")
    else:
        soup = BeautifulSoup(r.text, "html.parser")
        title = soup.title.text

    text = ' '.join(map(lambda p: p.text, soup.find_all('p')))

    if len(text) < CHAR_LIMIT:
        summary = ' '.join(tt.summarize(title, text))
    else:
        summary = 'Text exceeds the ' + str(CHAR_LIMIT) + ' character limit.'

    return {
        'title': title,
        'url': url,
        'length': len(text),
        'summary': summary,
        'minutes': len(text.split(' ')) // 200
    }
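A quick smoke test of the function above (the URL is a placeholder):

result = summarize_url('https://example.com/some-article', arc90=True)
if result:
    print(result['title'])
    print(result['summary'])
    print('~%d min read' % result['minutes'])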
Example #8
import os
import sys
import json

# textteaser
from textteaser import TextTeaser
tt = TextTeaser()
# gensim
from gensim.summarization.summarizer import summarize
# sumy
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.reduction import ReductionSummarizer
from sumy.summarizers.kl import KLSummarizer
LANGUAGE = "chinese"
SENTENCES_COUNT = 3
stemmer = Stemmer(LANGUAGE)
tokenizer = Tokenizer(LANGUAGE)
# bert-extractive-summarizer
from summarizer import SingleModel
model = SingleModel(model='bert-base-chinese', vector_size=768)
def overload(body, minl=10, maxl=600):
    # split on newlines instead of sentence tokenization; the minl/maxl
    # length filters are not applied here
    return body.split('\n')
model.process_content_sentences = overload


import jieba
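The prelude above only wires up the libraries. A minimal sketch of driving the sumy summarizers it imports, reusing the module-level stemmer, tokenizer, and SENTENCES_COUNT (the input text is a placeholder):

parser = PlaintextParser.from_string("...", tokenizer)  # "..." = document text
for SummarizerCls in (LsaSummarizer, TextRankSummarizer,
                      LexRankSummarizer, ReductionSummarizer, KLSummarizer):
    s = SummarizerCls(stemmer)
    for sentence in s(parser.document, SENTENCES_COUNT):
        print(sentence)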
Example #9
def textteaser_test():

    summary = open("summary_list.txt", "a", encoding='utf-8-sig')
    sys.stdout = summary  # redirect all print() output into the summary file

    # obtain the input article from url
    #url = "http://www.nytimes.com/2016/11/17/us/politics/donald-trump-administration-twitter.html?ref=politics"
    #parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))

    # obtain the input article from plain text files
    parser = PlaintextParser.from_file("input_sample.txt", Tokenizer(LANGUAGE))

    # stemmer for the configured language (English by default)
    stemmer = Stemmer(LANGUAGE)

    # SumBasic algorithm
    summarizer = SumBasicSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    print("SumBasic:")
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
    print("\n")

    # LSA algorithm
    summarizer = LsaSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    print("Latent Semantic Analysis:")
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
    print("\n")

    # TextRank algorithm
    summarizer = TextRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    print("TextRank:")
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
    print("\n")

    # LexRank algorithm
    summarizer = LexRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    print("LexRank:")
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
    print("\n")

    # Featured-LexRank algorithm: summarize with TextTeaser first,
    # then run LexRank over the TextTeaser output
    with open('input_sample.txt', 'r', encoding='utf-8-sig') as f:
        title = f.readline()
        f.seek(0)
        text = f.read()
    tt = TextTeaser()

    sentences = tt.summarize(title, text)
    print("Featured-LexRank:")
    with open("tt.txt", "w", encoding='utf-8-sig') as f:
        for sentence in sentences:
            f.write("%s\n" % sentence)

    parser = PlaintextParser.from_file("tt.txt", Tokenizer(LANGUAGE))
    summarizer = LexRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
    print("\n")

    summary.close()
Example #10
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
import os
sys.path.append("textteaser")
sys.path.insert(1, "..")
from textteaser import TextTeaser

# article source: the Qilu Yidian (齐鲁壹点) news site, run by the Qilu Evening News
title = "山东济南楼市新规:优先满足刚需购房 不得要求全款买房"
text = '''来自山东齐鲁晚报下属的齐鲁壹点网站的消息,4月26日,山东省济南市城乡建设委员会发布《关于进一步规范商品房销售行为的通知》(以下简称“《通知》”),该通知主要针对各房地产开发企业、房产中介及承销机构,意在规范相关方的商品房销售行为。  《通知》要求,销售商品房时,应优先满足无住房记录的刚性购房者需求。不得要求购房人一次性付款或一次性付款优先选房,不得拒绝购房人正常使用住房公积金或商业个人贷款购房,不得要求住宅销售捆绑车位及地下室。  住宅项目申请商品房预售许可证时,应提交销售方案。销售方案包括:房源信息、销售方式、付款方式、意向购房者组成(30%首付、60%首付、全款客户占比情况)。销售方案审批通过后,向社会公示。  商品住宅项目形象进度满足预售要求的,应当一次性申请预售。  在取得《商品房预售许可证》后,应本着公开、公平、公正的原则对外销售。一次性公开全部准售房源,公示销售进度控制表,在销售现场醒目位置明码标价,并告知所有购房者。  对于违反规定的相关房地产开发企业,将依法责令立即整改,拒不整改的,依法予以行政处罚,记入房地产开发企业信用档案,向社会公示。整改完成前,暂停项目合同网签及后续预售审批项目的办理。  《通知》发布的背景则为,近期,济南市城乡建设委员会接到多份来自“12345”市民热线转办及市民群众来电来信,反映济南市部分热点区域住宅项目存在全款购房、全款优先选房、拒绝使用商业贷款或个人公积金贷款等歧视刚性需求购房者,以及住宅销售捆绑车位、地下室销售等行为,这些行为严重扰乱了房地产市场秩序,造成了极其恶劣的社会影响。  此前中国山东网曾报道,被国家明令叫停的设置购房门槛的情况又在济南出现。为此,济南市住建委,住建委的工作人员向中国山东网明确表示,选择全款购买还是贷款购买是购房人的基本权利,开发商不得刻意设置购房门槛限制购买,更不允许以捆绑地下室或者捆绑车位的形式进行销售,此类行为一经查处,济南市住建委将对该楼盘进行包括吊销预售证,拉入诚信黑名单等一系列处罚,维护济南房地产市场的平稳。'''

stopWordsPath = os.path.dirname(
    os.path.abspath(__file__)) + '/textteaser/trainer/stopWords.txt'
tt = TextTeaser(stopWordsPath, text)

sentences = tt.summarize(title, text)

for sentence in sentences:
    print(sentence)
Example #11
    def generate_summary_textteaser(self, input_text):
        # 'TextTeaserApiTest' is presumably an API key for a TextTeaser
        # wrapper whose constructor accepts one
        tt = TextTeaser('TextTeaserApiTest')
        return tt.summarize(text=input_text, title='Test', url=None)