from flask import request, render_template
from textteaser import TextTeaser


def summarize():
    title = request.form['title']
    text = request.form['text']
    # Collapse newlines and runs of whitespace into single spaces.
    text = " ".join(text.replace("\n", " ").split())
    tt = TextTeaser()
    sentences = tt.summarize(title, text, 5)
    return render_template('summarize.html', title=title, text=text,
                           summary=sentences)

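# A minimal, hypothetical sketch of wiring the handler above into a Flask app
# and exercising it with the built-in test client. The '/summarize' route, the
# app object, and the form payload are assumptions, not part of the original
# snippet; a summarize.html template must also exist for render_template to
# succeed.
from flask import Flask

app = Flask(__name__)
app.add_url_rule('/summarize', view_func=summarize, methods=['POST'])

with app.test_client() as client:
    resp = client.post('/summarize', data={
        'title': 'Example title',
        'text': 'First sentence.\nSecond sentence.\nThird sentence.',
    })
    print(resp.status_code)
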
import json
from pprint import pprint

from textteaser import TextTeaser


def summary(event, context):
    tt = TextTeaser()
    # stackAPI_return:
    #   concept
    #   code
    #   title
    #   is_code
    data = json.load(open('data.json'))
    pprint(data)

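# The handler above only loads and pretty-prints data.json. A hedged sketch of
# a possible completion, assuming each record carries the 'title' and 'concept'
# fields named in the stackAPI_return comment; the record layout and the
# Lambda-style return value are assumptions, not part of the original snippet.
def summary_sketch(event, context):
    tt = TextTeaser()
    with open('data.json') as f:
        data = json.load(f)
    results = []
    for item in data:
        # 'title' and 'concept' are hypothetical field names.
        results.append(tt.summarize(item['title'], item['concept']))
    return {'statusCode': 200, 'body': json.dumps(results)}
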
from flask import request, jsonify
from textteaser import TextTeaser


def summarize():
    title = request.form.get('title')
    text = request.form.get('text')
    tt = TextTeaser()
    sentences = tt.summarize(title, text)
    summary = {"sentences": []}
    for sentence in sentences:
        print(sentence)
        summary["sentences"].append(sentence)
    return jsonify(summary)

# compile_messages, indent_tagged, and utils.get_emails_with_users are
# project-local helpers, assumed to be importable from the surrounding package.
def summarize(user, room_name, msg_limit=None, days_limit=None,
              hours_limit=None, min_limit=None, title=None):
    if title is None:
        title = '%s Summary' % room_name
    text = compile_messages(user, room_name, msg_limit, days_limit,
                            hours_limit, min_limit)
    tt = TextTeaser()
    return indent_tagged(tt.summarize(title, text),
                         utils.get_emails_with_users(user, room_name).values())

from flask import request, jsonify
from newspaper import Article
from textteaser import TextTeaser


def hello():
    url = request.args.get('url', '')
    print(url)
    # Fetch and parse the article with newspaper3k.
    article = Article(url)
    article.download()
    article.parse()
    title = article.title
    print(title)
    text = article.text
    print(text)
    tt = TextTeaser()
    sentences = tt.summarize(title, text)
    for sentence in sentences:
        print(sentence)
    return jsonify(sentences)

# authfromSwellRT, mytagger, and post2swellRT are project-local helpers;
# app is the Flask application and session is a module-level global.
def tags():
    if request.method == 'POST':
        global session
        if not session:
            session = authfromSwellRT()
        data = request.get_json()
        app.logger.info(data)
        # Initialisation for context
        wave_id = data['waveid']
        description = data['data']['text']
        name = data['data']['name']
        # Generating tags
        tags = json.dumps(mytagger(data['data']['text'], 10),
                          default=lambda x: str(x).strip('"\''))
        # Generating a summary: keep the first 4 sentences
        tt = TextTeaser()
        sentences = tt.summarize(name, description)
        summary = json.dumps(sentences[:4])
        # For logs
        app.logger.info(tags)
        app.logger.info(summary)
        post2swellRT(session, wave_id, tags, summary)
        return json.dumps(True)
    else:
        return json.dumps("Hello from Teem Tag",
                          default=lambda x: str(x).strip('"\''))

import requests
from bs4 import BeautifulSoup
from readability import Document
from textteaser import TextTeaser


def summarize_url(url, arc90=False):
    # arc90 extracts the main article content without comments and other
    # clutter; it's the approach used in Safari's Reader view, Flipboard,
    # and Treesaver.
    # https://stackoverflow.com/questions/4672060/web-scraping-how-to-identify-main-content-on-a-webpage
    CHAR_LIMIT = 100000  # skip pages with so much text they would bog us down
    # TODO: cache results so that we avoid querying the same thing again
    # (URLs can serve as primary keys)
    if not url:
        return
    r = requests.get(url)
    tt = TextTeaser()
    if arc90:
        doc = Document(r.text)
        title = doc.title()
        soup = BeautifulSoup(doc.summary(), "html.parser")
    else:
        soup = BeautifulSoup(r.text, "html.parser")
        title = soup.title.text
    text = ' '.join(map(lambda p: p.text, soup.find_all('p')))
    if len(text) < CHAR_LIMIT:
        summary = ' '.join(tt.summarize(title, text))
    else:
        summary = 'Text exceeds the ' + str(CHAR_LIMIT) + ' character limit.'
    return {
        'title': title,
        'url': url,
        'length': len(text),
        'summary': summary,
        'minutes': len(text.split(' ')) // 200,  # ~200 words per minute
    }

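# A short usage sketch for summarize_url; the URL below is a placeholder, not
# from the original snippet.
result = summarize_url('https://example.com/some-article', arc90=True)
if result:
    print(result['title'])
    print(result['summary'])
    print('~%d minute read' % result['minutes'])
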
import os
import sys
import json

# textteaser
from textteaser import TextTeaser
tt = TextTeaser()

# gensim
from gensim.summarization.summarizer import summarize

# sumy
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.reduction import ReductionSummarizer
from sumy.summarizers.kl import KLSummarizer

LANGUAGE = "chinese"
SENTENCES_COUNT = 3
stemmer = Stemmer(LANGUAGE)
tokenizer = Tokenizer(LANGUAGE)

# bert-extractive-summarizer
from summarizer import SingleModel
model = SingleModel(model='bert-base-chinese', vector_size=768)


def overload(body, minl=10, maxl=600):
    # Treat each line as its own sentence instead of running the default
    # sentence handler (useful for pre-segmented Chinese text).
    return body.split('\n')


model.process_content_sentences = overload

import jieba

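# A minimal sketch applying the setup above to a Chinese document; the sample
# text is a placeholder and get_stop_words is an extra sumy import, neither
# taken from the original file.
from sumy.utils import get_stop_words

doc_text = "第一句话。第二句话。第三句话。第四句话。"
parser = PlaintextParser.from_string(doc_text, tokenizer)
lexrank = LexRankSummarizer(stemmer)
lexrank.stop_words = get_stop_words(LANGUAGE)
for sentence in lexrank(parser.document, SENTENCES_COUNT):
    print(sentence)

# jieba segments Chinese text into words, e.g. for downstream tokenization.
print(" ".join(jieba.cut(doc_text)))
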
import sys

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.sum_basic import SumBasicSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.utils import get_stop_words
from textteaser import TextTeaser

# LANGUAGE and SENTENCES_COUNT are assumed to be defined as in the setup
# snippet above.


def textteaser_test():
    summary = open("summary_list.txt", "a", encoding='utf-8-sig')
    sys.stdout = summary  # redirect all prints into the summary file

    # obtain the input article from a URL
    #url = "http://www.nytimes.com/2016/11/17/us/politics/donald-trump-administration-twitter.html?ref=politics"
    #parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))

    # obtain the input article from a plain text file
    parser = PlaintextParser.from_file("input_sample.txt", Tokenizer(LANGUAGE))

    # define the language; by default it is English
    stemmer = Stemmer(LANGUAGE)

    # SumBasic algorithm
    summarizer = SumBasicSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    print("SumBasic:")
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
    print("\n")

    # LSA algorithm
    summarizer = LsaSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    print("Latent Semantic Analysis:")
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
    print("\n")

    # TextRank algorithm
    summarizer = TextRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    print("TextRank:")
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
    print("\n")

    # LexRank algorithm
    summarizer = LexRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    print("LexRank:")
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
    print("\n")

    # Featured-LexRank algorithm: rank sentences with TextTeaser first,
    # then run LexRank over TextTeaser's output.
    with open('input_sample.txt', 'r', encoding='utf-8-sig') as f:
        title = f.readline()
    with open('input_sample.txt', 'r', encoding='utf-8-sig') as f:
        text = f.read()
    tt = TextTeaser()
    sentences = tt.summarize(title, text)
    file = open("tt.txt", "w", encoding='utf-8-sig')
    print("Featured-LexRank:")
    for sentence in sentences:
        file.write("%s\n" % sentence)
    file.close()
    parser = PlaintextParser.from_file("tt.txt", Tokenizer(LANGUAGE))
    summarizer = LexRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
    print("\n")

    summary.close()

#!/usr/bin/python
# -*- coding: utf-8 -*-

import sys
import os

sys.path.append("textteaser")
sys.path.insert(1, "..")

from textteaser import TextTeaser

# Article source: Qilu Yidian, a news site run by Shandong's Qilu Evening News.
# The title translates to "New housing-market rules in Jinan, Shandong:
# prioritize first-time buyers; all-cash purchases may not be demanded."
title = "山东济南楼市新规:优先满足刚需购房 不得要求全款买房"
text = '''来自山东齐鲁晚报下属的齐鲁壹点网站的消息,4月26日,山东省济南市城乡建设委员会发布《关于进一步规范商品房销售行为的通知》(以下简称“《通知》”),该通知主要针对各房地产开发企业、房产中介及承销机构,意在规范相关方的商品房销售行为。
《通知》要求,销售商品房时,应优先满足无住房记录的刚性购房者需求。不得要求购房人一次性付款或一次性付款优先选房,不得拒绝购房人正常使用住房公积金或商业个人贷款购房,不得要求住宅销售捆绑车位及地下室。
住宅项目申请商品房预售许可证时,应提交销售方案。销售方案包括:房源信息、销售方式、付款方式、意向购房者组成(30%首付、60%首付、全款客户占比情况)。销售方案审批通过后,向社会公示。
商品住宅项目形象进度满足预售要求的,应当一次性申请预售。
在取得《商品房预售许可证》后,应本着公开、公平、公正的原则对外销售。一次性公开全部准售房源,公示销售进度控制表,在销售现场醒目位置明码标价,并告知所有购房者。
对于违反规定的相关房地产开发企业,将依法责令立即整改,拒不整改的,依法予以行政处罚,记入房地产开发企业信用档案,向社会公示。整改完成前,暂停项目合同网签及后续预售审批项目的办理。
《通知》发布的背景则为,近期,济南市城乡建设委员会接到多份来自“12345”市民热线转办及市民群众来电来信,反映济南市部分热点区域住宅项目存在全款购房、全款优先选房、拒绝使用商业贷款或个人公积金贷款等歧视刚性需求购房者,以及住宅销售捆绑车位、地下室销售等行为,这些行为严重扰乱了房地产市场秩序,造成了极其恶劣的社会影响。
此前中国山东网曾报道,被国家明令叫停的设置购房门槛的情况又在济南出现。为此,济南市住建委,住建委的工作人员向中国山东网明确表示,选择全款购买还是贷款购买是购房人的基本权利,开发商不得刻意设置购房门槛限制购买,更不允许以捆绑地下室或者捆绑车位的形式进行销售,此类行为一经查处,济南市住建委将对该楼盘进行包括吊销预售证,拉入诚信黑名单等一系列处罚,维护济南房地产市场的平稳。'''

# This fork's TextTeaser constructor takes a custom stop-word file path.
stopWordsPath = os.path.dirname(
    os.path.abspath(__file__)) + '/textteaser/trainer/stopWords.txt'

tt = TextTeaser(stopWordsPath, text)
sentences = tt.summarize(title, text)

for sentence in sentences:
    print(sentence)

def generate_summary_textteaser(self, input_text):
    # Unlike the snippets above, this constructor takes an argument,
    # presumably an API key or client identifier for the hosted TextTeaser
    # service.
    tt = TextTeaser('TextTeaserApiTest')
    return tt.summarize(text=input_text, title='Test', url=None)