def get_preview_website(url, target_test, title, response): #TODO: REFACTORRR!!!!!!
    #TODO: check content-type, if video or article, do not run summarizer
    """Return a joined sentence summary for *url*, or '' when none is available.

    Tries pyteaser.SummarizeUrl first; if that yields nothing and the HTTP
    response looks like an HTML page, falls back to extracting the article
    body and summarizing it with the page title.

    url             -- page URL to summarize
    target_test     -- truthy flag gating summarization; also forwarded to
                       valid_summary()/valid_summary_debugger()
    title           -- page title used for the fallback Summarize() call
    response        -- requests-style response object (has .ok, .headers,
                       .text) — presumably from a prior GET; verify at caller
    """
    # Some domains are known to summarize badly; skip them outright.
    if get_domain(url) in SKIP_DOMAIN_SUMMARY:
        return ''
    if target_test:
        sentence_array = []
        try:
            sentence_array = pyteaser.SummarizeUrl(url)
        except ZeroDivisionError as err:
            # pyteaser divides by word/sentence counts, which can be zero.
            print('in get_preview_website, problem with pyteaser.SummarizeUrl, submission.url: %s (%r)' % (url, err))
        except Exception:
            # Narrowed from a bare except: still best-effort, but no longer
            # swallows SystemExit/KeyboardInterrupt.
            print('in get_preview_website, problem with pyteaser.SummarizeUrl, submission.url: %s' % url)
        # Fallback: summarize the extracted article body ourselves, but only
        # for successful HTML responses. .get() avoids a KeyError when the
        # content-type header is missing.
        if (not sentence_array and title and response and response.ok
                and 'text/html' in response.headers.get('content-type', '')):
            body_text = extract_article(response.text)
            try:
                sentence_array = pyteaser.Summarize(title, body_text)
            except ZeroDivisionError:
                print('in get_preview_website, problem with pyteaser.Summarize, submission.url: %s' % url)
        if sentence_array:
            valid_summary_debugger(sentence_array, target_test)
            if valid_summary(sentence_array, target_test):
                return join_sentence_array(sentence_array)
    return ''
def grab(location, keywords, publication, publication_date, title):
    """Fetch and summarize the article at *location* via Goose.

    location         -- article URL
    keywords         -- comma-separated keyword string (split on ',')
    publication      -- publication name, passed through to the output
    publication_date -- publication date, passed through to the output
    title            -- article title, passed through to the output

    Returns a JSON string with title/keywords/publication/description/
    article/summary fields, or None when extraction fails.
    """
    goose = Goose()
    try:
        raw_article = goose.extract(url=location)
        description = raw_article.meta_description.encode("utf8")
        article = raw_article.cleaned_text.encode("utf8")
        split_keywords = keywords.split(',')
        summary = pyteaser.SummarizeUrl(location)
        output = json.dumps({
            "title": title,
            "keywords": split_keywords,
            "publication": publication,
            "publication_date": publication_date,
            "description": description,
            "source": location,
            "article": article,
            "summary": summary
        })
        # Fixed typo ("Succesfully") and switched to lazy %-args so the
        # string is only formatted when the log level is enabled.
        logging.warning('Successfully grabbed through Goose.')
        logging.warning('Location: %s, Publication: %s', location, publication)
        return output
    except Exception:
        # Narrowed from a bare except; logging.exception records the
        # traceback so failures are diagnosable instead of silent.
        logging.exception('Unable to get article through Goose.')
        logging.critical('Location: %s, Publication: %s', location, publication)
        return None
def main():
    """Comment summaries on eligible Reddit submissions.

    Iterates submissions, skipping any already processed or whose score
    falls outside (thresh_min, thresh_max), and posts at most
    comments_per_run summary comments per invocation.
    """
    submissions = getSubmissions()
    done = getDone()
    comment_count = 0  # how many comments made this round
    for submission in submissions:
        if comment_count >= comments_per_run:
            break
        # Renamed from `id`, which shadowed the builtin; chained comparison
        # replaces the two-clause threshold test.
        score = submission.ups - submission.downs
        if submission.id not in done and thresh_min < score < thresh_max:
            # Mark as done before posting so a failed comment is not retried.
            putDone(submission.id)
            sentences = pyteaser.SummarizeUrl(submission.url)
            if sentences is not None:  # SummarizeUrl returns None on failure
                comment_count += 1
                comment = formComment(sentences, submission)
                submission.add_comment(comment)
                print(comment)
def streaming_summarize_url(input_dict, widget, stream=None):
    """Widget entry point: summarize the URL in input_dict['url'].

    input_dict -- dict with a 'url' key naming the page to summarize
    widget     -- unused here; part of the widget-call convention
    stream     -- unused here; part of the widget-call convention

    Returns {'summary': <joined sentences>}; the summary is '' when
    summarization fails.
    """
    import pyteaser
    summaries = pyteaser.SummarizeUrl(input_dict['url'])
    # SummarizeUrl returns None on failure; guard so we don't crash with
    # TypeError on " ".join(None).
    output_dict = {}
    output_dict['summary'] = " ".join(summaries) if summaries else ""
    return output_dict
def __fetchUrl(self, url):
    """Fetch *url* and return its pyteaser summary as one space-joined string.

    Any exception raised here (network failure, summarizer error) is
    deliberately left for run() to catch and handle.
    """
    self.logger.debug('%s fetching page: %s' % (self.name, url))
    # TODO: use GENERIC_HEADERS and TIMEOUT
    sentences = pyteaser.SummarizeUrl(url)
    return " ".join(sentences)