def generate_summary(topic, words): """Return summary of the topic subjected to word limit.""" print "Generate Summary %s" % topic def query_links(topic): query = urllib.urlencode({ "Query": "'" + topic + "'", "NewsSortBy": "'Relevance'", "$format": "json" }) url = API_ROOT + "?%s" % query r = requests.get(url, auth=('', API_KEY)) return r query_job = gevent.spawn(query_links, topic) gevent.joinall([query_job], 5000) result = query_job.value.json() links = [x["Url"] for x in result["d"]["results"]] # take only the first 4 links links = links[:4] lines = [] def download_and_clean(url): try: print "Download " + url article = Article(url) article.download() print "Parse " + url article.parse() text = article.text top_image = article.top_image except: print "Failed to get " + url text = "" top_image = "" return text, top_image jobs = [gevent.spawn(download_and_clean, url) for url in links[:4]] gevent.joinall(jobs, timeout=10) lines = [ job.value[0] for job in jobs if job.value and job.value[0] and len(job.value[0]) > 100 ] top_images = [job.value[1] for job in jobs if job.value and job.value[1]] gc.collect() try: summary = sumbasic.orig(lines, words) except ValueError: print "Generate Summary failed for " + str(links) traceback.print_exc() summary = "Generating summary failed" print "Generate Summary complete for " + str(links) return summary, top_images, links
def generate_summary(topic, words): """Return summary of the topic subjected to word limit.""" print("Generate Summary %s" % topic) def query_links(topic): query = urllib.parse.urlencode({ "q": "'" + topic + "'", "count": 4 }) headers = { "Ocp-Apim-Subscription-Key": API_KEY } url = API_ROOT + "?%s" % query r = requests.get(url, headers=headers) return r query_job = gevent.spawn(query_links, topic) gevent.joinall([query_job],5000) result = query_job.value.json() links = [x["url"] for x in result["value"]] names = [x["name"] for x in result["value"]] lines = [] def download_and_clean(url): try: print("Download " + url) article = Article(url) article.download() print("Parse " + url) article.parse() text = article.text top_image = article.top_image except: print("Failed to get " + url) text = "" top_image = "" return text, top_image jobs = [gevent.spawn(download_and_clean, url) for url in links[:4]] gevent.joinall(jobs, timeout=10) lines = [job.value[0] for job in jobs if job.value and job.value[0] and len(job.value[0]) > 100] top_images = [job.value[1] for job in jobs if job.value and job.value[1]] gc.collect() try: summary = sumbasic.orig(lines, words) except ValueError: print("Generate Summary failed for " + str(links)) traceback.print_exc() summary = "Generating summary failed" print("Generate Summary complete for " + str(links)) return summary, top_images, links, names, topic, words
# Variant 3 (Python 2): like variant 1, but the link extraction is commented
# out and replaced with a single hard-coded article URL, which exercises the
# download/summarize path without depending on the API result.
# Imports as in variant 1.


def generate_summary(topic, words):
    """Return a summary of the topic, subject to the word limit."""
    print "Generate Summary %s" % topic

    def query_links(topic):
        query = urllib.urlencode({
            "Query": "'" + topic + "'",
            "NewsSortBy": "'Relevance'",
            "$format": "json",
        })
        url = API_ROOT + "?%s" % query
        r = requests.get(url, auth=('', API_KEY))
        return r

    query_job = gevent.spawn(query_links, topic)
    gevent.joinall([query_job], timeout=5000)  # note: gevent timeouts are in seconds
    result = query_job.value.json()
    # links = [x["Url"] for x in result["d"]["results"]]
    # take only the first 4 links
    # links = links[:4]
    links = ["http://www.reuters.com/article/us-somalia-blast-idUSKBN0F51XO20140630"]

    lines = []

    def download_and_clean(url):
        try:
            print "Download " + url
            article = Article(url)
            article.download()
            print "Parse " + url
            article.parse()
            text = article.text
            top_image = article.top_image
        except Exception:
            print "Failed to get " + url
            text = ""
            top_image = ""
        return text, top_image

    jobs = [gevent.spawn(download_and_clean, url) for url in links[:4]]
    gevent.joinall(jobs, timeout=10)
    # Keep only articles that downloaded and have substantial body text.
    lines = [
        job.value[0] for job in jobs
        if job.value and job.value[0] and len(job.value[0]) > 100
    ]
    top_images = [job.value[1] for job in jobs if job.value and job.value[1]]
    gc.collect()

    try:
        summary = sumbasic.orig(lines, words)
    except ValueError:
        print "Generate Summary failed for " + str(links)
        traceback.print_exc()
        summary = "Generating summary failed"
    print "Generate Summary complete for " + str(links)
    return summary, top_images, links
# Variant 4 (Python 2): a simpler sequential variant that takes the article
# links directly and downloads them one at a time, without gevent.
import gc

from newspaper import Article

import sumbasic  # local SumBasic summarizer module


def generate_summary(links, words):
    """Return a summary built from the given article links, subject to the word limit."""
    print "Generate Summary " + str(links)
    lines = []
    for url in links[:4]:
        try:
            print "Download " + url
            article = Article(url)
            article.download()
            print "Parse " + url
            article.parse()
            # Skip pages whose extracted body text is too short to be a real article.
            if len(article.text) > 100:
                lines.append(article.text)
        except Exception:
            print "Failed to get " + url
            continue
    gc.collect()
    summary = sumbasic.orig(lines, words)
    return summary
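# sumbasic is a project-local module, so its internals are not shown here.
# Below is a minimal sketch of the classic SumBasic procedure (Nenkova &
# Vanderwende) that an orig(lines, words) entry point like the one called in
# every variant above could plausibly implement; the function name, sentence
# splitting, and tokenization are illustrative assumptions, not the
# project's actual code.
import re
from collections import Counter


def sumbasic_orig(lines, word_limit):
    # Split the article texts into sentences and tokenize into lowercase words.
    sentences = []
    for text in lines:
        sentences.extend(s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip())
    tokens = [w.lower() for w in re.findall(r"\w+", " ".join(sentences))]
    total = float(len(tokens))
    prob = {w: c / total for w, c in Counter(tokens).items()}

    summary, used = [], 0
    while sentences and used < word_limit:
        # Score each sentence by the mean probability of its words.
        def score(sentence):
            ws = [w.lower() for w in re.findall(r"\w+", sentence)]
            return sum(prob[w] for w in ws) / len(ws) if ws else 0.0

        best = max(sentences, key=score)
        sentences.remove(best)
        summary.append(best)
        used += len(re.findall(r"\w+", best))
        # Square the probability of covered words to discourage redundancy.
        for w in set(w.lower() for w in re.findall(r"\w+", best)):
            prob[w] *= prob[w]
    return " ".join(summary)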