Example #1
import gc
import traceback
import urllib.parse

import gevent
import requests
from newspaper import Article

import sumbasic  # the project's summarization module

# API_ROOT and API_KEY are module-level configuration for the news search API.


def generate_summary(topic, words):
    """Return a summary of the topic, subject to the word limit."""
    print("Generate Summary %s" % topic)

    def query_links(topic):
        # Build the search request; the API expects quoted values and JSON output.
        query = urllib.parse.urlencode({
            "Query": "'" + topic + "'",
            "NewsSortBy": "'Relevance'",
            "$format": "json"
        })
        url = API_ROOT + "?%s" % query
        r = requests.get(url, auth=('', API_KEY))
        return r

    # Run the search in a greenlet and wait up to 5 seconds for it.
    query_job = gevent.spawn(query_links, topic)
    gevent.joinall([query_job], timeout=5)
    result = query_job.value.json()
    # Take only the first 4 links.
    links = [x["Url"] for x in result["d"]["results"]][:4]

    def download_and_clean(url):
        # Fetch and parse one article; fall back to empty values on failure.
        try:
            print("Download " + url)
            article = Article(url)
            article.download()
            print("Parse " + url)
            article.parse()
            text = article.text
            top_image = article.top_image
        except Exception:
            print("Failed to get " + url)
            text = ""
            top_image = ""
        return text, top_image

    # Download the articles concurrently; keep only bodies long enough to summarize.
    jobs = [gevent.spawn(download_and_clean, url) for url in links]
    gevent.joinall(jobs, timeout=10)
    lines = [
        job.value[0] for job in jobs
        if job.value and job.value[0] and len(job.value[0]) > 100
    ]
    top_images = [job.value[1] for job in jobs if job.value and job.value[1]]

    gc.collect()
    try:
        summary = sumbasic.orig(lines, words)
    except ValueError:
        print("Generate Summary failed for " + str(links))
        traceback.print_exc()
        summary = "Generating summary failed"
    print("Generate Summary complete for " + str(links))
    return summary, top_images, links
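
A minimal invocation sketch, assuming the imports above and that gevent's monkey patching runs before any greenlet is spawned (needed for requests to yield cooperatively); the topic and word count are illustrative:

from gevent import monkey
monkey.patch_all()  # apply as early as possible, before other network imports

summary, top_images, links = generate_summary("renewable energy", 120)  # illustrative arguments
print(summary)
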
Example #2
def generate_summary(topic, words):
    """Return a summary of the topic, subject to the word limit."""
    # Imports and configuration as in Example #1; here the API key is sent as a
    # subscription header rather than via basic auth.
    print("Generate Summary %s" % topic)

    def query_links(topic):
        # Ask the search API for the 4 most relevant articles on the topic.
        query = urllib.parse.urlencode({
            "q": "'" + topic + "'",
            "count": 4
        })
        headers = {
            "Ocp-Apim-Subscription-Key": API_KEY
        }
        url = API_ROOT + "?%s" % query
        r = requests.get(url, headers=headers)
        return r

    # Run the search in a greenlet and wait up to 5 seconds for it.
    query_job = gevent.spawn(query_links, topic)
    gevent.joinall([query_job], timeout=5)
    result = query_job.value.json()
    links = [x["url"] for x in result["value"]]
    names = [x["name"] for x in result["value"]]

    def download_and_clean(url):
        # Fetch and parse one article; fall back to empty values on failure.
        try:
            print("Download " + url)
            article = Article(url)
            article.download()
            print("Parse " + url)
            article.parse()
            text = article.text
            top_image = article.top_image
        except Exception:
            print("Failed to get " + url)
            text = ""
            top_image = ""
        return text, top_image

    # Download the articles concurrently; keep only bodies long enough to summarize.
    jobs = [gevent.spawn(download_and_clean, url) for url in links[:4]]
    gevent.joinall(jobs, timeout=10)
    lines = [
        job.value[0] for job in jobs
        if job.value and job.value[0] and len(job.value[0]) > 100
    ]
    top_images = [job.value[1] for job in jobs if job.value and job.value[1]]

    gc.collect()
    try:
        summary = sumbasic.orig(lines, words)
    except ValueError:
        print("Generate Summary failed for " + str(links))
        traceback.print_exc()
        summary = "Generating summary failed"
    print("Generate Summary complete for " + str(links))
    return summary, top_images, links, names, topic, words
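
The parsing above assumes a JSON response with a top-level "value" array whose items carry "url" and "name" fields. A small offline check of just that parsing step, using a hypothetical payload that mirrors only the fields the code reads:

sample = {
    "value": [
        {"name": "Example headline", "url": "http://example.com/story"},
        {"name": "Another headline", "url": "http://example.com/other"},
    ]
}
links = [x["url"] for x in sample["value"]]
names = [x["name"] for x in sample["value"]]
assert links[0] == "http://example.com/story" and names[1] == "Another headline"
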
Example #3
def generate_summary(topic, words):
    """Return a summary of the topic, subject to the word limit."""
    # Imports and configuration as in Example #1.
    print("Generate Summary %s" % topic)

    def query_links(topic):
        # Build the search request; the API expects quoted values and JSON output.
        query = urllib.parse.urlencode({
            "Query": "'" + topic + "'",
            "NewsSortBy": "'Relevance'",
            "$format": "json"
        })
        url = API_ROOT + "?%s" % query
        r = requests.get(url, auth=('', API_KEY))
        return r

    query_job = gevent.spawn(query_links, topic)
    gevent.joinall([query_job], timeout=5)
    result = query_job.value.json()
    # Link extraction is disabled in this variant in favor of a fixed test URL:
    # links = [x["Url"] for x in result["d"]["results"]]
    # take only the first 4 links
    # links = links[:4]
    links = ["http://www.reuters.com/article/us-somalia-blast-idUSKBN0F51XO20140630"]

    def download_and_clean(url):
        # Fetch and parse one article; fall back to empty values on failure.
        try:
            print("Download " + url)
            article = Article(url)
            article.download()
            print("Parse " + url)
            article.parse()
            text = article.text
            top_image = article.top_image
        except Exception:
            print("Failed to get " + url)
            text = ""
            top_image = ""
        return text, top_image

    jobs = [gevent.spawn(download_and_clean, url) for url in links[:4]]
    gevent.joinall(jobs, timeout=10)
    lines = [
        job.value[0] for job in jobs
        if job.value and job.value[0] and len(job.value[0]) > 100
    ]
    top_images = [job.value[1] for job in jobs if job.value and job.value[1]]

    gc.collect()
    try:
        summary = sumbasic.orig(lines, words)
    except ValueError:
        print("Generate Summary failed for " + str(links))
        traceback.print_exc()
        summary = "Generating summary failed"
    print("Generate Summary complete for " + str(links))
    return summary, top_images, links
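
If the commented-out extraction is restored, note that gevent.joinall can return on timeout before the greenlet finishes, leaving query_job.value as None; a guarded version might look like this sketch (not the original author's code):

response = query_job.value
if response is None:
    links = []  # query timed out or failed; nothing to summarize
else:
    result = response.json()
    links = [x["Url"] for x in result["d"]["results"]][:4]
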
Example #4
def generate_summary(links, words):
    """Return a summary built from the given links, subject to the word limit."""
    # Imports as in Example #1; this variant downloads sequentially and takes
    # the link list directly instead of querying a search API.
    print("Generate Summary " + str(links))
    lines = []
    for url in links[:4]:
        try:
            print("Download " + url)
            article = Article(url)
            article.download()
            print("Parse " + url)
            article.parse()
            # Keep only bodies long enough to summarize.
            if len(article.text) > 100:
                lines.append(article.text)
        except Exception:
            print("Failed to get " + url)
            continue

    gc.collect()

    # Unlike the variants above, a ValueError from sumbasic.orig is not caught here.
    summary = sumbasic.orig(lines, words)
    return summary
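
A minimal usage sketch for this variant, which takes the link list directly; the URLs below are placeholders:

links = [
    "http://example.com/first-article",
    "http://example.com/second-article",
]
print(generate_summary(links, 100))  # may raise ValueError if no article downloads succeed
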