Exemplo n.º 1
0
def WebscraperTask(job_id):
    """
    Celery task that performs a scraping job and caches its results.

    Loads the Job with primary key ``job_id`` together with its terms and
    sites, runs the crawl through the controller returned by
    ``init_controller()``, then stores the controller's query output in a new
    ``Result`` row linked to the job.

    Args:
        job_id: Primary key of the Job to run.

    Raises:
        Job.DoesNotExist: If no Job has the given primary key.
    """
    # Lazy "%s" logging arguments: formatting is skipped when DEBUG is off, and
    # a non-integer job_id (e.g. a string key) cannot raise the TypeError that
    # a "%d" placeholder would.
    logger.debug("Entering dispatch_wordscraper with job ID: %s", job_id)

    # Obtain all required crawler arguments
    job = Job.objects.get(pk=job_id)
    word_list = [(term.term_id, term.term)
                 for term in Term.objects.filter(job=job)]
    site_list = [(site.site_id, site.url, job.max_depth, 'English')
                 for site in Site.objects.filter(job=job)]

    # Initialize the crawling process
    logger.debug(
        "Configuring crawl controller with job_id: %s, terms: %s, sites: %s",
        job_id, word_list, site_list)
    controller = init_controller()
    controller.crawl(job_id, site_list, word_list)
    logger.debug("Crawl complete for job: %s", job_id)

    # Cache the results in the database so the results are dated as close to the
    # crawl date as possible.
    results = controller.query(site_list, word_list)
    result_cache = Result(job=job, output=results)
    result_cache.save()
Exemplo n.º 2
0
def job_result(request, id):
    """
    Retrieve the results of the requested Word Scraper job in a CSV file.

    Args:
        request: The incoming HTTP request (not otherwise read here).
        id: Value matched against ``Job.result_id`` to locate the job.

    Returns:
        HttpResponse with content type ``text/csv``, served as an attachment
        named ``<job_id>_results.csv``. A header row is followed by one row
        per result key, ordered by site_id, then depth, then the key itself.

    Raises:
        Http404: If no matching Job exists, or the job has no cached Result
            yet.
    """
    try:
        job = Job.objects.get(result_id=id)
    except Job.DoesNotExist:
        raise Http404

    # The output is read from the cached Result row, so no term/site lists or
    # crawl controller are needed to serve the download.
    try:
        results = Result.objects.get(job=job).output
    except Result.DoesNotExist:
        # Nothing cached for this job (yet): report "not found" rather than
        # letting the lookup failure surface as a server error.
        raise Http404

    response = HttpResponse(content_type='text/csv')
    response['Content-Disposition'] = 'attachment; filename="%s_results.csv"' % job.job_id

    writer = csv.writer(response)
    writer.writerow(['url', 'accessed', 'depth', 'domain', 'site_id', 'status', 'term_matches'])

    def sort_key(key):
        # Group rows by site, then by crawl depth, then by the key itself so
        # the ordering is deterministic.
        entry = results[key]
        return (entry['site_id'], entry['depth'], key)

    for key in sorted(results, key=sort_key):
        writer.writerow(format_row(key, results))
    return response