def handle(self, *args, **options):
        """Create a crawl record from the command options and queue its crawl.

        Builds and saves a ``CrawlInfo`` from ``options['url'][0]`` and
        ``options['limit']``, makes sure the per-crawl directory under
        ``managed_data/crawled_publications/`` exists, then dispatches
        ``start_crawl`` asynchronously with the new record's id and
        ``options['N'][0]``.
        """
        record = CrawlInfo(init_url=options['url'][0], limit=options['limit'])
        record.save()

        # The record must be saved first: the directory is named after its id.
        output_dir = "managed_data/crawled_publications/%d" % record.id
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        start_crawl.delay(record.id, options['N'][0])
示例#2
0
def crawl_page(request):
    """Start a crawl from the submitted URL list, or show the crawl form.

    When the ``urls`` GET parameter is absent, or contains no non-blank
    lines, ``crawl.html`` is rendered. Otherwise a ``CrawlInfo`` is saved
    from the ``urls``, ``limit``, ``in_degree_limit`` and
    ``out_degree_limit`` GET parameters, the per-crawl directory under
    ``managed_data/crawled_publications/`` is created if missing, one task
    is queued per submitted URL, and the client is redirected to
    ``/crawl/status/<id>/``.

    Raises:
        TypeError, ValueError: from ``int()`` when a URL without
            ``/publication/`` is submitted and ``out_degree_limit`` is
            missing or not an integer.
    """
    raw_urls = request.GET.get('urls')
    if raw_urls is None:
        return render(request, 'crawl.html')

    # Textarea values arrive with "\r\n" line breaks; splitlines() handles
    # both "\n" and "\r\n" so no stray "\r" ends up in a URL. Blank lines are
    # dropped so a trailing newline does not queue a task for an empty URL.
    urls = [line.strip() for line in raw_urls.splitlines() if line.strip()]
    if not urls:
        # Nothing to crawl: do not create an empty crawl record or directory.
        return render(request, 'crawl.html')

    # init_url keeps the raw submitted text, exactly as before.
    crawl_info = CrawlInfo(init_url=raw_urls, limit=request.GET.get('limit'),
                           i_limit=request.GET.get('in_degree_limit'),
                           o_limit=request.GET.get('out_degree_limit'))
    crawl_info.save()

    # The record must be saved first: the directory is named after its id.
    output_dir = "managed_data/crawled_publications/%d" % crawl_info.id
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for url in urls:
        if "/publication/" in url:
            publication_id = InformationDownloader.get_publication_id_from_url(url)
            crawl_publication_page.delay(crawl_info.id, publication_id)
        else:
            # NOTE(review): start_crawl is given only the crawl id and the
            # limit, not the URL, so several non-publication URLs queue the
            # same call repeatedly - confirm this is intended.
            # NOTE(review): int() fails here if out_degree_limit is missing
            # or non-numeric; consider validating before saving the record.
            start_crawl.delay(crawl_info.id, int(request.GET.get('out_degree_limit')))

    return redirect("/crawl/status/%d/" % crawl_info.id)