def handle(self, *args, **options):
    """Record a new crawl, prepare its output directory and queue the crawl.

    Reads ``options['url'][0]`` as the initial URL and ``options['limit']``
    as the crawl limit for the new ``CrawlInfo`` record, then forwards
    ``options['N'][0]`` as the second argument to ``start_crawl.delay``.

    Raises:
        OSError: if the output directory cannot be created.
    """
    crawl_info = CrawlInfo(init_url=options['url'][0], limit=options['limit'])
    # Save before building the path: the directory name uses crawl_info.id.
    crawl_info.save()

    crawl_dir = "managed_data/crawled_publications/%d" % crawl_info.id
    try:
        os.makedirs(crawl_dir)
    except OSError:
        # A separate exists() check followed by makedirs() can fail if the
        # path appears in between. Attempt the creation and only tolerate
        # the failure when the path is already there.
        if not os.path.exists(crawl_dir):
            raise

    start_crawl.delay(crawl_info.id, options['N'][0])
def crawl_page(request):
    """Render the crawl form, or start a crawl when ``urls`` is submitted.

    Without a ``urls`` GET parameter the ``crawl.html`` template is rendered.
    Otherwise a ``CrawlInfo`` record is saved from the ``urls``, ``limit``,
    ``in_degree_limit`` and ``out_degree_limit`` GET parameters, its output
    directory is created, one task is queued per non-blank submitted line,
    and the client is redirected to ``/crawl/status/<id>/``.

    Lines containing ``/publication/`` are queued with
    ``crawl_publication_page``; every other non-blank line queues
    ``start_crawl`` with the integer ``out_degree_limit``.

    Raises:
        OSError: if the output directory cannot be created.
    """
    raw_urls = request.GET.get('urls')
    if raw_urls is None:
        return render(request, 'crawl.html')

    out_degree_limit = request.GET.get('out_degree_limit')
    crawl_info = CrawlInfo(
        init_url=raw_urls,
        limit=request.GET.get('limit'),
        i_limit=request.GET.get('in_degree_limit'),
        o_limit=out_degree_limit,
    )
    # Save before building the path: the directory name uses crawl_info.id.
    crawl_info.save()

    crawl_dir = "managed_data/crawled_publications/%d" % crawl_info.id
    try:
        os.makedirs(crawl_dir)
    except OSError:
        # A separate exists() check followed by makedirs() can fail if the
        # path appears in between. Attempt the creation and only tolerate
        # the failure when the path is already there.
        if not os.path.exists(crawl_dir):
            raise

    for line in raw_urls.split("\n"):
        # Form submissions may encode newlines as CRLF, which leaves a
        # trailing "\r" after splitting on "\n"; strip it along with any
        # other surrounding whitespace.
        url = line.strip()
        if not url:
            # Blank lines (e.g. a trailing newline) must not queue a crawl.
            continue
        if "/publication/" in url:
            crawl_publication_page.delay(
                crawl_info.id,
                InformationDownloader.get_publication_id_from_url(url),
            )
        else:
            # NOTE(review): int() raises when out_degree_limit is missing or
            # non-numeric; kept lazy so publication-only submissions do not
            # require the parameter. Confirm whether a default is wanted.
            start_crawl.delay(crawl_info.id, int(out_degree_limit))

    return redirect("/crawl/status/%d/" % crawl_info.id)