# Django management command handler (a method of a BaseCommand subclass):
# records the crawl request, then queues the publication crawl as a Celery task.
def handle(self, *args, **options):
    crawl_info = CrawlInfo(init_url=options['url'][0], limit=options['limit'])
    crawl_info.save()
    # Create the per-crawl output directory, keyed by the CrawlInfo primary key.
    data_dir = "managed_data/crawled_publications/%d" % crawl_info.id
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    start_crawl.delay(crawl_info.id, options['N'][0])
# Management command handler for author crawls: i_limit=0 disables the in-degree
# limit, and the branch factor caps the out-degree.
def handle(self, *args, **options):
    crawl_info = CrawlInfo(init_url=options['url'][0], limit=options['limit'],
                           type="author", i_limit=0,
                           o_limit=options['branch_factor'][0])
    crawl_info.save()
    data_dir = "managed_data/crawled_authors/%d" % crawl_info.id
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    # Resolve the researcher id from the submitted URL and queue the crawl task.
    author_id = InformationDownloader.get_researcher_id_from_url(crawl_info.init_url)
    crawl_author_pages.delay(crawl_info.id, author_id)
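# The indexing above (options['url'][0], options['N'][0], options['branch_factor'][0])
# implies those arguments are declared list-valued with nargs=1, while 'limit' is read
# directly as a scalar. A minimal sketch of the add_arguments the author command would
# need under that assumption (the --limit default is also an assumption):
def add_arguments(self, parser):
    parser.add_argument('url', nargs=1, type=str)            # read as options['url'][0]
    parser.add_argument('branch_factor', nargs=1, type=int)  # read as options['branch_factor'][0]
    parser.add_argument('--limit', type=int, default=1000)   # read directly as options['limit']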
# View: accepts newline-separated author URLs via GET, creates one CrawlInfo for the
# whole batch, and queues a crawl task per author; otherwise renders the submission form.
def crawl_author_page(request):
    if request.GET.get('urls') is not None:
        urls = request.GET.get('urls').split("\n")
        crawl_info = CrawlInfo(init_url=request.GET.get('urls'),
                               limit=request.GET.get('limit'),
                               i_limit=0,
                               o_limit=request.GET.get('branch_factor'),
                               type='author')
        crawl_info.save()
        data_dir = "managed_data/crawled_authors/%d" % crawl_info.id
        if not os.path.exists(data_dir):
            os.makedirs(data_dir)
        for url in urls:
            author_id = InformationDownloader.get_researcher_id_from_url(url)
            crawl_author_pages.delay(crawl_info.id, author_id)
        return redirect("/crawl/status/%d/" % crawl_info.id)
    return render(request, 'crawl_authors.html')
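# For illustration, a request hitting this view might look like the line below; the
# /crawl/authors/ path and the profile URLs are assumptions, since the URLconf is not
# shown. Newline-separated author URLs are percent-encoded as %0A in the query string:
#
#   GET /crawl/authors/?urls=https://example.org/profile/a%0Ahttps://example.org/profile/b&limit=50&branch_factor=5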
# View for publication crawls: publication URLs are queued directly, while any other
# URL falls back to a generic crawl seeded from the CrawlInfo record.
def crawl_page(request):
    if request.GET.get('urls') is not None:
        urls = request.GET.get('urls').split("\n")
        crawl_info = CrawlInfo(init_url=request.GET.get('urls'),
                               limit=request.GET.get('limit'),
                               i_limit=request.GET.get('in_degree_limit'),
                               o_limit=request.GET.get('out_degree_limit'))
        crawl_info.save()
        data_dir = "managed_data/crawled_publications/%d" % crawl_info.id
        if not os.path.exists(data_dir):
            os.makedirs(data_dir)
        for url in urls:
            if "/publication/" in url:
                crawl_publication_page.delay(
                    crawl_info.id,
                    InformationDownloader.get_publication_id_from_url(url))
            else:
                start_crawl.delay(crawl_info.id, int(request.GET.get('out_degree_limit')))
        return redirect("/crawl/status/%d/" % crawl_info.id)
    return render(request, 'crawl.html')
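# All four blocks repeat the same exists-then-makedirs pattern, which is racy if two
# requests create the same directory concurrently. On Python 3, os.makedirs(path,
# exist_ok=True) folds the check and the creation into one call; a small hypothetical
# helper such as ensure_crawl_dir (not part of the original code) would remove the
# duplication:
import os

def ensure_crawl_dir(crawl_info, kind):
    # kind is "crawled_publications" or "crawled_authors".
    path = "managed_data/%s/%d" % (kind, crawl_info.id)
    os.makedirs(path, exist_ok=True)
    return path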