def handle(self, *args, **options):
        """Create a publication crawl record and queue its crawl task.

        Reads ``options['url'][0]`` as the starting URL and
        ``options['limit']`` as the crawl limit; ``options['N'][0]`` is
        forwarded as the second argument of ``start_crawl``.
        """
        seed_url = options['url'][0]
        record = CrawlInfo(init_url=seed_url, limit=options['limit'])
        record.save()

        # Each crawl gets its own directory, named after the saved record's id.
        publications_dir = "managed_data/crawled_publications/%d" % record.id
        if not os.path.exists(publications_dir):
            os.makedirs(publications_dir)

        start_crawl.delay(record.id, options['N'][0])
    def handle(self, *args, **options):
        """Create an author crawl record and queue the author-page task.

        Reads ``options['url'][0]`` as the starting URL, ``options['limit']``
        as the crawl limit and ``options['branch_factor'][0]`` as the
        record's ``o_limit``; ``i_limit`` is fixed at 0.
        """
        record = CrawlInfo(
            init_url=options['url'][0],
            limit=options['limit'],
            type="author",
            i_limit=0,
            o_limit=options['branch_factor'][0],
        )
        record.save()

        # Each crawl gets its own directory, named after the saved record's id.
        authors_dir = "managed_data/crawled_authors/%d" % record.id
        if not os.path.exists(authors_dir):
            os.makedirs(authors_dir)

        # The researcher id is derived from the URL stored on the record.
        researcher_id = InformationDownloader.get_researcher_id_from_url(record.init_url)
        crawl_author_pages.delay(record.id, researcher_id)
Пример #3
0
def crawl_author_page(request):
    """Start an author crawl from the ``urls`` query parameter, or show the form.

    When ``urls`` is present, a ``CrawlInfo`` record of type ``'author'`` is
    saved (its ``init_url`` keeps the raw parameter value), a directory for
    the crawl is created, one ``crawl_author_pages`` task is queued per
    non-blank line of ``urls`` and the client is redirected to the crawl
    status page.  Without ``urls`` the ``crawl_authors.html`` template is
    rendered.
    """
    raw_urls = request.GET.get('urls')
    if raw_urls is None:
        return render(request, 'crawl_authors.html')

    # Browsers submit multi-line form fields with CRLF line endings, so a
    # plain split("\n") leaves a trailing "\r" on every URL and turns blank
    # or trailing lines into empty "URLs".  Split on any line ending, trim
    # surrounding whitespace and skip blanks instead.
    urls = [line.strip() for line in raw_urls.splitlines() if line.strip()]

    crawl_info = CrawlInfo(init_url=raw_urls, limit=request.GET.get('limit'),
                           i_limit=0, o_limit=request.GET.get('branch_factor'), type='author')
    crawl_info.save()

    # Each crawl gets its own directory, named after the saved record's id.
    crawl_dir = "managed_data/crawled_authors/%d" % crawl_info.id
    if not os.path.exists(crawl_dir):
        os.makedirs(crawl_dir)

    for url in urls:
        author_id = InformationDownloader.get_researcher_id_from_url(url)
        crawl_author_pages.delay(crawl_info.id, author_id)

    return redirect("/crawl/status/%d/" % crawl_info.id)
Пример #4
0
def crawl_page(request):
    """Start a publication crawl from the ``urls`` query parameter, or show the form.

    When ``urls`` is present, a ``CrawlInfo`` record is saved (its
    ``init_url`` keeps the raw parameter value), a directory for the crawl is
    created and one task is queued per non-blank line of ``urls``: lines
    containing ``/publication/`` go to ``crawl_publication_page``, all others
    trigger ``start_crawl``.  The client is then redirected to the crawl
    status page.  Without ``urls`` the ``crawl.html`` template is rendered.
    """
    raw_urls = request.GET.get('urls')
    if raw_urls is None:
        return render(request, 'crawl.html')

    # Browsers submit multi-line form fields with CRLF line endings, so a
    # plain split("\n") leaves a trailing "\r" on every URL and turns blank
    # or trailing lines into empty "URLs".  Split on any line ending, trim
    # surrounding whitespace and skip blanks instead.
    urls = [line.strip() for line in raw_urls.splitlines() if line.strip()]

    crawl_info = CrawlInfo(init_url=raw_urls, limit=request.GET.get('limit'),
                           i_limit=request.GET.get('in_degree_limit'),
                           o_limit=request.GET.get('out_degree_limit'))
    crawl_info.save()

    # Each crawl gets its own directory, named after the saved record's id.
    crawl_dir = "managed_data/crawled_publications/%d" % crawl_info.id
    if not os.path.exists(crawl_dir):
        os.makedirs(crawl_dir)

    for url in urls:
        if "/publication/" in url:
            publication_id = InformationDownloader.get_publication_id_from_url(url)
            crawl_publication_page.delay(crawl_info.id, publication_id)
        else:
            # NOTE(review): start_crawl receives only the crawl id and the
            # out-degree limit, not the URL, so every non-publication line
            # queues a task with identical arguments - confirm this is
            # intended.  int() raises here if 'out_degree_limit' is missing
            # or not numeric.
            start_crawl.delay(crawl_info.id, int(request.GET.get('out_degree_limit')))

    return redirect("/crawl/status/%d/" % crawl_info.id)