Пример #1
0
def _read_stripped_upload(uploaded):
    """Return the lines of an uploaded file, each stripped, joined together.

    ``uploaded`` is whatever ``request.FILES.get(...)`` returned; when the
    field was not submitted it is ``None`` and an empty string is returned
    instead of raising ``AttributeError``.
    """
    if uploaded is None:
        return ""
    return "".join(line.strip() for line in uploaded.readlines())


def crawlingController(request):
    """Handle the crawl-configuration form and render the crawl result.

    On POST the submitted crawl specification is stored as a ``Crawl`` row,
    the uploaded login / form-values scripts are flattened to single strings,
    and the specification is handed to ``initializeParams``.  When that call
    yields a truthy state machine, its graph statistics and the workflows
    found for the stored crawl are rendered with ``run.html``; otherwise
    ``error.html`` is rendered.

    Any non-POST request renders ``error.html`` (previously the view fell
    through and returned ``None``, which is not a valid view result).
    """
    if request.method != 'POST':
        return render(request, "error.html")

    print(request.FILES)
    crawling_spec = {
        "login_script": request.FILES.get('login-script', None),
        "login_url": request.POST.get('login-url', ""),
        "form_values_script": request.FILES.get('form-values-script', None),
        "base_address": request.POST.get('base-address', ""),
        "start_url": request.POST.get('start-url', ""),
        "black_list_urls": request.POST.get('black-list-urls', ""),
        "scope_urls": request.POST.get('scope-urls', ""),
        "wait_time": request.POST.get('wait-time', ""),
        "depth": request.POST.get('depth', "100"),
    }

    # Flatten both uploaded scripts; a missing upload is treated as empty.
    login_data = _read_stripped_upload(crawling_spec["login_script"])
    form_data = _read_stripped_upload(crawling_spec["form_values_script"])

    # Debug output only: the parsed form-values markup is not used further.
    bs = BeautifulSoup(form_data)
    print(bs)

    # The model receives the uploaded file objects, not the flattened text.
    obj = Crawl(
        login_script=crawling_spec["login_script"],
        login_url=crawling_spec["login_url"],
        form_values_script=crawling_spec["form_values_script"],
        base_address=crawling_spec["base_address"],
        start_url=crawling_spec["start_url"],
        black_list_urls=crawling_spec["black_list_urls"],
        scope_urls=crawling_spec["scope_urls"],
        wait_time=crawling_spec["wait_time"],
        depth=crawling_spec["depth"],
    )
    obj.save()

    # initializeParams gets the script contents as text instead of file objects.
    crawling_spec["login_script"] = login_data
    crawling_spec["form_values_script"] = form_data
    fsm = initializeParams(crawling_spec)
    if not fsm:
        return render(request, "error.html")

    returnJsonGraph(fsm.graph)
    number_of_nodes = fsm.graph.number_of_nodes()
    number_of_edges = len(fsm.graph.edges())
    nodes = getNodes(fsm.graph)
    edges = getEdges(fsm.graph)

    # Re-fetch the row saved above by primary key rather than "latest id":
    # a concurrent request could otherwise hand us somebody else's crawl.
    crawl = Crawl.objects.get(pk=obj.pk)
    pathSourcetoSink(fsm, crawl)
    print(crawl.id)
    workflows = getWfs(crawl.id)
    print("workflows")
    print(workflows)

    context = {
        'num_nodes': number_of_nodes,
        'num_edges': number_of_edges,
        'nodes': nodes,
        'edges': edges,
        'workflows': workflows,
    }
    return render(request, 'run.html', context)
Пример #2
0
    def crawl_brands():
        """
        Fetch every officially recorded brand from eBay, with its models.

        Each entry returned by the brands endpoint is replaced by a dict
        holding the original value under 'count' and the result of the
        per-brand model request under 'models'.  The result is stored on the
        site object.  A ``Crawl`` record is saved either way: its status is
        'SUCCESS', or the first 1024 characters of the traceback on failure.
        """
        logging.info("Crawl cell phone brands from eBay")

        site = EbaySpider.get_site()

        crawl = Crawl()
        crawl.site = EbaySpider.name
        crawl.action = "crawl brands"
        crawl.status = 'SUCCESS'

        try:
            brands = EbaySpider.extract_data_from_ajax_request(
                EbaySpider.URLS['data']['brands'])
            for brand_name in brands:
                # Swap the bare count for a per-brand record, then fill it in.
                entry = {'count': brands[brand_name]}
                brands[brand_name] = entry

                query = urllib.urlencode({'Brand': brand_name})
                url = EbaySpider.URLS['data']['model'] + "&" + query
                print(url)

                entry['models'] = EbaySpider.extract_data_from_ajax_request(url)
            site.brands = brands
        except Exception:
            # Best effort: record the failure on the crawl instead of raising.
            failure = traceback.format_exc()
            logging.info(">>>> Fail to crawl brands from ebay:%s" % failure)
            crawl.status = failure[:1024]

        site.save()
        crawl.save()
Пример #3
0
def save_crawl(url):
    """Replace any stored crawl for ``url`` with a fresh one and return its id.

    The existing record is removed via ``remove_crawl(url)``, then a new
    ``Crawl`` stamped with the current time is added and committed.

    The session is always closed, and rolled back if adding or committing
    fails; the original exception is re-raised unchanged.
    """
    remove_crawl(url)
    session = Session()
    try:
        crawl = Crawl(url=url, ts=datetime.now())
        session.add(crawl)
        session.commit()
        # Read the id while the session is still open.
        return crawl.id
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()