Example #1
def start_crawler(config):
    # keep the config available to crawl() and record the target itself
    global this_config
    this_config = config
    target = this_config['target']
    target_link_obj = Link(url=target, parent='', depth=0)
    add_obj_to_session(target_link_obj)

    # kick off the crawl from the target page at depth 0
    crawl(target, target, 0)
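
For reference, a minimal usage sketch: 'target' and 'depth' are the only config keys the examples read, so a call could look like the following (the URL and depth value are placeholders, and Link, add_obj_to_session, request_url and get_child_links are assumed to be defined elsewhere in the project).

# hypothetical config; 'target' and 'depth' are the only keys the examples use
config = {
    'target': 'http://example.com',  # placeholder start page
    'depth': '2',                    # maximum crawl depth, parsed with int()
}
start_crawler(config)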
Example #2
from urlparse import urljoin, urlparse

def crawl(url, parent_url, parent_link_level):
    res = request_url(url)
    if res:
        html_doc = res.read()
        child_links_list = get_child_links(html_doc)
        for link in child_links_list:
            # make the url absolute if it is not already (i.e. it has no scheme)
            link = link.encode('utf8')
            if urlparse(link).scheme == '':
                link = urljoin(parent_url, link)
            # store the link object (the link, its parent, and its depth/level)
            link_obj = Link(url=link, parent=parent_url, depth=parent_link_level + 1)
            add_obj_to_session(link_obj)
            # recurse until the configured maximum depth is reached
            try:
                max_depth = int(this_config['depth'])
            except (KeyError, ValueError):
                print 'Error in crawl(), check that depth is a number in your config'
                return
            if parent_link_level < max_depth:
                crawl(link, url, parent_link_level + 1)
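
The examples call request_url and get_child_links without showing them. Below is a minimal sketch of what they might look like, assuming urllib2 for fetching and the stdlib HTMLParser for pulling href values out of <a> tags; the names come from the examples above, but the bodies are assumptions, not the original implementations.

import urllib2
from HTMLParser import HTMLParser

def request_url(url):
    # return a file-like response, or None if the request fails
    try:
        return urllib2.urlopen(url, timeout=10)
    except (urllib2.URLError, ValueError):
        return None

class _LinkExtractor(HTMLParser):
    # collects the href attribute of every <a> tag it sees
    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.links.append(value)

def get_child_links(html_doc):
    parser = _LinkExtractor()
    parser.feed(html_doc)
    return parser.links

In a real project these would more likely be built on requests and BeautifulSoup, but the stdlib versions keep the sketch dependency-free.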