def crawl():
    crawler = Crawler()
    param_did = input("Enter the 'did' value from this user's cookie first: ")
    crawler.set_did(param_did)
    uid = input("Enter the user id to crawl this time: ")
    crawler.add_to_list(uid)
    crawler.crawl()
    input("Press Enter to exit......")
def crawl(param_did):
    # Non-interactive variant: the caller supplies the 'did' cookie value directly.
    crawler = Crawler(False)
    crawler.set_did(param_did)
    crawler.crawl()
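# The two crawl() helpers above assume a Crawler class with roughly the
# interface sketched here. This stub is a hypothetical illustration inferred
# from the calls above (the constructor flag name and the crawl body are
# guesses); the real class lives elsewhere in the project, and the Crawler
# used by main() below is a different class with its own constructor.
class Crawler(object):

    def __init__(self, interactive=True):
        # 'interactive' is an assumed name for the boolean passed as Crawler(False)
        self.interactive = interactive
        self.did = None
        self.uids = []

    def set_did(self, did):
        # Remember the 'did' cookie value used to authenticate requests
        self.did = did

    def add_to_list(self, uid):
        # Queue a user id to be crawled
        self.uids.append(uid)

    def crawl(self):
        # Fetch data for every queued uid (real fetching logic omitted here)
        for uid in self.uids:
            pass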
def main():
    opts, args = parse_options()
    url = args[0]

    if opts.links:
        getLinks(url)
        raise SystemExit, 0

    depth_limit = opts.depth_limit
    confine_prefix = opts.confine
    exclude = opts.exclude

    sTime = time.time()

    print >> sys.stderr, "Crawling %s (Max Depth: %d)" % (url, depth_limit)
    crawler = Crawler(url, depth_limit, confine_prefix, exclude)
    crawler.crawl()

    # create log directory
    if not os.path.exists(LOG_DIRECTORY):
        os.makedirs(LOG_DIRECTORY)

    num_links = 0
    if opts.out_urls:
        for url_crawl in crawler.urls_seen:
            parsed_uri = urlparse.urlparse(url_crawl)
            # only base url
            if not re.match(".*%s" % parsed_uri.netloc.replace('www.', ''), url):  # and not opts.skip_host:
                continue
            if not opts.out_path:
                print url_crawl
            else:
                domain = '{uri.netloc}'.format(uri=parsed_uri)
                log_file = "%s/%s.log" % (LOG_DIRECTORY, domain)
                logging.basicConfig(
                    filename=log_file,
                    filemode='w+',
                    level=logging.DEBUG,
                    format='%(asctime)-15s [%(levelname)s] (%(threadName)-10s) %(message)s',
                    datefmt='%m/%d/%Y %I:%M:%S %p')
                try:
                    directory = opts.out_path + domain + '/'
                    path = directory + toSeoFriendly(url_crawl, 50) + '.html'
                    if not os.path.exists(directory):
                        os.makedirs(directory)
                    r = requests.get(url_crawl, allow_redirects=True, timeout=30)
                    if not os.path.exists(path):
                        target = open(path, 'w')
                        target.write(r.text.encode('utf-8'))
                        target.close()
                        num_links = num_links + 1
                        logging.debug("Saving: {0}".format(url_crawl))
                except IOError as e:
                    logging.error("IOError: {0} {1}".format(url, e.message))
                except Exception as e:
                    logging.error("Error({0}): {1}".format(url, e.__doc__, e.message), exc_info=True)

    if opts.out_links:
        print "\n".join([str(l) for l in crawler.links_remembered])

    if opts.out_dot:
        d = DotWriter()
        d.asDot(crawler.links_remembered)

    eTime = time.time()
    tTime = eTime - sTime

    print >> sys.stderr, "Found: %d" % num_links
    print >> sys.stderr, "Stats: (%d/s after %0.2fs)" % (int(math.ceil(float(num_links) / tTime)), tTime)
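# main() reads opts.links, opts.depth_limit, opts.confine, opts.exclude,
# opts.out_urls, opts.out_path, opts.out_links and opts.out_dot from
# parse_options(). The sketch below shows one plausible optparse-based
# definition of that helper; the flag spellings, defaults and help texts
# are assumptions for illustration, not the project's actual CLI.
import optparse

def parse_options():
    parser = optparse.OptionParser(usage="usage: %prog [options] <url>")
    parser.add_option("-l", "--links", action="store_true", default=False,
                      help="only print the links found on <url> and exit")
    parser.add_option("-d", "--depth-limit", dest="depth_limit", type="int",
                      default=1, help="maximum crawl depth")
    parser.add_option("-c", "--confine", dest="confine", default=None,
                      help="confine the crawl to URLs with this prefix")
    parser.add_option("-x", "--exclude", dest="exclude", action="append",
                      default=[], help="exclude URLs matching this prefix")
    parser.add_option("-u", "--out-urls", dest="out_urls", action="store_true",
                      default=False, help="print (and optionally save) crawled URLs")
    parser.add_option("-p", "--out-path", dest="out_path", default=None,
                      help="directory in which to save fetched pages")
    parser.add_option("-L", "--out-links", dest="out_links", action="store_true",
                      default=False, help="print the remembered links")
    parser.add_option("-D", "--out-dot", dest="out_dot", action="store_true",
                      default=False, help="emit the link graph in DOT format")
    opts, args = parser.parse_args()
    if not args:
        parser.error("a start <url> is required")
    return opts, args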
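# toSeoFriendly(url_crawl, 50) above is assumed to turn a URL into a
# filesystem-safe, length-capped slug used as the saved page's filename.
# This is a minimal sketch of that assumed behaviour, not the project's
# actual implementation.
import re

def toSeoFriendly(url, limit):
    # Keep letters and digits, collapse everything else into single dashes,
    # then cap the slug at 'limit' characters.
    slug = re.sub(r'[^A-Za-z0-9]+', '-', url).strip('-').lower()
    return slug[:limit]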