def crawl():
    crawler = Crawler()
    param_did = input("Enter the 'did' value from this user's cookie first: ")
    crawler.set_did(param_did)
    uid = input("Enter the user id to crawl this time: ")
    crawler.add_to_list(uid)
    crawler.crawl()
    input("Press Enter to exit......")
def crawl(param_did):
    # Non-interactive variant: the caller supplies the 'did' cookie value directly.
    crawler = Crawler(False)
    crawler.set_did(param_did)
    crawler.crawl()
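# The two crawl() helpers above assume a Crawler class with roughly the
# interface sketched here. This stub is a hypothetical illustration inferred
# from the calls above (the constructor flag name and the crawl body are
# guesses); the real class lives elsewhere in the project, and the Crawler
# used by main() below is a different class with its own constructor.
class Crawler(object):

    def __init__(self, interactive=True):
        # 'interactive' is an assumed name for the boolean passed as Crawler(False)
        self.interactive = interactive
        self.did = None
        self.uids = []

    def set_did(self, did):
        # Remember the 'did' cookie value used to authenticate requests
        self.did = did

    def add_to_list(self, uid):
        # Queue a user id to be crawled
        self.uids.append(uid)

    def crawl(self):
        # Fetch data for every queued uid (real fetching logic omitted here)
        for uid in self.uids:
            pass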
def main():
    opts, args = parse_options()
    url = args[0]

    if opts.links:
        getLinks(url)
        raise SystemExit, 0

    depth_limit = opts.depth_limit
    confine_prefix = opts.confine
    exclude = opts.exclude

    sTime = time.time()

    print >> sys.stderr, "Crawling %s (Max Depth: %d)" % (url, depth_limit)
    crawler = Crawler(url, depth_limit, confine_prefix, exclude)
    crawler.crawl()

    # create log directory
    if not os.path.exists(LOG_DIRECTORY):
        os.makedirs(LOG_DIRECTORY)

    num_links = 0
    if opts.out_urls:
        for url_crawl in crawler.urls_seen:
            parsed_uri = urlparse.urlparse(url_crawl)
            # only base url
            if not re.match(".*%s" % parsed_uri.netloc.replace('www.', ''), url):  # and not opts.skip_host:
                continue
            if not opts.out_path:
                print url_crawl
            else:
                domain = '{uri.netloc}'.format(uri=parsed_uri)
                log_file = "%s/%s.log" % (LOG_DIRECTORY, domain)
                logging.basicConfig(
                    filename=log_file,
                    filemode='w+',
                    level=logging.DEBUG,
                    format='%(asctime)-15s [%(levelname)s] (%(threadName)-10s) %(message)s',
                    datefmt='%m/%d/%Y %I:%M:%S %p')
                try:
                    directory = opts.out_path + domain + '/'
                    path = directory + toSeoFriendly(url_crawl, 50) + '.html'
                    if not os.path.exists(directory):
                        os.makedirs(directory)
                    r = requests.get(url_crawl, allow_redirects=True, timeout=30)
                    if not os.path.exists(path):
                        target = open(path, 'w')
                        target.write(r.text.encode('utf-8'))
                        target.close()
                        num_links = num_links + 1
                        logging.debug("Saving: {0}".format(url_crawl))
                except IOError as e:
                    logging.error("IOError: {0} {1}".format(url, e.message))
                except Exception as e:
                    logging.error("Error({0}): {1}".format(url, e.__doc__, e.message), exc_info=True)

    if opts.out_links:
        print "\n".join([str(l) for l in crawler.links_remembered])

    if opts.out_dot:
        d = DotWriter()
        d.asDot(crawler.links_remembered)

    eTime = time.time()
    tTime = eTime - sTime

    print >> sys.stderr, "Found: %d" % num_links
    print >> sys.stderr, "Stats: (%d/s after %0.2fs)" % (int(math.ceil(float(num_links) / tTime)), tTime)
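# main() reads opts.links, opts.depth_limit, opts.confine, opts.exclude,
# opts.out_urls, opts.out_path, opts.out_links and opts.out_dot from
# parse_options(). The sketch below shows one plausible optparse-based
# definition of that helper; the flag spellings, defaults and help texts
# are assumptions for illustration, not the project's actual CLI.
import optparse

def parse_options():
    parser = optparse.OptionParser(usage="usage: %prog [options] <url>")
    parser.add_option("-l", "--links", action="store_true", default=False,
                      help="only print the links found on <url> and exit")
    parser.add_option("-d", "--depth-limit", dest="depth_limit", type="int",
                      default=1, help="maximum crawl depth")
    parser.add_option("-c", "--confine", dest="confine", default=None,
                      help="confine the crawl to URLs with this prefix")
    parser.add_option("-x", "--exclude", dest="exclude", action="append",
                      default=[], help="exclude URLs matching this prefix")
    parser.add_option("-u", "--out-urls", dest="out_urls", action="store_true",
                      default=False, help="print (and optionally save) crawled URLs")
    parser.add_option("-p", "--out-path", dest="out_path", default=None,
                      help="directory in which to save fetched pages")
    parser.add_option("-L", "--out-links", dest="out_links", action="store_true",
                      default=False, help="print the remembered links")
    parser.add_option("-D", "--out-dot", dest="out_dot", action="store_true",
                      default=False, help="emit the link graph in DOT format")
    opts, args = parser.parse_args()
    if not args:
        parser.error("a start <url> is required")
    return opts, args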
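# toSeoFriendly(url_crawl, 50) above is assumed to turn a URL into a
# filesystem-safe, length-capped slug used as the saved page's filename.
# This is a minimal sketch of that assumed behaviour, not the project's
# actual implementation.
import re

def toSeoFriendly(url, limit):
    # Keep letters and digits, collapse everything else into single dashes,
    # then cap the slug at 'limit' characters.
    slug = re.sub(r'[^A-Za-z0-9]+', '-', url).strip('-').lower()
    return slug[:limit]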