def get_cookie_opener(cookiefile=None, is_accept_ending=False, is_keepalive=False, ext_handlers=None):
    """Build a urllib2-style opener backed by a Firefox-format cookie jar.

    Args:
        cookiefile: optional path to a cookies.txt file; when given, it is
            loaded into the jar with expired and discard-flagged cookies kept.
        is_accept_ending: forwarded unchanged to custom_dns_opener.
        is_keepalive: forwarded unchanged to custom_dns_opener.
        ext_handlers: optional list of extra handler objects to install;
            defaults to an empty list.

    Returns:
        The opener produced by custom_dns_opener() with the cookie jar wired in.
    """
    # Fix: the old signature used `ext_handlers=[]`, a mutable default shared
    # across calls — any handler appended by one caller leaked into the next.
    # A None sentinel converted to a fresh list preserves caller-visible
    # behavior while eliminating the shared state.
    if ext_handlers is None:
        ext_handlers = []
    cj = FirefoxCookieJar(policy=SessionCookiePolicy())
    if cookiefile:
        cj.load(cookiefile, ignore_expires=True, ignore_discard=True)
    return custom_dns_opener(
        cj,
        is_accept_ending=is_accept_ending,
        is_keepalive=is_keepalive,
        ext_handlers=ext_handlers,
    )
def get_cookie_opener(cookiefile=None, is_accept_ending=False, is_keepalive=False, ext_handlers=None):
    """Build a urllib2-style opener backed by a Firefox-format cookie jar.

    Args:
        cookiefile: optional path to a cookies.txt file; when given, it is
            loaded into the jar with expired and discard-flagged cookies kept.
        is_accept_ending: forwarded unchanged to custom_dns_opener.
        is_keepalive: forwarded unchanged to custom_dns_opener.
        ext_handlers: optional list of extra handler objects to install;
            defaults to an empty list.

    Returns:
        The opener produced by custom_dns_opener() with the cookie jar wired in.
    """
    # Fix: the old signature used `ext_handlers=[]`, a mutable default shared
    # across calls — any handler appended by one caller leaked into the next.
    # A None sentinel converted to a fresh list preserves caller-visible
    # behavior while eliminating the shared state.
    if ext_handlers is None:
        ext_handlers = []
    cj = FirefoxCookieJar(policy=SessionCookiePolicy())
    if cookiefile:
        cj.load(cookiefile, ignore_expires=True, ignore_discard=True)
    return custom_dns_opener(cj, is_accept_ending=is_accept_ending,
                             is_keepalive=is_keepalive, ext_handlers=ext_handlers)
    # Tail of a try/except whose `try:` (presumably around `import
    # multiprocessing`) lies before this view — TODO confirm against the
    # full file.
    has_multiprocessing = True
except:  # NOTE(review): bare except — should be `except ImportError:`; confirm intent before narrowing
    has_multiprocessing = False

logger = logging.getLogger('ProcessItemLogger')

# Command-line flags: an extra SQL suffix to filter items, the image
# directories to write into, and dry-run/force switches.
FLAGS = gflags.FLAGS
gflags.DEFINE_string('sql', "", "additional sql, e.g. where a=b and c=d")
gflags.DEFINE_string('path', "/space/wwwroot/image.guang.j.cn/ROOT/images/", "image path")
gflags.DEFINE_string('org_path', "/space/wwwroot/image.guang.j.cn/ROOT/org_images/", "org image path")
gflags.DEFINE_string('crawl_path', "/tmp", "image path")
gflags.DEFINE_boolean('dryrun', False, "not run command")
gflags.DEFINE_boolean('force', False, "skip check status")

# Default User-Agent for crawl requests (IE8 on Windows 7).
DEFAULT_UA="Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)"

# Install the project's DNS-customized opener as the process-wide urllib2 opener.
urllib2.install_opener(custom_dns_opener())

def process_all_items():
    """Select every item matching the base query (plus the FLAGS.sql suffix)
    and hand each row to process_item() along with progress counters."""
    db = get_db_engine()
    last_time = 0  # NOTE(review): not used in the visible body
    # FLAGS.sql is appended verbatim; it is operator-supplied, not untrusted input.
    sql = "select id,shop_id,local_pic_url,pic_url,manual_set,manual_updated_columns,status,num_id,pic_height,pic_width from item " + FLAGS.sql
    items = db.execute(sql)
    i = 0
    for item in items:
        i += 1
        process_item(item, items.rowcount, i)

def save_image(image_filename, data):
    # Ensure the target directory exists before writing (skipped on --dryrun).
    # NOTE(review): this definition appears truncated at the edge of this
    # chunk — the actual write of `data` presumably follows; confirm in the
    # full file.
    if not os.path.exists(os.path.dirname(image_filename)) and not FLAGS.dryrun:
        make_dirs_for_file(image_filename)
    # Tail of a try/except whose `try:` (presumably around `import
    # multiprocessing`) lies before this view — TODO confirm against the
    # full file.
    has_multiprocessing = True
except:  # NOTE(review): bare except — should be `except ImportError:`; confirm intent before narrowing
    has_multiprocessing = False

logger = logging.getLogger('MeiliCrawlLogger')

# Command-line flags: a single item id, an id "group" window
# (group*1000000 .. (group+1)*1000000), an explicit start/end id range,
# an inter-item sleep interval, and whether to commit to the database.
FLAGS = gflags.FLAGS
gflags.DEFINE_integer('itemid', 0, "start crawl id")
gflags.DEFINE_integer('group', 0, "define group*1000000 -> (group+1)*1000000")
gflags.DEFINE_integer('start', 2217, "start crawl id")
gflags.DEFINE_integer('end', 110538380, "end crawl id")
gflags.DEFINE_integer('interval', 0, "crawl interval between items")
gflags.DEFINE_boolean('commit', True, "is commit data into database?")

# Default User-Agent for crawl requests (IE8 on Windows 7).
DEFAULT_UA = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)"

# Install the project's DNS-customized opener as the process-wide urllib2 opener.
urllib2.install_opener(custom_dns_opener())

# Reference DDL for the table this crawler writes into (kept as an inline
# string for documentation; not executed here).
"""
CREATE TABLE `crawl_html` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`item_id` int(11) unsigned NOT NULL,
`html` longtext,
`last_modified` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (`id`),
UNIQUE KEY `item_id` (`item_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8
"""

# Shared request headers for meilishuo.com fetches.
headers = {'Referer': "http://www.meilishuo.com", 'User-Agent': DEFAULT_UA}

# NOTE(review): crawl_all's body lies beyond this chunk — definition is
# truncated here.
def crawl_all():