Example #1
def get_cookie_opener(cookiefile=None, is_accept_ending=False, is_keepalive=False, ext_handlers=[]):
    # Build a urllib2-style opener that carries cookies in a Firefox-format cookie jar.
    cj = FirefoxCookieJar(policy=SessionCookiePolicy())
    if cookiefile:
        # Reload previously saved cookies, keeping expired and session cookies as well.
        cj.load(cookiefile, ignore_expires=True, ignore_discard=True)
    return custom_dns_opener(
        cj, is_accept_ending=is_accept_ending, is_keepalive=is_keepalive, ext_handlers=ext_handlers
    )
Example #2
def get_cookie_opener(cookiefile=None,
                      is_accept_ending=False,
                      is_keepalive=False,
                      ext_handlers=[]):
    cj = FirefoxCookieJar(policy=SessionCookiePolicy())
    if cookiefile:
        cj.load(cookiefile, ignore_expires=True, ignore_discard=True)
    return custom_dns_opener(cj,
                             is_accept_ending=is_accept_ending,
                             is_keepalive=is_keepalive,
                             ext_handlers=ext_handlers)
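A minimal usage sketch for the opener defined above; the cookie file path and target URL are hypothetical, and custom_dns_opener is assumed to return a standard urllib2-style opener:

# Hypothetical usage: load cookies exported from Firefox and fetch a page with them.
opener = get_cookie_opener(cookiefile="/tmp/cookies.txt", is_keepalive=True)
response = opener.open("http://www.example.com/")  # standard urllib2 opener interface
html = response.read()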
Example #3
try:
    # Assumed reconstruction: the try header is truncated in the original snippet,
    # but the flag below implies an optional multiprocessing import.
    import multiprocessing
    has_multiprocessing = True
except ImportError:
    has_multiprocessing = False

logger = logging.getLogger('ProcessItemLogger')

FLAGS = gflags.FLAGS
gflags.DEFINE_string('sql', "", "additional sql, e.g. where a=b and c=d")
gflags.DEFINE_string('path', "/space/wwwroot/image.guang.j.cn/ROOT/images/", "image path")
gflags.DEFINE_string('org_path', "/space/wwwroot/image.guang.j.cn/ROOT/org_images/", "org image path")
gflags.DEFINE_string('crawl_path', "/tmp", "image path")
gflags.DEFINE_boolean('dryrun', False, "not run command")
gflags.DEFINE_boolean('force', False, "skip check status")

DEFAULT_UA = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)"
urllib2.install_opener(custom_dns_opener())

def process_all_items():
    db = get_db_engine()

    last_time = 0
    sql = "select id,shop_id,local_pic_url,pic_url,manual_set,manual_updated_columns,status,num_id,pic_height,pic_width from item " + FLAGS.sql
    items = db.execute(sql)
    i = 0
    for item in items:
        i += 1
        process_item(item, items.rowcount, i)

def save_image(image_filename, data):
    if not os.path.exists(os.path.dirname(image_filename)) and not FLAGS.dryrun:
        make_dirs_for_file(image_filename)
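The snippet above only defines its gflags flags; below is a minimal sketch, assuming the standard python-gflags entry-point pattern, of how such a script would parse them before calling process_all_items (the __main__ guard is not part of the original snippet):

import sys

if __name__ == "__main__":
    try:
        # Parse command-line flags; FLAGS(...) returns the remaining positional args.
        argv = FLAGS(sys.argv)
    except gflags.FlagsError as e:
        print "%s\nUsage: %s [flags]\n%s" % (e, sys.argv[0], FLAGS)
        sys.exit(1)
    process_all_items()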
Example #4
try:
    import multiprocessing  # assumed: the try header is truncated in the original snippet
    has_multiprocessing = True
except ImportError:
    has_multiprocessing = False

logger = logging.getLogger('MeiliCrawlLogger')

FLAGS = gflags.FLAGS
gflags.DEFINE_integer('itemid', 0, "start crawl id")
gflags.DEFINE_integer('group', 0, "define group*1000000 -> (group+1)*1000000")
gflags.DEFINE_integer('start', 2217, "start crawl id")
gflags.DEFINE_integer('end', 110538380, "end crawl id")
gflags.DEFINE_integer('interval', 0, "crawl interval between items")
gflags.DEFINE_boolean('commit', True, "is commit data into database?")

DEFAULT_UA = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)"
urllib2.install_opener(custom_dns_opener())
"""
CREATE TABLE `crawl_html` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `item_id` int(11) unsigned NOT NULL,
  `html` longtext,
  `last_modified` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
  PRIMARY KEY (`id`),
  UNIQUE KEY `item_id` (`item_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8
"""

headers = {'Referer': "http://www.meilishuo.com", 'User-Agent': DEFAULT_UA}
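
# Illustrative sketch (not part of the original snippet): persisting a crawled page
# into the crawl_html table documented above. The UNIQUE KEY on item_id makes
# INSERT ... ON DUPLICATE KEY UPDATE a natural upsert; get_db_engine() is the same
# assumed helper used in Example #3, and item_id/html here are hypothetical inputs.
def save_crawl_html(item_id, html):
    db = get_db_engine()
    db.execute(
        "insert into crawl_html (item_id, html) values (%s, %s) "
        "on duplicate key update html=values(html), last_modified=now()",
        item_id, html)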


def crawl_all():