def login(self):
    """Log in to dongao with configured credentials and save the cookies.

    Posts the login form (``dest`` = phone number, ``password`` = an
    md5-hashed password, ``autoLogin`` = 0) and dumps the resulting
    session cookies to local storage.
    """
    credentials = {
        'dest': cfg.get('dongao.dest'),
        'password': cfg.get('dongao.password'),
        'autoLogin': 0,
    }
    self.sess.post(M['login'], data=credentials, headers=self.post_headers)
    self.dump_cookies(self.sess.cookies)
def crawl_all_page(self):
    """Crawl every index page and persist the results.

    - optionally discover the total page count from the pagination widget
    - crawl pages one by one, resuming from the persisted ``xici.last_index``
    - record each finished page index back to config so a restart resumes
    """
    def get_total_num():
        """Scrape the max page number from page 1's pagination bar."""
        url = M['index'].format(1)
        raw = self.bs4markup(self.do_get(url))
        pagi = raw.find('div', 'pagination')
        # BUG FIX: the second-to-last <a> holds the max page number; the
        # original returned the Tag object itself, which cannot be
        # compared against an int page index.
        return int(pagi.find_all('a')[-2].text)

    def crawl_each(idx):
        return self.get_page(idx)

    def update_done_num(idx):
        """Persist the finished page index so crawling can resume."""
        abc.update_cfg('xici.last_index', idx)

    # TODO(review): total is hard-coded; switch back to get_total_num()
    # once the site's pagination markup is confirmed stable.
    total = 2564
    st = 0
    each_size = 100
    while True:
        start_str = 'START@{}'.format(st * each_size)
        for i in tqdm(range(st * each_size, (st + 1) * each_size),
                      ascii=True, desc=start_str):
            # skip pages already recorded as done
            if i <= int(cfg.get('xici.last_index', 0)):
                continue
            if i > total:
                return
            dat = crawl_each(i)
            if dat:
                self.save_to_db(dat)
            update_done_num(i)
            # randomized delay to avoid being rate limited / banned
            time.sleep(abc.randint(6, 14))
        st += 1
def get_free_pages(self):
    """Walk the free-proxy index pages and save each page's rows.

    Resumes from the persisted ``kdl.last_index`` and sleeps a random
    interval between requests to avoid hammering the site.
    """
    for page_no in tqdm(range(1, 1964), ascii=True):
        # skip pages already recorded as done
        if page_no <= int(cfg.get('kdl.last_index', 0)):
            continue
        log.debug('START@({})'.format(page_no))
        rows = self.get_page(M['free_index'], page_no)
        if rows:
            self.save_to_db(rows, 'kdl.free.proxies')
        abc.update_cfg('kdl.last_index', page_no)
        time.sleep(abc.randint(5, 10))
def run(pic_idx, offset, clear, max_proc, update, which, total, base_dir, newest):
    """CLI entry: dispatch mzt crawler actions based on supplied options.

    Mutually exclusive modes, checked in order: fetch newest only,
    download specific indexes, update local cache, clear empty images,
    or (default) batch-download a slice of all cached indexes with a
    process pool.
    """
    can_clear = False
    if base_dir:
        can_clear = True
        abc.update_cfg('mzt.base_dir', base_dir)
    else:
        base_dir = cfg.get('mzt.base_dir')

    mz = Mz(base_dir=base_dir)

    if newest:
        mz.only_fetch_latest()
        return
    if which:
        log.debug('download <{}>'.format(which))
        for w in which.split(','):
            mz.download_by_index(w)
        return
    if update:
        log.debug('update local cache')
        mz.get_all()
        return
    if clear:
        if not can_clear:
            log.error('you must give base_dir to clear 0-byte images.')
            return
        log.debug('do clear empty image files.')
        # NOTE(review): do_clear=False means this only reports and never
        # deletes — confirm whether that is intended.
        mz.clear_empty_img(base_dir, do_clear=False)
        return

    mz.use_cache_indexes()
    start_index = 0
    if pic_idx:
        start_index = mz.all_indexes.index(pic_idx)
    if offset:
        start_index = int(offset)
    # BUG FIX: the previous default of -1 sliced off the LAST index
    # (xs[start:-1] excludes the final element); None goes to the end.
    end_index = None
    if total:
        end_index = start_index + total
    all_idx = mz.all_indexes[start_index:end_index]
    _m = max_proc if max_proc else cpu_count()
    pool = Pool(processes=_m)
    pool.map(mz.download_by_index, all_idx)
def update_albums(self, uid='1915268965'):
    """Fetch all photo albums for ``uid`` and upsert them into the db.

    The album count is small and album contents keep changing, so every
    run re-writes all album records.

    :param uid: user id, or a dict containing an ``uid`` key
    :type uid: str/dict
    :return:
    :rtype:
    """
    if isinstance(uid, dict):
        uid = uid['uid']
    page_count = cfg.get('weibo.album_page_count', 20)
    params = {
        'uid': uid,
        'count': page_count,
    }

    progress = {
        'index': 0,    # last page fetched (incremented before each request)
        'updated': 0,  # number of album records written so far
    }

    def fetch_album(page):
        """Fetch one page of albums; return truthy while more remain."""
        page['index'] += 1
        params['page'] = page['index']
        raw = self.sess.get(M['album'], params=params, headers=self.get_headers)
        albums_dat = raw.json()
        albums = albums_dat['data']
        _total_albums = albums['total']
        page['updated'] += len(albums['album_list'])
        for album_ in albums['album_list']:
            wb_mg_doc.album_update(album_)
        # BUG FIX: the original compared the album list itself against the
        # integer total (always False), so the while loop below never
        # iterated and only the first page was ever fetched.  Continue
        # while fewer records than the reported total have been written
        # and the server is still returning rows.
        return page['updated'] < _total_albums and albums['album_list']

    log.debug('Try Update ({})'.format(uid))
    while fetch_album(progress):
        # randomized delay between pages to avoid being rate limited
        time.sleep(abc.randint(5, 9))
    log.debug('Success update ({}) albums info'.format(progress['updated']))
import os
import sys

# make the project root importable when run as a script
app_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(app_root)

if sys.version_info[0] < 3:
    # Python 2 only: force utf-8 as the default string encoding
    reload(sys)
    sys.setdefaultencoding('utf-8')

from mongoengine import *
from base.abc import cfg

# MongoDB connection settings, read from config with local defaults.
mg_cfg = {
    'host': cfg.get('mg.host', 'localhost'),
    'port': cfg.get('mg.port', 27027),
    'db': cfg.get('mg.db', 'luoo'),
    'alias': cfg.get('mg.alias', 'luoo_rw'),
    'username': cfg.get('mg.username', ''),
    'password': cfg.get('mg.password', ''),
}

# register the mongoengine connection (under the configured alias)
connect(**mg_cfg)


class Volumes(Document):
    """Volume (journal issue) music details."""
    # index number of the volume (issue number)
    vol_index = IntField()
def run(tag_group, page_idx, tag_idx, name, tag2name, from_tag_idx,
        from_page_idx, total, clear, update, base_dir):
    """CLI entry: dispatch d4 crawler actions based on supplied options.

    Mutually exclusive modes, checked in order: update cache, clear
    empty images, single tag, page/tag position lookups, single page,
    tag->record lookup, download by name, or (default) batch-download
    tags resuming from the persisted ``d4.tag_index``.
    """
    global dir_pre
    if base_dir:
        abc.update_cfg('d4.base_dir', base_dir)
    else:
        base_dir = cfg.get('d4.base_dir')

    # initialize the storage directory
    d4 = D4(base_dir)
    d4.load_cache()
    tags = d4.all_tags[tag_group]

    if update:
        log.debug('update local cache')
        d4.update_tags()
        return
    if clear:
        log.debug('do clear empty image files.')
        # NOTE(review): do_clear=False only reports; nothing is deleted.
        d4.clear_empty_img(base_dir, do_clear=False)
        return
    if tag_idx:
        d4.download_by_tag(tag_idx, down=False)
        return
    if from_page_idx:
        idx = d4.get_tag_position(from_page_idx, tag_group, False)
        print(idx)
        return
    if from_tag_idx:
        idx = d4.get_tag_position(from_tag_idx, tag_group)
        print(idx)
        return
    if page_idx:
        d4.download_only_from_index(page_idx)
        return

    current = cfg.get('d4.tag_index', 0)
    # BUG FIX: the previous default of -1 sliced off the LAST tag
    # (tags[current:-1] excludes the final element); None goes to the end.
    max_t = None
    if total:
        max_t = current + total

    if tag2name:
        for t in tags:
            if t.get('src') == tag2name:
                print(t)
                # NOTE(review): hard process exit on a match; when nothing
                # matches, control falls through to the batch loop below —
                # confirm that is intended.
                os._exit(-1)
    if name:
        dir_pre = name
        for t in tags:
            if t.get('name') == name:
                d4.download_by_name(t)
                break
        return

    # batch mode: download tags starting at the persisted index
    for tag in tags[current:max_t]:
        log.debug('Try:{} @ {}'.format(current, tag['src']))
        d4.download_by_tag(tag['src'])
        log.info('DONE@{}'.format(tag['src']))
        current += 1
        abc.update_cfg('d4.tag_index', current)
        time.sleep(1)
# NOTE(review): setdefaultencoding requires a preceding ``reload(sys)``
# under Python 2 and does not exist at all on Python 3 — presumably a
# guard exists earlier in this file; confirm.
sys.setdefaultencoding('utf-8')

from mongoengine import connect
import pymongo
from pymongo import MongoClient
from pymongo import errors as pyerrors
import redis
from izen import rds

from base.abc import cfg

# re-export rds (redis helper)

# MongoDB connection settings, read from config with local defaults.
mg_cfg = {
    'host': cfg.get('mg.host', 'localhost'),
    'port': cfg.get('mg.port', 27027),
    'db': cfg.get('mg.db', 'luoo'),
    'alias': cfg.get('mg.alias', 'luoo_rw'),
    'username': cfg.get('mg.username', ''),
    'password': cfg.get('mg.password', ''),
}

# register the mongoengine connection (under the configured alias)
connect(**mg_cfg)

# raw pymongo handles for operations mongoengine does not cover
MG_CONN = MongoClient('mongodb://{}:{}/{}'.format(
    cfg.get('mg.host', 'localhost'),
    cfg.get('mg.port', 27027),
    cfg.get('mg.db', 'luoo'),
))
LUOO_DB = MG_CONN[cfg.get('mg.db', 'luoo')]
def update_photos(self, album_info, init_photos=False):
    """Fetch every photo record of one album and write them to the db.

    - in init mode, batch-write everything, ignoring duplicate errors.
    - otherwise (update mode), only write records whose timestamp is
      newer than the latest one already stored.

    :param album_info: album descriptor (``uid``, ``album_id``, ``count``, ...)
    :type album_info: dict
    :param init_photos: True for a first-time full import
    :type init_photos:
    :return:
    :rtype:
    """
    page_count = cfg.get('weibo.photo_page_count', 32)
    params = {
        'uid': album_info['uid'],
        'album_id': album_info['album_id'],
        'type': album_info.get('album_type', 3),
        'count': page_count,
        'page': 1,
    }
    # If an update run exits abnormally it restarts from the last valid
    # record. TODO: persist crawl progress to the database instead.
    # BUG FIX: ceiling division — floor division dropped the final
    # partial page (e.g. 100 photos at 32/page needs 4 pages, not 3).
    _max_page = -(-album_info['count']['photos'] // page_count)
    # latest stored timestamp; 0 fetches everything
    latest_ts = 0
    if not init_photos:
        # update mode: read the newest stored record's timestamp
        last_doc = wb_mg_doc.WeiboPhotos.objects(
            __raw__={
                'album_id': album_info['album_id']
            }).first()
        if last_doc:
            latest_ts = last_doc.timestamp

    while True:
        suc = 'DONE'
        raw = self.sess.get(M['photo'], params=params, headers=self.get_headers)
        log.debug('try crawl: {}'.format(raw.url))
        photos = raw.json()
        raw_p_list = photos['data']['photo_list']
        # in update mode, drop records at or before the stored timestamp
        if init_photos:
            p_list = raw_p_list
        else:
            p_list = [x for x in raw_p_list if x['timestamp'] > latest_ts]
        if not p_list:
            log.debug('All records has Updated already!!!')
            return

        aff_row = dbstore.batch_write(p_list, 'weibo.photo.details')
        if not init_photos:
            # filtered page shorter than the raw page => reached records
            # we already have, so everything newer has been written
            if len(p_list) != len(raw_p_list):
                log.info('All Photos records updated!!!')
                return
            # fewer rows written than offered => the rest already existed
            if len(p_list) != aff_row:
                log.info('DONE@ ({}/{})'.format(len(p_list), aff_row))
                return
        if not aff_row:
            suc = 'SKIP'
        # stop once the final page has been fetched
        if params['page'] >= _max_page:
            return
        # randomized delay to avoid being banned for crawling too fast
        _ri = abc.randint(2, 7)
        log.debug('{}: ({}) {}/{}, and sleep {}s'.format(
            suc, len(p_list), params['page'], _max_page, _ri))
        params['page'] += 1
        for _ in tqdm(range(_ri), ascii=True, desc='sleep {}s'.format(_ri)):
            time.sleep(1)
def login(self, username='', password=''):
    """Perform the multi-step weibo SSO login and persist session cookies.

    Flow: POST the login form, follow two ``location.replace`` redirects
    parsed out of the responses, extract the unique user id, load the
    profile page, dump the embedded $CONFIG block to ``personal.txt``,
    and finally store the nickname and cookies.

    :param username: account name passed through to ``pre_login``
    :param password: account password passed through to ``pre_login``
    """
    # NOTE(review): 'sp', 'servertime', 'nonce' and 'rsakv' below look like
    # stale captured values — presumably pre_login() returns fresh ones that
    # overwrite them via dict(form, **dat); confirm.
    form = {
        'entry': 'weibo',
        'gateway': '1',
        'from': '',
        'savestate': '7',
        'qrcode_flag': False,
        'useticket': '1',
        'pagerefer': 'https://login.sina.com.cn/crossdomain2.php?action=logout&r=https%3A%2F%2Fweibo.com%2Flogout.php%3Fbackurl%3D%252F',
        'vsnf': 1,
        'service': 'miniblog',
        'pwencode': 'rsa2',
        'sr': '1280*800',
        'encoding': 'UTF-8',
        'prelt': '41',
        'url': 'https://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
        'returntype': 'META',
        'sp': '3d841d27085a2fac6f5218f18f4ce5caf3cb020c49bc109608dc106f3e14cf8354e41ad03444672f492b490f81f155fff9364f2dff86677429e5b745fbe4ccffadd5458a0a6f29a5d769c63801753b49b3eaf7b52489da04a79f0724b9842c9aac7f546d2eb037d44484bd0ad3c8ac35ba9136a1eceaaa59587168881dc06f3b',
        'servertime': '1512958714',
        'nonce': 'ZIVLK5',
        'rsakv': '1330428213',
    }
    dat = self.pre_login(username=username, password=password)
    form = dict(form, **dat)
    log.debug('STEP1: get {}'.format(M['login']))
    res = self.sess.post(M['login'], data=form, headers=self.post_headers)
    # parse the redirect url out of the login.php response body
    pa = r'location\.replace\([\'"](.*?)[\'"]\)'
    loop_url = re.findall(pa, res.content.decode('GBK'))[0]
    log.debug('STEP2: get {}'.format(loop_url))
    # fetch the response of the first redirect url
    res = self.sess.get(loop_url)
    # the response has two parts; the first (setCrossDomainUrlList) hits a
    # "302 Moved Temporarily" error, so it is skipped.  Only the second
    # part is taken, parsed the same way as the login.php response.
    final_url = re.findall(pa, res.content.decode('GBK'))[0]
    log.debug('STEP3: get {}'.format(final_url))
    res = self.sess.get(final_url)
    uuid_pa = r'"uniqueid":"(.*?)"'
    uuid_res = re.findall(uuid_pa, res.text, re.S)[0]
    log.debug('STEP4:user_id: {}'.format(uuid_res))
    url = M['profile'].format(uuid_res)
    raw = self.sess.get(url)

    def get_config(raw_mark):
        # slice out the $CONFIG block embedded in the profile page HTML
        _START = '<!-- $CONFIG -->'
        _END = '<!-- / $CONFIG -->'
        return raw_mark.split(_START)[1].split(_END)[0]

    user_config = get_config(raw.text)
    user_config = self.dump_person_config(user_config)
    helper.write_file(json.dumps(user_config), 'personal.txt')
    raw = self.bs4markup(raw.text)
    log.debug('STEP5: title : {}'.format(raw.find('title').text))
    # the page title is "<nickname>的微博_微博"; strip the suffix to get the nickname
    abc.update_cfg('weibo.nickname', raw.find('title').text.replace('的微博_微博', ''))
    log.info('[LOGIN:SUCCESS] {}({})'.format(cfg.get('weibo.nickname'), username))
    self.dump_cookies(self.sess.cookies)
def run(
        search,
        login,
        name,
        big_head,
        img_cache_dir,
        skip_cache,
        img_height,
        log_level,
        test,
        init,
        update_personal_info,
):
    """Weibo CLI dispatcher: login, cookie check, user lookup, photo sync."""
    def _opt(value, key, default):
        # a CLI-supplied value wins; otherwise fall back to config
        return value if value else cfg.get(key, default)

    img_height = _opt(img_height, 'weibo.img_height', 3)
    log_level = _opt(log_level, 'weibo.log_level', 1)
    img_cache_dir = _opt(img_cache_dir, 'weibo.img_cache_dir', '/tmp/weibo')
    big_head = _opt(big_head, 'weibo.big_head', False)
    skip_cache = _opt(skip_cache, 'weibo.skip_cache', False)

    wb = Wb(big_head=big_head,
            img_cache_dir=img_cache_dir,
            img_height=img_height,
            use_cache=not skip_cache)
    chosen = {}
    logzero.loglevel(log_level * 10)

    if login:
        user = click.prompt('username', type=str)
        pwd = click.prompt('password', type=str, hide_input=True)
        wb.login(user, pwd)
        abc.force_quit()

    # restore cookies saved by a previous login
    wb.sess.cookies = wb.load_cookies()
    wb.who_am_i()
    if not wb.sess.cookies:
        log.warn('no cookie found!, login first!!!')
        abc.force_quit()

    if test:
        # only verify whether the stored cookie still works, then exit
        wb.is_cookie_ok()
        abc.force_quit()

    if search:
        # look up account details remotely by account name
        chosen = wb.click_fn_search(
            'do_search_user',
            search,
        )
    if name:
        # look up account details in the local database
        chosen = wb.click_fn_search(
            'load_user_info',
            name,
        )
    if not name and not search:
        # no explicit target: operate on the logged-in account itself
        if update_personal_info:
            wb.update_personal_info()
        chosen = wb.click_fn_search(
            'load_user_info',
            wb.personal_info.get('nick'),
            auto_select=True,
        )

    if chosen:
        # run the interactive per-user actions
        wb.click_fn_user_actions(chosen)
        # sync the user's photos into the local database
        wb.click_fn_update_photos(chosen, init)