Example #1
File: dac.py Project: wliustc/crawlers
 def login(self):
     """
     dest	'phone'
     password	'md5(pwd)'
     autoLogin	0
     validateCode
     :return:
     :rtype:
     """
     form = {
         'dest': cfg.get('dongao.dest'),
         'password': cfg.get('dongao.password'),
         'autoLogin': 0,
     }
     self.sess.post(M['login'], data=form, headers=self.post_headers)
     self.dump_cookies(self.sess.cookies)
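The docstring above notes that the password field carries md5(pwd) rather than the plain password. A minimal sketch of how that value could be produced before being stored under the dongao.password key (the helper below is illustrative, not part of the project):

import hashlib

def md5_password(plain_pwd):
    """Hash the plain-text password the way the login form expects (assumed)."""
    return hashlib.md5(plain_pwd.encode('utf-8')).hexdigest()

# e.g. persist it so cfg.get('dongao.password') later returns the hashed value
# abc.update_cfg('dongao.password', md5_password('my-secret'))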
Example #2
File: xici.py Project: wliustc/crawlers
    def crawl_all_page(self):
        """
            - 获取总页数
            - 启动多个进程爬取
            - 写入已完成的页数到 db
        :return:
        """
        def get_total_num():
            url = M['index'].format(1)
            raw = self.bs4markup(self.do_get(url))
            pagi = raw.find('div', 'pagination')
            # the second-to-last pagination link holds the total page count
            return int(pagi.find_all('a')[-2].text)

        def crawl_each(idx):
            return self.get_page(idx)
            # print('CRAWL: {}'.format(idx))
            # return []

        def update_done_num(idx):
            """
                更新已完成数字到 ``redis`` 中

            :param idx:
            :return:
            """
            abc.update_cfg('xici.last_index', idx)
            # print('DONE: {}'.format(idx))
            # return idx

        # total = get_total_num()
        total = 2564
        st = 0
        each_size = 100

        # dat = []
        while True:
            start_str = 'START@{}'.format(st * each_size)
            for i in tqdm(range(st * each_size, (st + 1) * each_size),
                          ascii=True,
                          desc=start_str):
                if i <= int(cfg.get('xici.last_index', 0)):
                    continue

                if i > total:
                    return

                _ = crawl_each(i)
                if _:
                    self.save_to_db(_)
                    update_done_num(i)
                    a = abc.randint(6, 14)
                    time.sleep(a)
            st += 1
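The loop above resumes from cfg.get('xici.last_index', 0) and checkpoints progress through abc.update_cfg(); per the inner docstring the counter lives in redis. A rough sketch of such a checkpoint pair, assuming a plain redis backend (the real helpers come from the project's base.abc / izen modules and are not shown here):

import redis

r = redis.Redis(host='localhost', port=6379, db=0)

def get_last_index(key='xici.last_index'):
    # fall back to 0 when the key has never been written
    val = r.get(key)
    return int(val) if val is not None else 0

def update_last_index(idx, key='xici.last_index'):
    r.set(key, idx)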
Example #3
File: kdl.py Project: wliustc/crawlers
    def get_free_pages(self):
        for i in tqdm(range(1963), ascii=True):
            i += 1
            if i <= int(cfg.get('kdl.last_index', 0)):
                continue
            log.debug('START@({})'.format(i))
            dat = self.get_page(M['free_index'], i)
            if dat:
                self.save_to_db(dat, 'kdl.free.proxies')
                abc.update_cfg('kdl.last_index', i)

            time.sleep(abc.randint(5, 10))
Example #4
File: mzt.py Project: wliustc/crawlers
def run(pic_idx, offset, clear, max_proc, update, which, total, base_dir, newest):
    can_clear = False

    if base_dir:
        can_clear = True
        abc.update_cfg('mzt.base_dir', base_dir)
    else:
        base_dir = cfg.get('mzt.base_dir')

    mz = Mz(base_dir=base_dir)
    if newest:
        mz.only_fetch_latest()
        return

    if which:
        log.debug('download <{}>'.format(which))
        for w in which.split(','):
            mz.download_by_index(w)
        return

    if update:
        log.debug('update local cache')
        mz.get_all()
        return

    if clear:
        if not can_clear:
            log.error('you must give base_dir to clear 0-byte images.')
            return
        log.debug('do clear empty image files.')
        mz.clear_empty_img(base_dir, do_clear=False)
        return

    mz.use_cache_indexes()

    start_index = 0
    if pic_idx:
        start_index = mz.all_indexes.index(pic_idx)
    if offset:
        start_index = int(offset)

    # None means "slice through to the last index"; -1 would silently drop the last entry
    end_index = None
    if total:
        end_index = start_index + total
    all_idx = mz.all_indexes[start_index:end_index]

    _m = max_proc if max_proc else cpu_count()
    pool = Pool(processes=_m)
    pool.map(mz.download_by_index, all_idx)
Example #5
    def update_albums(self, uid='1915268965'):
        """
            依据 ``uid`` 来获取该用户的所有照片, 并同步写入数据库中.
            专辑数量较小, 且专辑内图片数量一直更新, 故需要采取每次更新的方式写入

        :param uid:
        :type uid: str/dict
        :return:
        :rtype:
        """
        if isinstance(uid, dict):
            uid = uid['uid']

        page_count = cfg.get('weibo.album_page_count', 20)
        params = {
            'uid': uid,
            'count': page_count,
        }

        def fetch_album(page):
            page['index'] += 1
            params['page'] = page['index']
            raw = self.sess.get(M['album'],
                                params=params,
                                headers=self.get_headers)

            albums_dat = raw.json()
            albums = albums_dat['data']
            _total_albums = albums['total']

            page['updated'] += len(albums['album_list'])

            for album_ in albums['album_list']:
                wb_mg_doc.album_update(album_)

            # keep paging while fewer albums have been fetched than the reported total
            return page['updated'] < _total_albums

        page = {
            'index': 0,
            'updated': 0,
        }
        log.debug('Try Update ({})'.format(uid))
        # fetch_album is already invoked by the loop condition, so the body only sleeps
        while fetch_album(page):
            time.sleep(abc.randint(5, 9))

        log.debug('Success update ({}) albums info'.format(page['updated']))
Example #6
import os
import sys

app_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(app_root)
if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding('utf-8')

from mongoengine import *

from base.abc import cfg

mg_cfg = {
    'host': cfg.get('mg.host', 'localhost'),
    'port': cfg.get('mg.port', 27027),
    'db': cfg.get('mg.db', 'luoo'),
    'alias': cfg.get('mg.alias', 'luoo_rw'),
    'username': cfg.get('mg.username', ''),
    'password': cfg.get('mg.password', ''),
}

connect(**mg_cfg)


class Volumes(Document):
    """期刊音乐详情

    """
    vol_index = IntField()
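The model above is truncated after its first field. A hedged usage sketch of querying it with the mongoengine API; note that mongoengine resolves the 'default' alias unless the Document declares meta = {'db_alias': ...}, so with the luoo_rw alias registered above the model would likely need that meta entry (the field value below is illustrative):

# assumes Volumes declares meta = {'db_alias': cfg.get('mg.alias', 'luoo_rw')}
vol = Volumes.objects(vol_index=100).first()
if vol is None:
    print('volume 100 not found')
else:
    print(vol.vol_index)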
Example #7
def run(tag_group, page_idx, tag_idx, name, tag2name, from_tag_idx,
        from_page_idx, total, clear, update, base_dir):
    global dir_pre
    if base_dir:
        abc.update_cfg('d4.base_dir', base_dir)
    else:
        base_dir = cfg.get('d4.base_dir')

    # initialize the storage directory
    d4 = D4(base_dir)
    d4.load_cache()
    tags = d4.all_tags[tag_group]

    if update:
        log.debug('update local cache')
        d4.update_tags()
        return

    if clear:
        log.debug('do clear empty image files.')
        d4.clear_empty_img(base_dir, do_clear=False)
        return

    if tag_idx:
        d4.download_by_tag(tag_idx, down=False)
        return

    if from_page_idx:
        idx = d4.get_tag_position(from_page_idx, tag_group, False)
        print(idx)
        return

    if from_tag_idx:
        idx = d4.get_tag_position(from_tag_idx, tag_group)
        print(idx)
        return

    if page_idx:
        d4.download_only_from_index(page_idx)
        return

    current = cfg.get('d4.tag_index', 0)

    # None means "slice through to the last tag"; -1 would silently drop the last entry
    max_t = None
    if total:
        max_t = current + total

    if tag2name:
        for t in tags:
            # tidx = d4.gen_fd_by_tag(t['src'])
            if t.get('src') == tag2name:
                print(t)
                os._exit(-1)

    if name:
        dir_pre = name
        for t in tags:
            if t.get('name') == name:
                d4.download_by_name(t)
                break
        # log.error('no data by name({})'.format(name))
        return

    for tag in tags[current:max_t]:
        log.debug('Try:{} @ {}'.format(current, tag['src']))
        d4.download_by_tag(tag['src'])
        log.info('DONE@{}'.format(tag['src']))
        current += 1
        abc.update_cfg('d4.tag_index', current)
        time.sleep(1)
Example #8
    sys.setdefaultencoding('utf-8')

from mongoengine import connect
import pymongo
from pymongo import MongoClient
from pymongo import errors as pyerrors
import redis

from izen import rds

from base.abc import cfg

# expose the rds (redis) handle

mg_cfg = {
    'host': cfg.get('mg.host', 'localhost'),
    'port': cfg.get('mg.port', 27027),
    'db': cfg.get('mg.db', 'luoo'),
    'alias': cfg.get('mg.alias', 'luoo_rw'),
    'username': cfg.get('mg.username', ''),
    'password': cfg.get('mg.password', ''),
}
connect(**mg_cfg)

MG_CONN = MongoClient('mongodb://{}:{}/{}'.format(
    cfg.get('mg.host', 'localhost'),
    cfg.get('mg.port', 27027),
    cfg.get('mg.db', 'luoo'),
))
LUOO_DB = MG_CONN[cfg.get('mg.db', 'luoo')]
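Besides the mongoengine connection, this module keeps a raw pymongo handle (MG_CONN / LUOO_DB). A rough sketch of how such a handle might be used for bulk inserts that skip duplicate-key errors instead of aborting (the collection name and helper are illustrative, not the project's actual storage code):

def batch_insert(docs, col_name='demo.collection'):
    """Insert documents in bulk, counting how many were actually written."""
    if not docs:
        return 0
    try:
        res = LUOO_DB[col_name].insert_many(docs, ordered=False)
        return len(res.inserted_ids)
    except pyerrors.BulkWriteError as e:
        # with ordered=False, valid documents are still written; nInserted counts them
        return e.details.get('nInserted', 0)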
Example #9
    def update_photos(self, album_info, init_photos=False):
        """
            获取某个专辑下的所有照片信息, 并写入数据库中.

            - 如果是初始化, 则批量写入, 忽略错误.
            - 否则, 更新模式, 只更新大于查询最后一次记录的时间戳的数据

        :param album_info:
        :type album_info: dict
        :param init_photos: 是否初始化
        :type init_photos:
        :return:
        :rtype:
        """
        page_count = cfg.get('weibo.photo_page_count', 32)
        params = {
            'uid': album_info['uid'],
            'album_id': album_info['album_id'],
            'type': album_info.get('album_type', 3),
            'count': page_count,
            'page': 1,
        }
        # if updating the album photos aborts abnormally, the next run resumes from the last valid record
        # TODO: persist this progress to the database instead.
        _max_page = album_info['count']['photos'] // page_count

        # timestamp of the last update; 0 means fetch everything
        latest_ts = 0

        if not init_photos:
            # not in init mode, so try to load the last record from the database
            last_doc = wb_mg_doc.WeiboPhotos.objects(
                __raw__={
                    'album_id': album_info['album_id']
                }).first()
            if last_doc:
                latest_ts = last_doc.timestamp

        # start = 0
        while True:
            suc = 'DONE'
            # start += 1
            # params['page'] = start
            raw = self.sess.get(M['photo'],
                                params=params,
                                headers=self.get_headers)
            log.debug('try crawl: {}'.format(raw.url))

            photos = raw.json()
            raw_p_list = photos['data']['photo_list']

            # in update mode, drop records that were already downloaded, judged by timestamp
            if init_photos:
                p_list = raw_p_list
            else:
                p_list = [x for x in raw_p_list if x['timestamp'] > latest_ts]
            if not p_list:
                log.debug('All records have been updated already!!!')
                return

            aff_row = dbstore.batch_write(p_list, 'weibo.photo.details')

            # in update (non-init) mode,
            if not init_photos:
                # if the raw and filtered lists differ in length, the latest record has been reached.
                if len(p_list) != len(raw_p_list):
                    log.info('All photo records updated!!!')
                    return
                # likewise, if fewer records were written than requested, everything newer is already stored
                if len(p_list) != aff_row:
                    log.info('DONE@ ({}/{})'.format(len(p_list), aff_row))
                    return

            if not aff_row:
                suc = 'SKIP'

            # stop once the current page reaches the maximum page count
            if params['page'] >= _max_page:
                return

            # sleep between requests to avoid being blocked for crawling too fast
            _ri = abc.randint(2, 7)
            log.debug('{}: ({}) {}/{}, and sleep {}s'.format(
                suc, len(p_list), params['page'], _max_page, _ri))
            params['page'] += 1
            for _ in tqdm(range(_ri), ascii=True,
                          desc='sleep {}s'.format(_ri)):
                time.sleep(1)
Example #10
    def login(self, username='', password=''):
        form = {
            'entry': 'weibo',
            'gateway': '1',
            'from': '',
            'savestate': '7',
            'qrcode_flag': False,
            'useticket': '1',
            'pagerefer':
            'https://login.sina.com.cn/crossdomain2.php?action=logout&r=https%3A%2F%2Fweibo.com%2Flogout.php%3Fbackurl%3D%252F',
            'vsnf': 1,
            'service': 'miniblog',
            'pwencode': 'rsa2',
            'sr': '1280*800',
            'encoding': 'UTF-8',
            'prelt': '41',
            'url':
            'https://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
            'returntype': 'META',
            'sp':
            '3d841d27085a2fac6f5218f18f4ce5caf3cb020c49bc109608dc106f3e14cf8354e41ad03444672f492b490f81f155fff9364f2dff86677429e5b745fbe4ccffadd5458a0a6f29a5d769c63801753b49b3eaf7b52489da04a79f0724b9842c9aac7f546d2eb037d44484bd0ad3c8ac35ba9136a1eceaaa59587168881dc06f3b',
            'servertime': '1512958714',
            'nonce': 'ZIVLK5',
            'rsakv': '1330428213',
        }
        dat = self.pre_login(username=username, password=password)
        form = dict(form, **dat)

        log.debug('STEP1: get {}'.format(M['login']))
        res = self.sess.post(M['login'], data=form, headers=self.post_headers)

        # parse the redirect url out of the login.php response
        pa = r'location\.replace\([\'"](.*?)[\'"]\)'
        loop_url = re.findall(pa, res.content.decode('GBK'))[0]
        log.debug('STEP2: get {}'.format(loop_url))
        # fetch the response of the first redirect url
        res = self.sess.get(loop_url)
        # the response has two parts; the first, setCrossDomainUrlList, hits a 302 Moved Temporarily error and is skipped
        # only the second part is used, parsed the same way as the login.php response
        final_url = re.findall(pa, res.content.decode('GBK'))[0]
        log.debug('STEP3: get {}'.format(final_url))

        res = self.sess.get(final_url)
        uuid_pa = r'"uniqueid":"(.*?)"'
        uuid_res = re.findall(uuid_pa, res.text, re.S)[0]
        log.debug('STEP4:user_id: {}'.format(uuid_res))

        url = M['profile'].format(uuid_res)
        raw = self.sess.get(url)

        def get_config(raw_mark):
            _START = '<!-- $CONFIG -->'
            _END = '<!-- / $CONFIG -->'
            return raw_mark.split(_START)[1].split(_END)[0]

        user_config = get_config(raw.text)
        user_config = self.dump_person_config(user_config)
        helper.write_file(json.dumps(user_config), 'personal.txt')

        raw = self.bs4markup(raw.text)

        log.debug('STEP5: title : {}'.format(raw.find('title').text))
        abc.update_cfg('weibo.nickname',
                       raw.find('title').text.replace('的微博_微博', ''))
        log.info('[LOGIN:SUCCESS] {}({})'.format(cfg.get('weibo.nickname'),
                                                 username))

        self.dump_cookies(self.sess.cookies)
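The login flow above follows two javascript redirects by extracting the target of location.replace(...) with a regular expression. A small standalone demonstration of that extraction step (the response body below is made up; the real one comes from login.php):

import re

pa = r'location\.replace\([\'"](.*?)[\'"]\)'
sample = "<script>location.replace('https://example.com/next');</script>"
print(re.findall(pa, sample)[0])  # -> https://example.com/next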
Example #11
def run(
    search,
    login,
    name,
    big_head,
    img_cache_dir,
    skip_cache,
    img_height,
    log_level,
    test,
    init,
    update_personal_info,
):
    img_height = img_height or cfg.get('weibo.img_height', 3)
    log_level = log_level or cfg.get('weibo.log_level', 1)
    img_cache_dir = img_cache_dir or cfg.get('weibo.img_cache_dir',
                                             '/tmp/weibo')

    big_head = big_head or cfg.get('weibo.big_head', False)
    skip_cache = skip_cache or cfg.get('weibo.skip_cache', False)

    wb = Wb(big_head=big_head,
            img_cache_dir=img_cache_dir,
            img_height=img_height,
            use_cache=not skip_cache)
    user_details = {}
    logzero.loglevel(log_level * 10)

    if login:
        username = click.prompt('username', type=str)
        password = click.prompt('password', type=str, hide_input=True)
        wb.login(username, password)
        abc.force_quit()

    # load cookies
    wb.sess.cookies = wb.load_cookies()
    wb.who_am_i()

    if not wb.sess.cookies:
        log.warn('no cookie found, login first!!!')
        abc.force_quit()

    # test whether the cookie is still valid
    if test:
        wb.is_cookie_ok()
        abc.force_quit()

    # look up user info by account name
    if search:
        user_details = wb.click_fn_search(
            'do_search_user',
            search,
        )

    # query the local database
    if name:
        user_details = wb.click_fn_search(
            'load_user_info',
            name,
        )

    if not name and not search:
        if update_personal_info:
            wb.update_personal_info()
        user_details = wb.click_fn_search(
            'load_user_info',
            wb.personal_info.get('nick'),
            auto_select=True,
        )

    if user_details:
        # run the user-level actions
        wb.click_fn_user_actions(user_details)
        # update the user's photos into the local database
        wb.click_fn_update_photos(user_details, init)