Пример #1
0
    def load_post(self,
                  url,
                  data,
                  headers=None,
                  ret='json',
                  use_cache=True,
                  show_log=False):
        """POST ``url`` with ``data``, caching the raw response on disk.

        The cache file name is derived from the url plus the sorted
        key/value pairs of ``data``, so different payloads get distinct
        cache entries.

        :param url: target url
        :param data: POST payload dict (or falsy for no payload)
        :param headers: optional headers forwarded to ``do_sess_post``
        :param ret: response format; ``'json'`` responses are serialized
            with ``json.dumps`` before being written to the cache
        :param use_cache: try the local cache before hitting the network
        :param show_log: emit cache hit/miss debug lines
        :return: the raw response text (possibly empty)
        """
        name = self.map_url_to_cache_id(url)
        if data:
            for k in sorted(data.keys()):
                name += '{}{}'.format(k, data[k])

        raw = ''
        hit = False

        if use_cache:
            raw = self.load_from_cache(name)
            # BUGFIX: previously ``hit`` was set True whenever use_cache
            # was on, so the log below claimed a "hit" even on a miss.
            hit = bool(raw)

        if not raw:
            if show_log:
                zlog.debug('cache miss: ({})'.format(name))
            raw = self.do_sess_post(url, data, headers, ret)
            if raw:
                if ret == 'json':
                    raw = json.dumps(raw)
                zlog.debug('write ({}) to {}'.format(len(raw), name))
                helper.write_file(raw, name)

        if show_log:
            zlog.debug('[cache {}:{:>8}] post {}'.format(
                'hit' if hit else 'miss', len(raw or ''), name))
        return raw
Пример #2
0
    def fetch_one(self, depth=1):
        """Crawl up to ``depth`` pages of the ajax list and cache them.

        Scrapes the anti-csrf token from the landing page's inline script,
        then repeatedly calls the ajax endpoint, advancing
        ``self.current_id`` to the id of the last item fetched.  Stops
        early when a page is empty or holds fewer than 10 items.

        :param depth: maximum number of ajax pages to fetch
        :return: accumulated list of item dicts (also written to
            ``one.all.json``), or None when no script tag is found
        """
        r = self.bs4markup(self.sess.get(M['one']).text)

        script_raw = r.find('div', role='main').find('script')
        if not script_raw:
            return

        token = ''
        for line in script_raw.text.split('\n'):
            if line.find('token') != -1:
                token = line.split('\'')[1]
                break
        log.debug('ajaxlist token: {}'.format(token))

        all_data = []

        while depth:
            ajax_url = '{}{}?_token={}'.format(M['ajaxlist'], self.current_id,
                                               token)
            log.debug('fetch {}'.format(ajax_url))
            cnt = self.sess.get(ajax_url)
            dat = cnt.json()
            page = dat.get('data') or []
            # BUGFIX: an empty page previously crashed on all_data[-1]
            # (IndexError) when it was the very first iteration.
            if not page:
                break
            all_data += page
            self.current_id = all_data[-1]['id']
            time.sleep(1)
            depth -= 1

            # a short page means we reached the end of the listing
            if len(page) < 10:
                break

        helper.write_file(json.dumps(all_data), 'one.all.json')
        return all_data
Пример #3
0
    def load(self, url, use_cache=True, show_log=False):
        """Fetch the url ``raw info``: cache first, then the network.

        :param url: target url
        :type url: str
        :param use_cache: try the local cache before hitting the network
        :type use_cache: bool
        :param show_log: emit a debug line with the source and size
        :type show_log: bool
        :return: the ``raw info`` of the url
        :rtype: ``str``
        """
        _name = self.map_url_to_cache_id(url)
        raw = ''
        hit = False

        if use_cache:
            raw = self.load_from_cache(_name)
            # BUGFIX: previously ``hit`` was set True whenever use_cache
            # was on, so the log claimed "Cache" even when the data came
            # from the network.
            hit = bool(raw)

        if not raw:
            if show_log:
                zlog.debug('from cache got nothing {}'.format(_name))
            raw = self.do_sess_get(url)
            if raw:
                helper.write_file(raw, _name)

        if show_log:
            zlog.debug('[{}:{:>8}] get {}'.format('Cache' if hit else 'Net',
                                                  len(raw), url))
        return raw
Пример #4
0
    def update_tags(self):
        """Refresh all tags of 44.style into the local cache.

        Scrapes every ``div.tagall photo`` block from the tags page and
        stores the per-block ``[{src, name}, ...]`` lists as
        ``d4.tags.json``.
        """
        page = self.bs4get(D['tags'])
        if not page:
            log.error('cannot update tags.')
            return

        tags_root = []
        for block in page.find_all('div', class_='tagall photo'):
            tags_root.append([
                {
                    'src': li.a.get('href'),
                    'name': li.a.text,
                }
                for li in block.ul.find_all('li')
            ])

        if tags_root:
            helper.write_file(json.dumps(tags_root).encode(), 'd4.tags.json')
Пример #5
0
    def md2list(self):
        """Parse the markdown in ``self.txt`` into a flat library list.

        Each bullet of the form ``* name: desc [官网](url)`` becomes a dict
        with ``name``/``desc``/``url`` keys; the result is dumped to
        ``gitflat.awesome.json``.  Non-matching bullets are skipped.

        NOTE(review): ends with ``os._exit(-1)`` — presumably a one-shot
        tool; confirm before calling from a long-lived process.
        """
        dat = []

        for line in self.txt.split('\n'):
            line = line.lstrip()
            if not line.startswith('*'):
                continue

            try:
                # bullet shaped like ``* name: desc [官网](url)``
                k, desc = line.split(':')
                desc_, url_ = desc.rstrip().split('[官网](')

                dat.append({
                    'name': k.split('*')[1].strip(),
                    'desc': desc_,
                    # drop the trailing ')' of the markdown link
                    'url': url_[:-1],
                })
            except ValueError:
                # wrong number of ':' or missing url marker: not a
                # library bullet, skip it
                pass

        helper.write_file(json.dumps(dat),
                          'gitflat.awesome.json',
                          append=False)
        os._exit(-1)
Пример #6
0
    def dump_my_page_config(txt):
        """Persist the logged-in account info found on a personal page.

        Extracts the ``$CONFIG`` javascript block from ``txt``, parses the
        ``CONFIG['key'] = 'value'`` assignments it contains, keeps only a
        whitelist of account fields, and writes them as json to the
        personal cache file.

        :param txt: raw html of the personal page
        :type txt: str
        :return: dict of the extracted account fields
        :rtype: dict
        """
        def extract_config(markup):
            """Cut the account-info block out of the page markup.

            The block lives between two well-known html comments.
            """
            _START = '<!-- $CONFIG -->'
            _END = '<!-- / $CONFIG -->'
            return markup.split(_START)[1].split(_END)[0]

        config = extract_config(txt)
        lines = []
        for t in config.split('\n'):
            if t and 'CONFIG' in t and 'var ' not in t:
                # drop the leading char and any trailing whitespace
                lines.append(t[1:].rstrip())

        wanted = [
            'oid',
            'page_id',
            'uid',
            'nick',
            'sex',
            'watermark',
            'domain',
            'lang',
            'skin',
            'avatar_large',
            'pid',
        ]

        dat = {}
        for entry in lines:
            # each entry looks like CONFIG['key']='value'; (semicolon cut)
            key_part, value_part = entry[:-1].split('=')
            key = key_part.split('\'')[1]
            if key not in wanted:
                continue
            value = value_part.replace('\'', '')
            if key == 'avatar_large':
                value = 'http:' + value
            dat[key] = value

        helper.write_file(json.dumps(dat), base.app_pth['personal'])
        return dat
Пример #7
0
    def spawn(self, url, force_spawn=False):
        """Use the url for creation of domain and fetch cookies.

        - init cache dir by the url domain as ``<base>/domain``
        - save the cookies to file ``<base>/domain/cookie.txt``
        - init ``headers.get/post/json`` with response info
        - init ``site_dir/site_raw/site_media``

        :param url: site homepage url
        :type url: str
        :param force_spawn: re-fetch the homepage even when cached
        :type force_spawn: bool
        :return: True on success, False when the url has no valid domain
            or the homepage request fails
        :rtype: bool
        """
        _url, _ = self.get_domain_home_from_url(url)
        if not _url:
            return False

        self.cache['site_dir'] = os.path.join(self.cache['base'], self.domain)
        for k in ['raw', 'media']:
            self.cache['site_' + k] = os.path.join(self.cache['site_dir'], k)
            helper.mkdir_p(self.cache['site_' + k], True)

        ck_pth = os.path.join(self.cache['site_dir'], 'cookie.txt')
        helper.mkdir_p(ck_pth)

        name = os.path.join(self.cache['site_raw'], 'homepage')
        # cached homepage and no force: just restore the saved cookies
        if not force_spawn and helper.is_file_ok(name):
            self.sess.cookies = self.load_cookies(ck_pth)
            return True
        else:
            zlog.debug('{} not exist!'.format(name))

        res = self.sess.get(url, headers=self.__header__)
        if res.status_code != 200:
            return False
        if res:
            helper.write_file(res.content, name)

        # FIX: was ``for k, v in self.headers.items()`` with v unused.
        # NOTE(review): every entry is assigned the SAME request-headers
        # object (behavior kept); if independent per-key copies were
        # intended, each assignment needs dict(res.request.headers).
        for k in self.headers:
            self.headers[k] = res.request.headers

        self.dump_cookies(cookies=self.sess.cookies, save_to=ck_pth)

        return True
Пример #8
0
    def update_cache(self):
        """Rebuild the jobble caches from the main page.

        Walks every resource of ``main_page``; resources without a
        ``sub_res`` get their sub page fetched and sorted by name.  Both
        the nested tree and a flattened list are written to json caches.
        """
        tree = self.main_page()
        flattened = []

        for node in tqdm(tree, ascii=True):
            subs = node.get('sub_res')
            if subs:
                flattened += subs
                continue

            fetched = sorted(self.sub_page(node.get('url')),
                             key=lambda item: item['name'])
            node['sub_res'] = fetched
            flattened += node.get('sub_res')

        helper.write_file(json.dumps(tree).encode(), 'jobble.json')
        helper.write_file(json.dumps(flattened).encode(), 'jobble.flat.json')
Пример #9
0
    def update_tag_pages_cache(self, tag):
        """Cache the page list of one tag, skipping tags already cached.

        :param tag: dict with a ``src`` key identifying the tag
        :type tag: dict
        """
        src = tag['src']
        # the on-disk index is the source of truth for what is done
        cached = json.loads(helper.read_file('d4.t2i.json'))
        if src in cached:
            log.debug('{} already got'.format(src))
            return

        self.t2i[src] = self.fetch_tags_pages_by_index(src)
        helper.write_file(json.dumps(self.t2i).encode(), 'd4.t2i.json')
Пример #10
0
    def dump_failure_songs(self, song, action='add'):
        """Maintain the on-disk yaml list of songs that failed to download.

        :param song: song identifier to add or remove
        :param action: ``'add'``, ``'del'`` or ``'clear'``; anything else
            is logged and ignored (nothing is written)
        """
        dat = self.load_failure_songs() or []
        # drop falsy leftovers from earlier runs
        dat = [x for x in dat if x]
        if action == 'add':
            dat.append(song)
        elif action == 'del':
            # idiom fix: was dat.pop(dat.index(song))
            if song in dat:
                dat.remove(song)
        elif action == 'clear':
            dat = []
        else:
            zlog.error('unsupported action: ({})'.format(action))
            return

        # de-duplicate (order is not preserved) and persist as yaml
        dat = list(set(dat))
        dat = yaml.dump(dat)
        file_pth = os.path.expanduser(self.failure_store)
        helper.write_file(dat, file_pth)
Пример #11
0
    def __init__(self, pth, dat=None, enable_default_log=True):
        """Initialize the config file.

        - file missing: seed it with the dict ``dat``
        - file present: load it, then overlay ``dat``

        :param pth: ``file path of the config store``
        :type pth: str
        :param dat: ``initial config values``
        :type dat: dict
        :param enable_default_log: ``install the default log section``
        :type enable_default_log: bool
        """
        try:
            if not helper.is_file_ok(pth):
                helper.write_file('', pth)
            self._pth, t = os.path.split(pth)
            self._cfg_name = t.split('.')[0]
        except Exception as _:
            # fall back to a safe location when pth is unusable
            self._pth = '/tmp'
            self._cfg_name = 'izen'

        self.cfg = profig.Config(pth, encoding='utf-8')

        # load existing values from disk
        self.cfg.read()

        # install the default log section
        if enable_default_log:
            self.__spawn()

        # apply the caller-supplied defaults
        if dat:
            self.__do_init(dat)

        # when the file still does not exist, flush the in-memory defaults
        # to disk and let the program restart with them.
        # NOTE(review): write_file('', pth) above already creates the file,
        # so this sync may never fire for the fresh-file case — confirm.
        if not os.path.exists(os.path.expanduser(pth)):
            self.cfg.sync()
Пример #12
0
    def gen_lesson_url(self, chapters):
        """Collect the mp3 and doc urls of every chapter.

        Posts each chapter id to the lecture endpoints, pauses between
        requests to stay polite, and caches the collected list as json.

        :param chapters: iterable of dicts with ``lid`` and ``name`` keys
        """
        collected = []
        for chapter in tqdm(chapters, ascii=True):
            lid = chapter['lid']

            # resolve the audio url
            res = self.sess.post(M['lecture_mp3'],
                                 data={'lectureId': lid})
            entry = {
                'lid': lid,
                'name': chapter['name'],
                'mp3': unquote(res.json()['obj']),
            }
            time.sleep(1)

            # resolve the document url
            res = self.sess.post(M['lecture_doc'],
                                 data={'lectureId': lid})
            entry['doc'] = unquote(res.json()['obj'])
            collected.append(entry)

            pause = abc.randint(1, 3)
            log.debug('sleep {}'.format(pause))
            time.sleep(pause)

        helper.write_file(json.dumps(collected),
                          self.catalog_info['cache_file'])
Пример #13
0
    def download_by_index(self, index):
        """Download every image of the gallery identified by ``index``.

        Resolves the gallery page, derives the remote image-url pattern
        and the local folder layout (a few indexes use special naming
        rules), then downloads each numbered image with throttling.
        Progress is tracked in the module-level ``cache_index`` /
        ``finished`` globals and the finished list is persisted to
        ``/tmp/mz.done``.

        :param index: gallery id as a string
        """
        global cache_index
        global finished

        # indexes whose image urls carry one extra trailing character
        # before the extension (see _name_off below)
        _e_list1 = ['33201', '34949', '35850', '45364', '47526']

        if not index:
            log.info('no index found...')
            return

        dat = self.get_page_by_index(index)
        if not dat:
            log.error('fail@none: {}'.format(index))
            return

        try:
            # extension, folder name, auto-incremented name, fixed image url
            _ext = dat['img_src'].split('.')[-1]
            _name_off = 3 + len(_ext)
            if index in _e_list1:
                _name_off = 4 + len(_ext)
            img = dat['img_src'][:-_name_off]

            if index == '54856':
                _name_pre = ''
                fd_img = 'a'
            elif index in _e_list1:
                fd_img, _ = get_fd_name(dat['img_src'])
                _name_pre = ''
            else:
                _name_pre = img.split('/')[-1]
                fd_img = img[-1]
            _path_local = os.path.join(dat['time'], fd_img)

            fd = os.path.join(self.base_dir, _path_local)
            helper.mkdir_p(fd, True)
            os.chdir(fd)

            if index in _e_list1:
                _img_fmt = '{}{}1.{}'
            else:
                _img_fmt = '{}{}.{}'

            # one entry per image: remote url + local title
            params = [
                {
                    'img_url': _img_fmt.format(img, str(x + 1).zfill(2), _ext),
                    'title': '{}-{}{}.{}'.format(
                        dat['name'].decode(),
                        _name_pre,
                        str(x + 1).zfill(2),
                        _ext,
                    )
                }
                for x in range(dat['total'])
            ]

            _fail_count = 0
            for para in tqdm(params, ascii=True, desc='%8s ✈ %10s' % (index, _path_local)):
                rs = self.download_and_save(para)
                if rs == self.save_status['fail']:
                    _fail_count += 1
                    time.sleep(0.5)
                elif rs == self.save_status['ok']:
                    time.sleep(1.5)
                elif rs == self.save_status['skip']:
                    # the local file already exists: do not wait
                    time.sleep(0.0001)

                if _fail_count > 5:
                    # too many failures: give up on this gallery
                    log.warn('fail@5 img of this, skip({}) => ({})'.format(index, _path_local))
                    break

            cache_index += 1
            finished.append(index)
            helper.write_file(json.dumps(finished), '/tmp/mz.done')
            log.warn('Done:({}/{})'.format(cache_index, index))
        except TypeError as _:
            log.error('fail@type: {}'.format(index))
Пример #14
0
    def md2dict(self):
        """Parse the awesome-list markdown in ``self.txt`` into two shapes.

        Builds a nested structure (``###`` heading -> description ->
        categorized groups of libraries) plus a flat list of all library
        entries, writes both to json caches, and pushes the nested data
        onto a redis-backed list.

        NOTE(review): ends with ``os._exit(-1)`` — presumably a one-shot
        tool; confirm before calling from long-lived code.
        """
        flat_dat = []
        dat = []

        # current ``###`` section and the category block being filled
        h3 = {'name': '', 'desc': '', 'groups': []}
        lis = {
            'cate': '',
            'libs': [],
        }
        line_count = 0
        for lno, line in enumerate(self.txt.split('\n')):
            line = line.lstrip()
            if not line:
                continue

            line_count += 1
            if line.startswith('*'):
                try:
                    # bullet shaped like ``* name: desc [官网](url)``
                    k, desc = line.split(':')
                    desc_, url_ = desc.rstrip().split('[官网](')

                    d = {
                        'name': k.split('*')[1].lstrip().rstrip(),
                        'desc': desc_,
                        'url': url_[:-1],
                    }
                    flat_dat.append(d)
                    lis['libs'].append(d)
                except ValueError as _:
                    # a bullet that does not parse as a library entry is
                    # treated as the start of a new category block
                    lis = {
                        'cate':
                        '*'.join(line.split('*')[1:]).lstrip().rstrip(),
                        'libs': [],
                    }
                continue

            if line.startswith('###'):
                # flush the previous section before starting a new one
                if h3.get('name'):
                    h3['groups'].append(lis)
                    dat.append(h3)
                    lis = {
                        'cate': '',
                        'libs': [],
                    }
                h3 = {
                    'name': line.split('###')[-1].lstrip().rstrip(),
                    'desc': '',
                    'groups': [],
                }
                continue

            # any other non-empty line is the section's description
            h3['desc'] = line.lstrip().rstrip()

        helper.write_file(json.dumps(dat), 'github.awesome.json')
        helper.write_file(json.dumps(flat_dat), 'gitflat.awesome.json')
        dl_l = dbstore.rds.List(key='awesome.todo.libs')
        dl_l += dat
        os._exit(-1)
Пример #15
0
def save_img(dat, pth):
    """Write image bytes to ``pth``; empty or None payloads are ignored."""
    if dat:
        helper.write_file(dat, pth)
Пример #16
0
    def login(self, username='', password=''):
        """Perform the full weibo web login handshake.

        Posts the login form, follows the two ``location.replace``
        redirects embedded in the responses, extracts the unique user id,
        fetches the profile page, persists the parsed ``$CONFIG`` account
        info and the session cookies, and stores the nickname in the app
        config.

        NOTE(review): the ``sp``/``servertime``/``nonce``/``rsakv`` values
        below look like stale captured defaults — presumably ``pre_login``
        overwrites them; confirm.

        :param username: weibo account name
        :param password: weibo account password
        """
        form = {
            'entry': 'weibo',
            'gateway': '1',
            'from': '',
            'savestate': '7',
            'qrcode_flag': False,
            'useticket': '1',
            'pagerefer':
            'https://login.sina.com.cn/crossdomain2.php?action=logout&r=https%3A%2F%2Fweibo.com%2Flogout.php%3Fbackurl%3D%252F',
            'vsnf': 1,
            'service': 'miniblog',
            'pwencode': 'rsa2',
            'sr': '1280*800',
            'encoding': 'UTF-8',
            'prelt': '41',
            'url':
            'https://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
            'returntype': 'META',
            'sp':
            '3d841d27085a2fac6f5218f18f4ce5caf3cb020c49bc109608dc106f3e14cf8354e41ad03444672f492b490f81f155fff9364f2dff86677429e5b745fbe4ccffadd5458a0a6f29a5d769c63801753b49b3eaf7b52489da04a79f0724b9842c9aac7f546d2eb037d44484bd0ad3c8ac35ba9136a1eceaaa59587168881dc06f3b',
            'servertime': '1512958714',
            'nonce': 'ZIVLK5',
            'rsakv': '1330428213',
        }
        dat = self.pre_login(username=username, password=password)
        form = dict(form, **dat)

        log.debug('STEP1: get {}'.format(M['login']))
        res = self.sess.post(M['login'], data=form, headers=self.post_headers)

        # parse the redirect url out of the login.php response
        pa = r'location\.replace\([\'"](.*?)[\'"]\)'
        loop_url = re.findall(pa, res.content.decode('GBK'))[0]
        log.debug('STEP2: get {}'.format(loop_url))
        # follow the first redirect and read its response
        res = self.sess.get(loop_url)
        # the response has two parts; the first (setCrossDomainUrlList)
        # yields "302 Moved Temporarily", so it is skipped and only the
        # second part is parsed, the same way as the login.php response
        final_url = re.findall(pa, res.content.decode('GBK'))[0]
        log.debug('STEP3: get {}'.format(final_url))

        res = self.sess.get(final_url)
        uuid_pa = r'"uniqueid":"(.*?)"'
        uuid_res = re.findall(uuid_pa, res.text, re.S)[0]
        log.debug('STEP4:user_id: {}'.format(uuid_res))

        url = M['profile'].format(uuid_res)
        raw = self.sess.get(url)

        def get_config(raw_mark):
            # the account info lives between two well-known html comments
            _START = '<!-- $CONFIG -->'
            _END = '<!-- / $CONFIG -->'
            return raw_mark.split(_START)[1].split(_END)[0]

        user_config = get_config(raw.text)
        user_config = self.dump_person_config(user_config)
        helper.write_file(json.dumps(user_config), 'personal.txt')

        raw = self.bs4markup(raw.text)

        log.debug('STEP5: title : {}'.format(raw.find('title').text))
        abc.update_cfg('weibo.nickname',
                       raw.find('title').text.replace('的微博_微博', ''))
        log.info('[LOGIN:SUCCESS] {}({})'.format(cfg.get('weibo.nickname'),
                                                 username))

        self.dump_cookies(self.sess.cookies)