Exemplo n.º 1
0
    async def get_8xs_some_label_all_video_list_by_label_name(
            self, label_name='大陆') -> list:
        """
        Collect the video entries of every page listed under one 8xs label.

        The label is looked up through ``self.get_8xs_all_sort_label_name_dict``
        to obtain its ``max_page_num`` and ``sort_type``; one task per page
        (1..max_page_num) is then handed to
        ``get_or_handle_target_data_by_task_params_list`` and the merged result
        is passed through ``list_remove_repeat_dict_plus`` keyed on
        ``video_name``.

        :param label_name: label (sort name) whose videos are collected
        :return: the merged, de-duplicated task results
        :raises AssertionError: when the label lookup yields no ``sort_type``
        """
        def build_tasks_params() -> list:
            """Build one parameter dict per page of the label."""
            label_info = self.get_8xs_all_sort_label_name_dict(
                sort_name=label_name)
            last_page_num = label_info.get('max_page_num', 0)
            sort_type = label_info.get('sort_type', '')
            assert sort_type != ''

            return [
                {
                    'page_num': page_num,
                    'label_name': label_name,
                    'sort_type': sort_type,
                }
                for page_num in range(1, last_page_num + 1)
            ]

        def describe_task(k) -> str:
            """Progress message emitted when the task for ``k`` is created."""
            return 'create task[where type: {}, page_num: {}] ...'.format(
                k['label_name'], k['page_num'])

        def task_args(k) -> list:
            """Positional arguments for the per-page fetch method."""
            return [k[key] for key in ('label_name', 'sort_type', 'page_num')]

        fetched = await get_or_handle_target_data_by_task_params_list(
            loop=self.loop,
            tasks_params_list=build_tasks_params(),
            func_name_where_get_create_task_msg=describe_task,
            func_name=self.get_8xs_video_list_by_label_name_and_page_num,
            func_name_where_get_now_args=task_args,
            func_name_where_handle_one_res=None,
            func_name_where_add_one_res_2_all_res=default_add_one_res_2_all_res,
            one_default_res=[],
            step=self.concurrency,
            logger=self.lg,
            get_all_res=True,
            concurrent_type=self.concurrent_type,
        )
        pprint(fetched)
        print('获取{} video_all_res_num: {}'.format(label_name, len(fetched)))

        # Report the count again after de-duplication so both totals are visible.
        deduped = list_remove_repeat_dict_plus(
            target=fetched,
            repeat_key='video_name',
        )
        print('实际获取{} video_all_res_num: {}'.format(label_name, len(deduped)))

        return deduped
Exemplo n.º 2
0
def _get_yw_one_type_company_id_list_task(self, ip_pool_type, keyword, page_num, timeout=15):
    """
    Fetch one search-result page for a keyword from the yiwugo mobile (wap)
    search api and extract the company info of every hit.

    :param self: unused here; kept for the task-call convention
    :param ip_pool_type: proxy pool type forwarded to ``Requests.get_url_body``
    :param keyword: search keyword (stringified for the query)
    :param page_num: result page to request
    :param timeout: request timeout forwarded to ``Requests.get_url_body``
    :return: result of ``list_remove_repeat_dict_plus`` over
        ``{'company_id', 'company_name'}`` dicts, keyed on ``company_id``
    """
    request_headers = get_random_headers(
        user_agent_type=1,
        cache_control='',)
    request_headers.update({
        'X-Requested-With': 'XMLHttpRequest',
    })
    query = (
        ('q', str(keyword)),
        ('cpage', str(page_num)),
        ('pageSize', '28'),
        ('st', '0'),
        ('m', ''),
        ('f', ''),
        ('s', ''),
    )
    body = Requests.get_url_body(
        url='http://wap.yiwugo.com/api/search/s.htm',
        headers=request_headers,
        params=query,
        ip_pool_type=ip_pool_type,
        num_retries=6,
        timeout=timeout,)

    parsed = json_2_dict(
        json_str=body,
        default_res={},
        logger=lg)
    hits = parsed.get('prslist', [])

    company_info_list = []
    for hit in hits:
        company_info_list.append({
            'company_id': hit.get('shopUrlId', ''),
            'company_name': hit.get('shopName', ''),
        })
    company_info_list = list_remove_repeat_dict_plus(
        target=company_info_list,
        repeat_key='company_id')

    # '+' marks a page that produced results, '-' an empty one.
    marker = '-' if company_info_list == [] else '+'
    lg.info('[{}] keyword: {}, page_num: {}'.format(marker, keyword, page_num))
    collect()

    return company_info_list
Exemplo n.º 3
0
def get_bd_shop_info():
    """
    Query the Baidu map keyword search for a fixed keyword/area/page,
    normalise the returned shop records and pretty-print them.

    Records lacking any required field (phone, address, name, city, province,
    uid, or a non-zero coordinate) are skipped. Nothing is returned.
    """
    def normalise(raw_shop) -> dict:
        """Turn one raw record into the output dict; AssertionError if incomplete."""
        telephone = raw_shop.get('telephone', '')
        assert telephone != '', 'phone不为空str!'
        # Several numbers may be comma separated; strip the area-code brackets.
        phone = [
            {'phone': number.replace('(', '').replace(')', '')}
            for number in telephone.split(',')
        ]
        address = raw_shop.get('address', '')
        assert address != '', 'address不为空str'
        company_name = raw_shop.get('name', '')
        assert company_name != '', 'company_name不为空str!'
        city_name = raw_shop.get('city', '')
        assert city_name != '', 'city_name != ""'
        province_name = raw_shop.get('province', '')
        assert province_name != '', 'province_name != ""'
        company_id = raw_shop.get('uid', '')
        assert company_id != '', 'company_id != ""'
        lat = raw_shop.get('location', {}).get('lat', 0.)
        lng = raw_shop.get('location', {}).get('lng', 0.)
        assert not (lat == 0. and lng == 0.), 'lat or lng异常!'

        return {
            'company_id': company_id,
            'company_name': company_name,
            'address': address,
            'city_name': city_name,
            'province_name': province_name,
            'phone': phone,
            'lat': lat,
            'lng': lng,
        }

    ak = get_ak()

    # Baidu api keyword search
    raw_shops = get_bd_map_shop_info_list_by_keyword_and_area_name(
        ak=ak,
        keyword='官方',
        area_name='杭州',
        page_num=2,
        ip_pool_type=tri_ip_pool,
        timeout=15,
        num_retries=8,)

    shop_info_list = []
    for raw_shop in raw_shops:
        try:
            shop = normalise(raw_shop)
        except AssertionError:
            continue
        else:
            shop_info_list.append(shop)

    shop_info_list = list_remove_repeat_dict_plus(
        target=shop_info_list,
        repeat_key='company_id',)
    pprint(shop_info_list)
Exemplo n.º 4
0
    def _parse_someone_cate_id_api_info(
        self,
        target_list: list,
        _type='m',
    ) -> list:
        """
        Parse article entries returned by the m-site or pc-site api.

        An entry is kept only when it has a non-empty ``title``, an ``id``,
        and share/read counts different from ``'0'``. Any error raised while
        reading an entry (including an unsupported ``_type``) causes that entry
        to be skipped, so an unsupported ``_type`` yields no entries at all.

        :param target_list: raw entries (dict-like) from the api
        :param _type: ``'m'`` reads the read count from ``read_num``,
            ``'pc'`` reads it from ``read_sum``
        :return: result of ``list_remove_repeat_dict_plus`` (keyed on
            ``title``) over dicts with keys ``id``, ``title``, ``read_num``,
            ``share_num`` and ``url``
        """
        res = []
        for item in target_list:
            try:
                title = item.get('title', '')
                assert title != '', 'title != ""'
                share_num = item.get('share_num', '0')
                assert share_num != '0', "share_num != '0'"
                article_id = item.get('id')
                assert article_id is not None, 'id is not None'

                if _type == 'm':
                    read_num = item.get('read_num', '0')
                elif _type == 'pc':
                    read_num = item.get('read_sum', '0')
                else:
                    # NotImplemented is the binary-operator sentinel and cannot
                    # be raised; NotImplementedError is the exception.
                    raise NotImplementedError(
                        'unsupported _type: {!r}'.format(_type))
                assert read_num != '0', "read_num != '0'"

            except Exception:
                # Best effort: a malformed or unsupported entry is dropped.
                continue

            res.append({
                'id': article_id,
                'title': title,
                'read_num': read_num,
                'share_num': share_num,
                'url': 'https://focus.youth.cn/mobile/detail/id/{}#'.format(
                    article_id),
            })

        res = list_remove_repeat_dict_plus(
            target=res,
            repeat_key='title',
        )
        # NOTE: sorting by read_num is not applied; entries keep the order
        # produced by list_remove_repeat_dict_plus.

        return res
Exemplo n.º 5
0
    async def get_s69_all_chinese_captions_video_list(self) -> list:
        """
        Collect the video entries of every Chinese-captions page.

        One task per page (1..``self.max_s63_chinese_captions_page_num``) is
        handed to ``get_or_handle_target_data_by_task_params_list``; the merged
        result is passed through ``list_remove_repeat_dict_plus`` keyed on
        ``video_name``.

        :return: the merged, de-duplicated task results
        """
        def build_tasks_params() -> list:
            """Build one parameter dict per page."""
            last_page_num = self.max_s63_chinese_captions_page_num
            return [
                {'page_num': page_num}
                for page_num in range(1, last_page_num + 1)
            ]

        def describe_task(k) -> str:
            """Progress message emitted when the task for ``k`` is created."""
            return 'create task[where type: {}, page_num: {}] ...'.format(
                '中文字幕', k['page_num'])

        def task_args(k) -> list:
            """Positional arguments for the per-page fetch method."""
            return [k['page_num']]

        fetched = await get_or_handle_target_data_by_task_params_list(
            loop=self.loop,
            tasks_params_list=build_tasks_params(),
            func_name_where_get_create_task_msg=describe_task,
            func_name=self.get_s69_chinese_captions_video_list_by_page_num,
            func_name_where_get_now_args=task_args,
            func_name_where_handle_one_res=None,
            func_name_where_add_one_res_2_all_res=default_add_one_res_2_all_res,
            one_default_res=[],
            step=self.concurrency,
            logger=self.lg,
            get_all_res=True,
            concurrent_type=self.concurrent_type,
        )
        pprint(fetched)
        print('获取中文字幕video_all_res_num: {}'.format(len(fetched)))

        # Plain requests are not used: whatever page_num is sent, only the
        # first page's data comes back, so a driver is used instead.
        deduped = list_remove_repeat_dict_plus(
            target=fetched,
            repeat_key='video_name',
        )
        print('实际获取中文字幕video_all_res_num: {}'.format(len(deduped)))

        return deduped
Exemplo n.º 6
0
    async def _search(self, k: str) -> list:
        """
        Search for ``k`` and return the de-duplicated hits.

        The lookup is delegated to ``self._search_by_wkb_and_lyoo``; the hits
        are passed through ``list_remove_repeat_dict_plus`` keyed on
        ``question_desc``, pretty-printed and their count is logged.

        :param k: search keyword
        :return: the de-duplicated search results
        """
        hits = await self._search_by_wkb_and_lyoo(k=k)
        unique_hits = list_remove_repeat_dict_plus(
            target=hits,
            repeat_key='question_desc',
        )

        pprint(unique_hits)
        self.lg.info('本次获取到k: {}, 结果数: {}'.format(k, len(unique_hits)))

        return unique_hits
Exemplo n.º 7
0
def _get_bd_or_gd_one_type_company_info_list_task(self,
                                                  ak:str,
                                                  keyword,
                                                  area_name,
                                                  page_num:int,
                                                  ip_pool_type,
                                                  timeout=15,
                                                  num_retries=8,
                                                  map_type='bd') -> list:
    """
    Fetch one page of shop records from the bd or gd map search and
    normalise them to a common shape.

    Records lacking any required field are skipped. The kept records are
    passed through ``list_remove_repeat_dict_plus`` keyed on ``company_id``.

    :param self: unused here; kept for the task-call convention
    :param ak: the api credential (bd ``ak`` or gd ``key``)
    :param keyword: eg: '鞋子'
    :param area_name: eg: '金华市'
    :param page_num: result page to request
    :param ip_pool_type: proxy pool type forwarded to the search helper
    :param timeout: request timeout forwarded to the search helper
    :param num_retries: retry count forwarded to the search helper
    :param map_type: 'bd' or 'gd'
    :return: dicts with keys ``company_id``, ``company_name``, ``address``,
        ``city_name``, ``province_name``, ``phone``, ``lat``, ``lng``
    :raises NotImplementedError: when ``map_type`` is neither 'bd' nor 'gd'
    """
    def normalise_bd(raw_shop) -> dict:
        """Normalise one bd record; AssertionError if a field is missing."""
        telephone = raw_shop.get('telephone', '')
        assert telephone != '', 'phone不为空str!'
        # Several numbers may be comma separated; strip the area-code brackets.
        phone = [
            {'phone': number.replace('(', '').replace(')', '')}
            for number in telephone.split(',')
        ]
        address = raw_shop.get('address', '')
        assert address != '', 'address不为空str'
        company_name = raw_shop.get('name', '')
        assert company_name != '', 'company_name不为空str!'
        city_name = raw_shop.get('city', '')
        assert city_name != '', 'city_name != ""'
        province_name = raw_shop.get('province', '')
        assert province_name != '', 'province_name != ""'
        company_id = raw_shop.get('uid', '')
        assert company_id != '', 'company_id != ""'
        lat = raw_shop.get('location', {}).get('lat', 0.)
        lng = raw_shop.get('location', {}).get('lng', 0.)
        assert not (lat == 0. and lng == 0.), 'lat or lng异常!'

        return {
            'company_id': company_id,
            'company_name': company_name,
            'address': address,
            'city_name': city_name,
            'province_name': province_name,
            'phone': phone,
            'lat': lat,
            'lng': lng,
        }

    def normalise_gd(raw_shop) -> dict:
        """Normalise one gd record; raises if a field is missing or malformed."""
        company_id = raw_shop.get('id', '')
        assert company_id != '', 'company_id != ""'
        company_name = raw_shop.get('name', '')
        assert company_name != '', 'company_name != ""'
        address = raw_shop.get('address', '')
        assert address != '', "address != ''"
        city_name = raw_shop.get('cityname', '')
        assert city_name != '', 'city_name != ""'
        province_name = raw_shop.get('pname', '')
        assert province_name != '', 'province_name != ""'
        telephone = raw_shop.get('tel', '')
        assert telephone != '', 'phone != ""'
        phone = [{'phone': number} for number in telephone.split(';')]
        location = raw_shop.get('location', '')
        assert location != '', 'localtion != ""'
        # location is "<longitude>,<latitude>"
        coords = location.split(',')
        lng = float(coords[0])
        lat = float(coords[1])

        return {
            'company_id': company_id,
            'company_name': company_name,
            'address': address,
            'city_name': city_name,
            'province_name': province_name,
            'phone': phone,
            'lat': lat,
            'lng': lng,
        }

    def get_bd() -> list:
        """bd: keyword search through the Baidu api."""
        raw_shops = get_bd_map_shop_info_list_by_keyword_and_area_name(
            ak=ak,
            keyword=keyword,
            area_name=area_name,
            page_num=page_num,
            ip_pool_type=ip_pool_type,
            timeout=timeout,
            num_retries=num_retries,
            logger=lg,)

        shops = []
        for raw_shop in raw_shops:
            try:
                shop = normalise_bd(raw_shop)
            except AssertionError:
                continue
            else:
                shops.append(shop)

        return shops

    def get_gd() -> list:
        """gd: keyword search through the gd api."""
        raw_shops = get_gd_map_shop_info_list_by_keyword_and_area_name(
            gd_key=ak,
            keyword=str(keyword),
            area_name=str(area_name),
            page_num=page_num,
            ip_pool_type=ip_pool_type,
            num_retries=num_retries,
            timeout=timeout,
            logger=lg,)

        shops = []
        for raw_shop in raw_shops:
            try:
                shop = normalise_gd(raw_shop)
            except Exception:
                # Any malformed record (not only a failed assert) is skipped.
                continue
            else:
                shops.append(shop)

        return shops

    if map_type not in ('bd', 'gd'):
        raise NotImplementedError

    shop_info_list = get_bd() if map_type == 'bd' else get_gd()
    shop_info_list = list_remove_repeat_dict_plus(
        target=shop_info_list,
        repeat_key='company_id',)

    # '+' marks a page that produced results, '-' an empty one.
    marker = '-' if shop_info_list == [] else '+'
    lg.info('[{}] keyword:{}, page_num:{}, area_name: {}'.format(
        marker, keyword, page_num, area_name))

    return shop_info_list
Exemplo n.º 8
0
def _get_gt_one_type_company_id_list_task(self,
                                          ip_pool_type,
                                          keyword,
                                          company_url_selector:dict,
                                          company_id_selector:dict,
                                          page_num,
                                          num_retries=8,
                                          timeout=15,) -> list:
    """
    Fetch one go2.cn search-result page for a keyword and extract the
    company ids found on it.

    :param self: unused here; kept for the task-call convention
    :param ip_pool_type: proxy pool type forwarded to ``Requests.get_url_body``
    :param keyword: search keyword (stringified for the query)
    :param company_url_selector: ``parse_field`` parser applied to the page
        body with ``is_first=False`` to obtain the company urls
    :param company_id_selector: ``parse_field`` parser applied to each
        company url to obtain its id
    :param page_num: result page to request
    :param num_retries: retry count forwarded to ``Requests.get_url_body``
    :param timeout: request timeout forwarded to ``Requests.get_url_body``
    :return: result of ``list_remove_repeat_dict_plus`` over
        ``{'company_id': ...}`` dicts, keyed on ``company_id``
    """
    # search
    request_headers = get_random_headers()
    request_headers.update({
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
    })
    query = (
        ('category_id', 'all'),
        ('search_1', '1'),
        ('q', str(keyword)),
    )
    search_url = 'http://www.go2.cn/search/all/page{}.html'.format(page_num)
    body = Requests.get_url_body(
        url=search_url,
        headers=request_headers,
        params=query,
        ip_pool_type=ip_pool_type,
        num_retries=num_retries,
        timeout=timeout,)

    company_url_list = parse_field(
        parser=company_url_selector,
        target_obj=body,
        is_first=False,
        logger=lg,)

    res = []
    for company_url in company_url_list:
        try:
            company_id = parse_field(
                parser=company_id_selector,
                target_obj=company_url,
                logger=lg,)
            assert company_id != '', 'company_id不为空值!'
        except AssertionError:
            # No id could be extracted from this url; skip it.
            continue
        else:
            res.append({'company_id': company_id})

    res = list_remove_repeat_dict_plus(
        target=res,
        repeat_key='company_id',)

    # '+' marks a page that produced results, '-' an empty one.
    marker = '-' if res == [] else '+'
    lg.info('[{}] keyword: {}, page_num: {}'.format(marker, keyword, page_num))

    return res
Exemplo n.º 9
0
def _get_pk_one_type_company_id_list_task(self,
                                          ip_pool_type,
                                          keyword:str,
                                          page_num,
                                          province_name,
                                          city_name,
                                          city_id,
                                          w3,
                                          num_retries=6,
                                          timeout=15,) -> list:
    """
    Fetch one ppkoo goods-search page for a keyword and extract the company
    info of each hit.

    :param self: unused here; kept for the task-call convention
    :param ip_pool_type: proxy pool type forwarded to ``Requests.get_url_body``
    :param keyword: search keyword
    :param page_num: result page to request
    :param province_name: copied unchanged into every result dict
    :param city_name: copied unchanged into every result dict
    :param city_id: sent as the ``city_id`` query parameter
    :param w3: copied unchanged into every result dict
    :param num_retries: retry count forwarded to ``Requests.get_url_body``
    :param timeout: request timeout forwarded to ``Requests.get_url_body``
    :return: result of ``list_remove_repeat_dict_plus`` (keyed on
        ``company_id``) over dicts with keys ``company_id``,
        ``province_name``, ``city_name``, ``w3`` and ``address``
    """
    request_headers = get_random_headers(
        user_agent_type=1,
        connection_status_keep_alive=False,
        cache_control='',)
    request_headers.update({
        'accept': 'application/json, text/plain, */*',
        'Origin': 'https://m.ppkoo.com',
    })
    # When searching by keywords the cid parameter can be left out.
    query = (
        ('keywords', keyword),
        ('hot', 'desc'),
        ('page', str(page_num)),
        ('city_id', str(city_id)),
    )
    body = Requests.get_url_body(
        url='https://www.ppkoo.com/api/Search/goods',
        headers=request_headers,
        params=query,
        ip_pool_type=ip_pool_type,
        num_retries=num_retries,
        timeout=timeout,)

    parsed = json_2_dict(
        json_str=body,
        default_res={},
        logger=lg)
    data = parsed.get('data', [])
    # The api may answer {"status":true,"total":"0","data":null}.
    if data is None:
        data = []

    company_info_list = []
    for goods in data:
        try:
            company_id = goods.get('business_id', '')
            assert company_id != ''
            # The detailed address is already available at this point.
            address = goods.get('shop_location', '')
            assert address != ''
            company_info_list.append({
                'company_id': company_id,
                'province_name': province_name,
                'city_name': city_name,
                'w3': w3,
                'address': address,
            })
        except Exception:
            # Best effort: a malformed entry is skipped.
            continue

    company_info_list = list_remove_repeat_dict_plus(
        target=company_info_list,
        repeat_key='company_id')

    # '+' marks a page that produced results, '-' an empty one.
    marker = '-' if company_info_list == [] else '+'
    lg.info('[{}] keyword: {}, page_num: {}'.format(marker, keyword, page_num))
    collect()

    return company_info_list
Exemplo n.º 10
0
def _get_hn_one_type_company_id_list_task(self,
                                          ip_pool_type,
                                          keyword, page_num,
                                          province_name,
                                          city_name,
                                          city_base_url,
                                          shop_item_selector:dict,
                                          shop_id_selector:dict,
                                          w3_selector:dict,
                                          num_retries=6,
                                          timeout=15,):
    """
    Fetch one hn goods-search page for a keyword on a city site and extract
    the company info of each shop found.

    :param self: unused here; kept for the task-call convention
    :param ip_pool_type: proxy pool type forwarded to ``Requests.get_url_body``
    :param keyword: search keyword (stringified for the query)
    :param page_num: result page to request
    :param province_name: copied unchanged into every result dict
    :param city_name: copied unchanged into every result dict
    :param city_base_url: base url of the city site; ``'/goods'`` is appended
    :param shop_item_selector: ``parse_field`` parser applied to the page body
        with ``is_first=False`` to obtain the shop items
    :param shop_id_selector: ``parse_field`` parser applied to each shop item
        to obtain its company id
    :param w3_selector: ``parse_field`` parser applied to ``city_base_url``
        to obtain ``w3``
    :param num_retries: retry count forwarded to ``Requests.get_url_body``
    :param timeout: request timeout forwarded to ``Requests.get_url_body``
    :return: [{'company_id': xxx, 'province_name': 'xx', 'city_name': 'xx', 'w3': 'xx'}, ...];
        an empty list when ``w3`` cannot be extracted
    """
    try:
        w3 = parse_field(
            parser=w3_selector,
            target_obj=city_base_url,
            logger=lg,)
        assert w3 != '', 'w3为空值!'
    except AssertionError:
        # Without w3 the results would be incomplete; log and give up.
        lg.error('遇到错误:', exc_info=True)
        return []

    request_headers = get_random_headers()
    request_headers.update({
        'Proxy-Connection': 'keep-alive',
    })
    query = (
        ('q', str(keyword)),
        ('sourcePage', '/'),
        ('page_no', str(page_num)),
    )
    # eg: 'http://www.huoniuniu.com/goods'
    goods_url = city_base_url + '/goods'
    body = Requests.get_url_body(
        url=goods_url,
        headers=request_headers,
        params=query,
        num_retries=num_retries,
        ip_pool_type=ip_pool_type,
        timeout=timeout)

    shop_item_list = parse_field(
        parser=shop_item_selector,
        target_obj=body,
        is_first=False,
        logger=lg,)

    shop_id_list = []
    for shop_item in shop_item_list:
        try:
            company_id = parse_field(
                parser=shop_id_selector,
                target_obj=shop_item,
                is_first=True,
                logger=lg)
            assert company_id != '', 'company_id不为空值!'
        except AssertionError:
            # No id could be extracted from this item; skip it.
            continue
        else:
            shop_id_list.append({
                'company_id': company_id,
                'province_name': province_name,
                'city_name': city_name,
                'w3': w3,
            })

    shop_id_list = list_remove_repeat_dict_plus(
        target=shop_id_list,
        repeat_key='company_id',)

    # '+' marks a page that produced results, '-' an empty one.
    marker = '-' if shop_id_list == [] else '+'
    lg.info('[{}] keyword: {}, page_num: {}, province_name: {}, city_name: {}'.format(
        marker,
        keyword,
        page_num,
        province_name,
        city_name,
    ))
    collect()

    return shop_id_list