Пример #1
0
def get_encrypt_json(*, start_date: str, end_date: str,
                     keywords: List[List[str]], type: str, area: int,
                     cookies: str) -> Dict:
    """
    Request the (still encrypted) index payload for the given keywords.

    start_date / end_date: bounds of the queried period. Either an already
        formatted ``'%Y-%m-%d'`` string or any object exposing ``strftime``
        (e.g. ``datetime.date`` / ``datetime.datetime``). Ignored when
        ``type`` is ``'live'``.
    keywords: groups of keywords; every keyword is sent with ``wordType`` 1.
    type: which endpoint to call: ``'search'``, ``'live'``, ``'news'`` or
        ``'feed'``. Any other value raises ``KeyError``.
    area: area code, sent as ``region`` for ``'live'`` and ``area`` otherwise.
    cookies: cookie header value forwarded to ``http_get``.

    Returns the decoded JSON response as a dict.

    Raises ``QdataError(ErrorCode.NO_LOGIN)`` when the response status is
    10000 and ``QdataError(ErrorCode.UNKNOWN)`` for any other non-zero status.
    """
    pre_url_map = {
        'search': 'http://index.baidu.com/api/SearchApi/index?',
        'live': 'http://index.baidu.com/api/LiveApi/getLive?',
        'news': 'http://index.baidu.com/api/NewsApi/getNewsIndex?',
        'feed': 'http://index.baidu.com/api/FeedSearchApi/getFeedIndex?'
    }

    def _format_date(value):
        # The signature advertises ``str`` while the previous implementation
        # required ``strftime``; honour both so neither kind of caller breaks.
        if isinstance(value, str):
            return value
        return value.strftime('%Y-%m-%d')

    pre_url = pre_url_map[type]
    word_list = [[{
        'name': keyword,
        'wordType': 1
    } for keyword in keyword_list] for keyword_list in keywords]
    if type == 'live':
        # The live endpoint takes no date range and names the area "region".
        request_args = {'word': json.dumps(word_list), 'region': area}
    else:
        request_args = {
            'word': json.dumps(word_list),
            'startDate': _format_date(start_date),
            'endDate': _format_date(end_date),
            'area': area
        }
    url = pre_url + urlencode(request_args)
    html = http_get(url, cookies)
    datas = json.loads(html)
    if datas['status'] == 10000:
        raise QdataError(ErrorCode.NO_LOGIN)
    if datas['status'] != 0:
        raise QdataError(ErrorCode.UNKNOWN)
    return datas
Пример #2
0
def http_get(url: str, cookies: str) -> str:
    """
    Send a GET request; every GET in the program goes through this function.

    This is the place to add multi-cookie crawling or request retries
    if they are ever needed.

    url: full URL to fetch.
    cookies: value placed in the ``Cookie`` request header.

    Returns the response body as text.

    Raises ``QdataError(ErrorCode.NETWORK_ERROR)`` when the request fails at
    the transport level (timeout, connection error, ...) or the server
    answers with a status code other than 200.
    """
    # Copy so the shared module-level headers are never mutated.
    _headers = headers.copy()
    _headers['Cookie'] = cookies
    try:
        response = requests.get(url, headers=_headers, timeout=5)
    except requests.RequestException as err:
        # RequestException is the base class of Timeout, ConnectionError, etc.,
        # so every transport failure surfaces as the same domain error.
        raise QdataError(ErrorCode.NETWORK_ERROR) from err
    if response.status_code != 200:
        raise QdataError(ErrorCode.NETWORK_ERROR)
    return response.text
Пример #3
0
def get_search_index(*,
                     keywords_list: List[List[str]],
                     start_date: str,
                     end_date: str,
                     cookies: str,
                     area: int = 0):
    """
    Yield formatted search-index records for the given keyword groups.

    The requested period is split with ``common.get_time_range_list``; each
    sub-range is fetched, decrypted for every kind in ``ALL_KIND`` and passed
    through ``format_data``.

    This is a generator, so nothing (including the keyword-count check) runs
    until it is iterated.

    Raises ``QdataError(ErrorCode.KEYWORD_LIMITED)`` when more than five
    keyword groups are supplied.
    """
    if len(keywords_list) > 5:
        raise QdataError(ErrorCode.KEYWORD_LIMITED)

    time_ranges = common.get_time_range_list(start_date, end_date)
    for range_start, range_end in time_ranges:
        response = common.get_encrypt_json(
            start_date=range_start,
            end_date=range_end,
            keywords=keywords_list,
            type='search',
            area=area,
            cookies=cookies,
        )
        payload = response['data']
        entries = payload['userIndexes']
        # The decryption key is tied to the uniqid of this very response.
        key = common.get_key(payload['uniqid'], cookies)

        for entry in entries:
            # Decrypt each kind's series in place before formatting.
            for kind in ALL_KIND:
                section = entry[kind]
                section['data'] = common.decrypt_func(key, section['data'])
            yield from format_data(entry)
def get_extended_index(
    *,
    keywords_list: List[List[str]],
    start_date: str,
    end_date: str,
    cookies: str,
    area: int,
    type: str
):
    """
    Yield formatted index records for the endpoint selected by ``type``.

    The requested period is split with ``common.get_time_range_list``; each
    sub-range is fetched, its entries are decrypted and run through
    ``format_data``, and every resulting record is tagged with ``type``
    under the ``'type'`` key.

    This is a generator, so nothing (including the keyword-count check) runs
    until it is iterated.

    Raises ``QdataError(ErrorCode.KEYWORD_LIMITED)`` when more than five
    keyword groups are supplied.
    """
    if len(keywords_list) > 5:
        raise QdataError(ErrorCode.KEYWORD_LIMITED)

    time_ranges = common.get_time_range_list(start_date, end_date)
    for range_start, range_end in time_ranges:
        response = common.get_encrypt_json(
            start_date=range_start,
            end_date=range_end,
            keywords=keywords_list,
            type=type,
            area=area,
            cookies=cookies,
        )
        payload = response['data']
        entries = payload['index']
        # The decryption key is tied to the uniqid of this very response.
        key = common.get_key(payload['uniqid'], cookies)

        for entry in entries:
            entry['data'] = common.decrypt_func(key, entry['data'])
            for record in format_data(entry):
                record['type'] = type
                yield record
Пример #5
0
def get_live_search_index(*,
                          keywords_list: List[List[str]],
                          cookies: str,
                          area: int = 0):
    """
    Yield formatted live-index records for the given keyword groups.

    A single request is made to the ``'live'`` endpoint (no date range).
    For each result the series for the requested area is selected,
    decrypted for every kind in ``ALL_KIND`` and passed to ``format_data``
    together with the keyword names of that result.

    This is a generator, so nothing (including the keyword-count check) runs
    until it is iterated.

    Raises ``QdataError(ErrorCode.KEYWORD_LIMITED)`` when more than five
    keyword groups are supplied.
    """
    if len(keywords_list) > 5:
        raise QdataError(ErrorCode.KEYWORD_LIMITED)

    response = common.get_encrypt_json(
        start_date='',
        end_date='',
        keywords=keywords_list,
        type='live',
        area=area,
        cookies=cookies,
    )
    payload = response['data']
    results = payload['result']
    key = common.get_key(payload['uniqid'], cookies)

    # A non-zero area is looked up by its string form, area 0 by position 0.
    index_key = str(area) if area != 0 else 0

    for result in results:
        names = [info['name'] for info in result['key']]
        series = result['index'][index_key]
        for kind in ALL_KIND:
            series[kind] = common.decrypt_func(key, series[kind])
        yield from format_data(series, names)
Пример #6
0
def get_cookie_by_qr_login() -> str:
    """
    Log in by scanning a QR code and return the resulting cookie string.

    The flow has three stages, each reported with its own error code and
    with the underlying exception attached as the cause:

    1. fetch and display the QR code    -> ``ErrorCode.GET_QR_FAIL``
    2. obtain BDUSS and login cookies   -> ``ErrorCode.LOGIN_FAIL``
    3. append the ``get_exin`` fragment -> ``ErrorCode.INDEX_LOGIN_FAIL``

    Raises ``QdataError`` with the code of the stage that failed.
    """
    print("扫完码记得关闭弹出的图片框...")

    try:
        qrcode_link, sign, callback = get_qrcode_info()
        show_qrcode(qrcode_link)
    except Exception as err:
        raise QdataError(ErrorCode.GET_QR_FAIL) from err

    try:
        bduss = get_bduss(sign, callback)
        cookies = get_login_cookie(bduss)
    except Exception as err:
        raise QdataError(ErrorCode.LOGIN_FAIL) from err

    try:
        cookies = cookies + get_exin()
    except Exception as err:
        raise QdataError(ErrorCode.INDEX_LOGIN_FAIL) from err

    return cookies
Пример #7
0
def get_exin() -> str:
    """
    Fetch the extra (annoying) cookie fragment from miao.baidu.com.

    Posts ``EXIN_TOKEN`` and builds a cookie fragment, starting with ``"; "``,
    from the JSON reply:

    * ``data`` is a dict -> ``__yjsv5_shitong`` cookie made of seven
      underscore-joined fields;
    * ``data`` is a str  -> ``__yjs_st`` cookie holding the base64 of the
      URL-quoted ``data_keyid_sign`` string.

    Raises ``QdataError(ErrorCode.LOGIN_FAIL)`` for any other ``data`` type.
    """
    resp = session.post(
        "https://miao.baidu.com/abdr", data=EXIN_TOKEN, headers=HEADERS
    )
    resp_data = json.loads(resp.text)
    payload = resp_data['data']

    if isinstance(payload, dict):
        fields = (
            payload['ver'],
            resp_data['key_id'],
            payload['lid'],
            payload['ret_code'],
            payload['server_time'],
            payload['ip'],
            resp_data['sign'],
        )
        return "; __yjsv5_shitong={}_{}_{}_{}_{}_{}_{}".format(*fields)

    if isinstance(payload, str):
        joined = "_".join([payload, resp_data['key_id'], resp_data['sign']])
        encoded = b64encode(quote(joined).encode()).decode()
        return "; __yjs_st=2_{}".format(encoded)

    raise QdataError(ErrorCode.LOGIN_FAIL)
Пример #8
0
def get_company_count(*,
                      area_code: List[str] = None,
                      category: List[str] = None,
                      reg_capital_range: List[Tuple[int, int]] = None,
                      establish_time_range: List[Tuple[int, int]] = None,
                      reg_status: List[str] = None,
                      capital_unit: List[str] = None,
                      company_type: List[str] = None,
                      institution_type: List[str] = None,
                      staff_num_range: List[Tuple[int, int]] = None,
                      financing_round: List[str] = None,
                      listed_type: List[str] = None,
                      has_phone: bool = None,
                      has_mobile: bool = None,
                      has_email: bool = None,
                      has_brand: bool = None,
                      has_dishonest: bool = None,
                      has_website: bool = None,
                      has_chattel_mortage: bool = None,
                      has_copyright: bool = None,
                      has_soft_copyright: bool = None,
                      is_high_tech_company: bool = None,
                      is_tax_a_level: bool = None,
                      is_general_taxpayer: bool = None,
                      has_bid: bool = None) -> int:
    """
    Return the number of companies matching the given filters.

    All filters are optional; a filter left as ``None`` is omitted from the
    request. Range filters are lists of ``(low, high)`` tuples.

    area_code: region
    category: industry classification
    reg_capital_range: registered capital range (in units of 10,000 yuan)
    establish_time_range: establishment time range (milliseconds)
    reg_status: registration status
    capital_unit: capital type
    company_type: company type
    institution_type: institution type
    staff_num_range: range of insured employees (persons)
    financing_round: financing round
    listed_type: listing type
    has_phone: has contact information
    has_mobile: has a mobile number
    has_email: has an email address
    has_brand: has a trademark
    has_dishonest: has a dishonesty record
    has_website: has a website
    has_chattel_mortage: has a chattel mortgage
    has_copyright: has a work copyright
    has_soft_copyright: has a software copyright
    is_high_tech_company: is a high-tech enterprise
    is_tax_a_level: has tax rating A
    is_general_taxpayer: is a general taxpayer
    has_bid: has bidding records

    Raises ``QdataError(ErrorCode.TYC_COMPANY_COUNT_FAIL)`` when the request
    fails or the response does not contain a usable ``data.realTotal``.
    """

    def _flatten(ranges):
        # [(lo, hi), ...] -> [lo, hi, ...]; None / empty input is passed
        # through unchanged so that None is still omitted from the request.
        if not ranges:
            return ranges
        return [num for num_tuple in ranges for num in num_tuple]

    query = {
        "areaCodeSet": area_code,
        "categoryGuobiao2017Set": category,
        "regCapitalRangeSet": _flatten(reg_capital_range),
        "establishTimeRangeSet": _flatten(establish_time_range),
        "regStatusSet": reg_status,
        "capitalUnitSet": capital_unit,
        "companyTypeSet": company_type,
        "institutionTypeSet": institution_type,
        "staffNumRangeSet": _flatten(staff_num_range),
        "financingRoundList": financing_round,
        "listedTypeSet": listed_type,
        "hasPhone": has_phone,
        "hasMobile": has_mobile,
        "hasEmail": has_email,
        "hasBrand": has_brand,
        "hasDishonest": has_dishonest,
        "hasWebSite": has_website,
        "hasChattelMortage": has_chattel_mortage,
        "hasCopyright": has_copyright,
        "hasSoftCopyright": has_soft_copyright,
        "isHighTechCompany": is_high_tech_company,
        "taxLevel": is_tax_a_level,
        "isGeneralTaxpayer": is_general_taxpayer,
        "hasBid": has_bid
    }

    final_query = {"searchType": 2}
    for key, value in query.items():
        if value is None:
            continue
        # Boolean flags are sent as the strings "1" / "0".
        final_query[key] = str(int(value)) if isinstance(value, bool) else value

    url = "https://capi.tianyancha.com/cloud-tempest/advance"
    try:
        resp = requests.post(url, json=final_query, headers=headers)
    except Exception as err:
        raise QdataError(ErrorCode.TYC_COMPANY_COUNT_FAIL) from err

    try:
        resp_data = json.loads(resp.text)
        return int(resp_data['data']['realTotal'])
    except (ValueError, KeyError, TypeError) as err:
        # Non-JSON body, missing keys or a null payload: report the same
        # domain error instead of leaking a parsing exception.
        raise QdataError(ErrorCode.TYC_COMPANY_COUNT_FAIL) from err