Exemplo n.º 1
0
    def logic_detail(self, tag, status):
        """
        React to the outcome of a search request.

        :param tag: kind of search that was issued (name search, search with
                    extra conditions, pagination request); not read by this
                    method
        :param status: search outcome. A digit-only value means a normal
                       response, ``False`` means the account hit a captcha,
                       ``True`` means no results were found, ``None`` means a
                       network problem or another unknown failure
        :return: always ``None``; the outcome is recorded through database
                 updates and log messages only
        """
        if str(status).isdigit():
            # Normal response: write flag 10 / usable 0 on the account record.
            healthy_state = {'$set': {"flag": 10, 'usable': 0}}
            account_results_coll.update_one(
                {'_id': self.account_info['account_id']}, healthy_state)
            return

        if status is not False:
            # True, None and any other value need no handling here.
            return

        # The account hit a captcha: try to solve it, then wait for the page
        # to come back.
        self.log.info('需要打码...')
        self.image_handle()
        failure_markers = ('我们只是确认一下你不是机器人', '登录/注册')
        while True:
            try:
                waiter = WebDriverWait(self.driver, 30, 0.5)
                waiter.until(
                    EC.presence_of_element_located(
                        (By.ID, "header-company-search")))
            except Exception:
                # The search box did not show up. Keep waiting unless the
                # page still shows the captcha or the login prompt.
                if not any(marker in self.driver.page_source
                           for marker in failure_markers):
                    continue
                # Captcha solving failed: write usable 1 on the account.
                stamp = str(int(time.time() * 1000))
                account_results_coll.update_one(
                    {'_id': self.account_info['account_id']},
                    {'$set': {"usable": 1, 'updateTime': stamp}})
                self.log.error('账号不可用')
                return
            else:
                # The search box is present again, so the captcha was passed.
                return
    def parse_and_get_list_company(self, log):
        """
        Parse the company cards shown on the current result-list page.

        CSS class names on the page change, so cards are located by tag and
        attribute patterns instead of by class.

        :param log: logger used to report parsing problems
        :return: list of dicts, one per parsed card, with the keys
                 ``companyName``, ``companyUrl``, ``businessState``,
                 ``companyProvince`` and, when present on the card,
                 ``manUrl``, ``legalMan``, ``representMan``, ``chargeMan``,
                 ``registerMoney``, ``registerTime``, ``companyTel`` and
                 ``companyEmail`` (the last two are lists); ``False`` when no
                 card could be parsed; ``None`` when the page itself could not
                 be parsed (the account record is then updated with flag 0)
        """
        try:
            # Parse the page and collect the outer block of every company.
            html = B(self.driver.page_source, 'html.parser')
            div_lists = html.find_all(
                'div', attrs={'data-id': re.compile(r'\d+')})
        except Exception as e:
            log.error('找不到所有的公司外层的模块{}'.format(e))
            account_results_coll.update_one({'_id': self.account_id},
                                            {'$set': {
                                                "flag": 0
                                            }})
            return None

        # Page label -> output key. Built once instead of once per card.
        field_names = {
            '法定代表人': 'legalMan',
            '代表人': 'representMan',
            '负责人': 'chargeMan',
            '注册资本': 'registerMoney',
            '资本总额': 'registerMoney',
            '注册时间': 'registerTime',
            '联系电话': 'companyTel',
            '邮箱': 'companyEmail',
        }
        person_labels = ('法定代表人', '代表人', '负责人')
        list_labels = ('联系电话', '邮箱')
        # Raw strings keep the backslashes literal for the regex engine; the
        # dots of the host name are escaped so they only match a real dot.
        company_link_re = re.compile(
            r'https://www\.tianyancha\.com/company/\d+')
        bracket_list_re = re.compile(r'.*\[(.*)\].*')

        items = []
        for div in div_lists:
            dic = {}
            try:
                anchor = div.find('a', attrs={'href': company_link_re})
                # Company name and company URL; a card without them is skipped.
                dic['companyName'] = anchor.get_text(strip=True)
                dic['companyUrl'] = anchor.attrs['href']
            except (AttributeError, KeyError) as e:
                log.error(e)
                continue

            # Business state: the node right after the company link.
            try:
                dic['businessState'] = anchor.next_sibling.get_text(strip=True)
            except AttributeError:
                dic['businessState'] = ''

            # Province of the company.
            try:
                dic['companyProvince'] = div.contents[3].get_text(strip=True)
            except (IndexError, AttributeError):
                dic['companyProvince'] = ''

            # Representative / capital / registration date / phone / e-mail.
            # A card with an unexpected layout keeps its basic fields instead
            # of aborting the parse of the whole page.
            try:
                detail_tags = div.contents[2].contents[1:-1]
            except (IndexError, AttributeError) as e:
                log.error('解析公司详情模块失败: {}'.format(e))
                detail_tags = []

            data = []
            for tag in detail_tags:
                try:
                    texts = [child.get_text(strip=True)
                             for child in tag.contents]
                except AttributeError:
                    # Node without children or without text support.
                    continue
                data.extend(texts)

            # Split every "label:value" text and store the known labels.
            for text in data:
                parts = text.split(":")
                if len(parts) != 2:
                    continue
                key, value = parts
                field = field_names.get(key)
                if field is None:
                    continue

                if key in person_labels:
                    # Link to the representative's own page, when the card
                    # has an anchor whose title equals the name.
                    try:
                        dic['manUrl'] = div.find(
                            'a', attrs={'title': value}).attrs['href']
                    except (AttributeError, KeyError):
                        pass

                if key in list_labels:
                    # Several phones/e-mails come as a bracketed,
                    # comma-separated list; a single value has no brackets.
                    match = bracket_list_re.search(value.replace('"', ''))
                    dic[field] = (match.group(1).split(',')
                                  if match else [value])
                else:
                    dic[field] = value
            items.append(dic)

        return items or False
    def handle_question(self, log):
        """
        Inspect the current page and react to the state the session is in.

        The states are probed in this order; the first one that matches is
        handled:

        1. home-page search box present: the account record gets flag 1 and
           ``self.flag`` is set to ``False``;
        2. login form input present: the account record gets usable 1;
        3. result-count banner with a non-zero count: the list page is parsed
           through ``self.parse_company_pages`` and ``self.check_count`` is
           reset;
        4. ``container``/``content`` block present: the account record gets
           flag 1, ``self.flag`` is set to ``True`` and ``self.login`` is
           called;
        5. the 100/5000-company limit text is in the page: the account record
           gets flag 0;
        6. the "no results" message is shown: the name record is updated and
           ``self.check_count`` is incremented; at 4 or more the account
           record gets flag 0 and ``self.login`` is called;
        7. otherwise the page is checked for the ban message, which is handled
           like the end of step 6.

        :param log: logger used for diagnostics
        :return: ``None``
        """
        try:
            page_source = self.driver.page_source
            # Fixed: the original `('proxy' or '503' or '500') in page_source`
            # only ever tested 'proxy', because `or` yields its first truthy
            # operand.
            # NOTE(review): '500' and '503' are plain substrings and can also
            # occur in ordinary page content - confirm they are specific
            # enough for this refresh.
            if any(marker in page_source
                   for marker in ('proxy', '503', '500')):
                self.driver.refresh()
                sleep(3)
            # Reaching the home page means the login succeeded.
            home_pages = self.driver.find_element_by_xpath(
                "//input[@id='home-main-search']")
            if home_pages:
                # Mark the account ahead of time.
                account_results_coll.update_one({'_id': self.account_id},
                                                {'$set': {
                                                    "flag": 1
                                                }})
                self.flag = False
                return
        except Exception:
            # A login-form element means the account is unusable: login failed.
            try:
                no_use = self.driver.find_element_by_xpath(
                    "//div[@class='pb30 position-rel']/input")
                if no_use:
                    # Mark as unusable so it is not picked again.
                    account_results_coll.update_one({'_id': self.account_id},
                                                    {'$set': {
                                                        "usable": 1
                                                    }})
                    return
            except Exception:
                # Not the login page: read the result count of the list page.
                try:
                    tmp = self.driver.find_element_by_xpath(
                        '//span[contains(text(),"天眼查为你找到")]/..').text
                    match = int(re.search(r'天眼查为你找到(\d+)家公司', tmp).group(1))
                except Exception as e:
                    log.error(e)
                    match = 0
                if match:
                    self.parse_company_pages(log)
                    self.check_count = 0
                # No count found: this is not the list page.
                else:
                    try:
                        self.driver.find_element_by_xpath(
                            "//div[@class='container']//div[@class='content']")
                        # Mark ahead of time that a captcha is required.
                        updateTime = str(int(time.time() * 1000))
                        account_results_coll.update_one(
                            {'_id': self.account_id},
                            {'$set': {
                                "flag": 1,
                                'updateTime': updateTime
                            }})
                        self.flag = True
                        self.login(log)
                    # Search scope too large, or account temporarily unusable.
                    except Exception:
                        if '普通用户可查看100家公司,VIP会员可查看5000家公司' in self.driver.page_source:
                            # Mark the account ahead of time.
                            account_results_coll.update_one(
                                {'_id': self.account_id},
                                {'$set': {
                                    "flag": 0
                                }})
                        else:
                            try:
                                error_str = self.driver.find_element_by_xpath(
                                    "//div[@class='f24 mb40 mt40 sec-c1 ']"
                                ).text
                                if error_str == "抱歉,没有找到相关结果!":
                                    log.info("[ERROR]: 抱歉,没有找到相关结果!")
                                    # Record that this name produced nothing
                                    # so the next name can be searched.
                                    name_results_coll.update_one(
                                        {'_id': self._id}, {
                                            '$set': {
                                                "name_num": 0,
                                                "company_numm": 0,
                                                'flag': 4
                                            }
                                        })
                                    self.check_count += 1
                                    # Several empty results in a row: reset
                                    # the account flag and log in again.
                                    if self.check_count >= 4:
                                        updateTime = str(
                                            int(time.time() * 1000))
                                        account_results_coll.update_one(
                                            {'_id': self.account_id}, {
                                                '$set': {
                                                    "flag": 0,
                                                    'updateTime': updateTime
                                                }
                                            })
                                        self.flag = True
                                        self.login(log)
                            except Exception:
                                # Match the ban message.
                                # NOTE(review): this lookup is not guarded, so
                                # a missing element raises to the caller.
                                error_info = self.driver.find_element_by_xpath(
                                    "/html/body/div/div[1]").text
                                if error_info == "系统检测到您非人类行为,己被禁止访问天眼查,若有疑问请联系官方qq群 515982002":
                                    log.info("[ERROR]: {}".format(error_info))
                                    updateTime = str(int(time.time() * 1000))
                                    account_results_coll.update_one(
                                        {'_id': self.account_id}, {
                                            '$set': {
                                                "flag": 0,
                                                'updateTime': updateTime
                                            }
                                        })
                                    self.flag = True
                                    self.login(log)
Exemplo n.º 4
0
    def login(self):
        """
        Log in, retrying until ``self.check_question('login')`` is truthy.

        Each attempt opens the login page and either restores cookies from
        redis or submits the account name and password through JavaScript.

        * If the page reports wrong credentials or shows the robot check, the
          current account record gets usable 1 and a replacement account is
          taken from redis for the next attempt.
        * If anything raises, the driver is closed, the proxy is switched and
          a new driver is started before retrying.
        * Otherwise the login check result is written back: flag 1 when it is
          truthy, usable 1 when it is not.

        :return: ``None``
        """
        # Stays falsy until a login attempt is confirmed.
        status = False
        while not status:
            try:
                # Set the window size.
                self.driver.set_window_size(900, 1000)
                self.driver.get(self.login_url)
                sleep(2)
                self.log.info('正在使用账号: {}'.format(
                    self.account_info.get('account_name')))
                cookies = self.get_cookie_from_redis()
                if cookies:
                    self.driver.delete_all_cookies()
                    for cookie in cookies:
                        self.driver.add_cookie(cookie)
                    self.driver.get("https://www.tianyancha.com/")
                else:
                    # Switch to the password login tab.
                    self.driver.find_element_by_xpath(
                        "//div[@onclick='changeCurrent(1);']").click()
                    # Fill in the credentials through JavaScript. They are
                    # passed as script arguments instead of being formatted
                    # into the script text, so quotes or backslashes in a
                    # password cannot break or alter the script.
                    js_str = (
                        'document.querySelector(".mobile_box .contactphone")'
                        '.value = arguments[0];'
                        'document.querySelector(".mobile_box .contactword")'
                        '.value = arguments[1];'
                        "document.querySelectorAll("
                        "'.btn.-hg.btn-primary.-block')[2].click()")
                    self.driver.execute_script(
                        js_str,
                        str(self.account_info['account_name']),
                        str(self.account_info['password']))
                page_source = self.driver.page_source
                if ('手机号码密码错误' in page_source
                        or '我们只是确认一下你不是机器人' in page_source):
                    # Wrong credentials or robot check: retire this account.
                    updateTime = str(int(time.time() * 1000))
                    account_results_coll.update_one(
                        {'_id': self.account_info['account_id']},
                        {'$set': {
                            "usable": 1,
                            'updateTime': updateTime
                        }})
                    # Take a replacement account and retry with it. The
                    # write-back at the end of the loop is skipped on purpose:
                    # it would mark the new, untried account as unusable.
                    self.account_info = self.get_account_from_redis()
                    status = False
                    continue
                # Check whether the login succeeded.
                status = self.check_question('login')
            except Exception as e:
                self.log.error("页面加载超时, ERROR:{}".format(e))
                try:
                    self.driver.refresh()
                except Exception as refresh_error:
                    self.log.error(refresh_error)
                # Treat it as a network problem or timeout: replace the proxy
                # and restart the driver. Closing is best-effort so that a
                # dead session cannot abort the recovery.
                for shutdown in (self.driver.close, self.driver.quit):
                    try:
                        shutdown()
                    except Exception as shutdown_error:
                        self.log.error(shutdown_error)
                # Switch proxy.
                self.switch_ip()
                # Start a new driver.
                self.chrome_driver()
                sleep(5)
                status = False
                continue

            # Write the outcome back: flag 1 when the login check passed,
            # otherwise usable 1.
            condition = {'_id': self.account_info['account_id']}
            update_tag = 'account'
            if status:
                query = {'$set': {'flag': 1}}
            else:
                query = {'$set': {'usable': 1}}
            self.update_database_status(update_tag, condition, query)