def get_contributive_info_detail(self, session, data, text):
        """Fetch the detail page behind every ``lookAjaxInfo(...)`` link in *text*.

        Every ``<a>`` element in the HTML whose ``onclick`` attribute contains a
        ``lookAjaxInfo('<ent_id>','<chr_id>','<dateflag>')`` call triggers one
        POST to the ``gdczDetail`` endpoint. The outcome of each POST is stored
        on *data* through ``self.append_model`` as a ``Model.contributive_info``
        entry classified as ``Model.type_detail``.

        :param session: session whose ``post`` is handed to ``self.task_request``.
        :param data: model container handed to ``self.append_model``.
        :param text: HTML document to scan for links.
        :return: None
        """
        url = 'http://{host}/gjjbjTab/gjjTabQueryCreditAction!gdczDetail.dhtml'.format(host=self.host)
        # Raw string: a plain literal containing "\(" is an invalid escape
        # sequence, which newer Python versions warn about. The pattern value
        # itself is unchanged.
        regex = re.compile(r"lookAjaxInfo\('(.*?)','(.*?)','(.*?)'\)")
        for a_item in PyQuery(text, parser='html').find('a').items():
            onclick = a_item.attr('onclick')
            if not onclick:
                continue

            # Only the first lookAjaxInfo(...) call of an onclick is used.
            match = regex.search(onclick)
            if match is None:
                continue

            ent_id, chr_id, date_flag = match.groups()
            post_data = {
                'ent_id': ent_id,
                'chr_id': chr_id,
                'ajax': True,
                'time': util.get_time_stamp(),
                'dateflag': date_flag,
            }
            r = self.task_request(session, session.post, url, data=post_data)
            # NOTE(review): a failed request is stored with empty text but
            # without any failure status - confirm this is intended.
            self.append_model(data, Model.contributive_info, url,
                              '' if r is None else r.text,
                              post_data=post_data, classify=Model.type_detail)
Exemplo n.º 2
0
    def get_annual_info(self, session, i_d, data):
        """Fetch the annual-report index of a company and crawl each report.

        The ``Annuals`` endpoint is queried for *i_d*; for every entry of the
        response's ``data`` list that has both an ``ancheId`` and a ``year``
        containing at least one digit run, ``self.get_annual_detail_info`` is
        called with the first digit run as the year.

        Nothing is done when the request fails, the body cannot be parsed by
        ``util.json_loads`` or the ``data`` key is absent.

        :param session: session whose ``get`` is handed to ``self.task_request``.
        :param i_d: company identifier inserted into the URL.
        :param data: model container forwarded to ``get_annual_detail_info``.
        :return: None
        """
        url = 'http://{host}/api/PubAnnualInfo/Annuals/{id}?_={rand}'.format(
            host=self.host, id=i_d, rand=util.get_time_stamp())
        r = self.task_request(session, session.get, url)
        if r is None:
            return

        json_data = util.json_loads(r.text)
        if json_data is None:
            return

        data_list = json_data.get('data', None)
        if data_list is None:
            return

        # Raw string (a plain '\d' literal is an invalid escape sequence),
        # compiled once instead of on every iteration.
        year_regex = re.compile(r'\d+')
        for item in data_list:
            anche_id = item.get('ancheId', None)
            year_info = item.get('year', None)
            if anche_id is None or year_info is None:
                continue

            year_match = year_regex.search(year_info)
            if year_match is None:
                continue

            # Fetch the detailed annual report for this year.
            self.get_annual_detail_info(session, i_d, anche_id, year_match.group(0), data)
Exemplo n.º 3
0
 def get_base_info(self, session, i_d):
     """Request the basic business information of company *i_d*.

     :return: ``(url, response_text)`` on success, ``None`` when
         ``self.task_request`` yields no response.
     """
     base_url = 'http://{host}/api/PubBaseInfo/Business/{id}?_={rand}'.format(
         host=self.host, id=i_d, rand=util.get_time_stamp())
     response = self.task_request(session, session.get, base_url)
     if response is not None:
         return base_url, response.text
     return None
    def get_contributive_info_detail(self, session, text, data):
        """Fetch the three detail listings for every investor found in *text*.

        *text* is parsed with ``util.json_loads``; the ``list`` entry of its
        first element supplies the investors. For each investor that has an
        ``invid`` the three detail endpoints below are requested in order and
        every successful response is stored via ``self.append_model`` as a
        ``Model.contributive_info`` detail. Failed requests are skipped
        silently. Any exception is logged with ``self.log.exception`` and
        swallowed.

        :return: None
        """
        detail_templates = (
            'http://{host}/gsxt/api/einv/gdxx/{invid}?currentpage=1&pagesize=5&t={rand}',
            'http://{host}/gsxt/api/einvpaidin/queryList/{invid}?currentpage=1&pagesize=5&t={rand}',
            'http://{host}/gsxt/api/efactcontribution/queryList/{invid}?currentpage=1&pagesize=5&t={rand}',
        )
        try:
            parsed = util.json_loads(text)
            if parsed is None:
                return

            investors = parsed[0].get('list', None)
            if investors is None:
                return

            for investor in investors:
                invid = investor.get('invid', None)
                if invid is None:
                    continue

                for template in detail_templates:
                    detail_url = template.format(
                        host=self.host, invid=invid, rand=util.get_time_stamp())
                    response = self.task_request(session, session.get, detail_url)
                    if response is None:
                        continue
                    self.append_model(data,
                                      Model.contributive_info,
                                      detail_url,
                                      response.text,
                                      classify=Model.type_detail)
        except Exception as e:
            self.log.exception(e)
Exemplo n.º 5
0
    def get_search_list_html(self, keyword, session):
        """Search for *keyword* and collect the parameters of every result entry.

        The search page is driven through ``self.get_captcha_geetest``. Every
        ``.result_item`` element whose ``onclick`` holds an
        ``openView('<pripid>','<enttype>','<zt>','<type>')`` call and whose
        ``#mySpan`` title is non-empty (after removing spaces) yields one
        parameter dict.

        :param keyword: search term handed to ``self.get_captcha_geetest``.
        :param session: handed to ``self.get_captcha_geetest`` as
            ``origin_session``.
        :return: ``(param_list, status)``. The status is
            ``self.SEARCH_NOTHING_FIND`` when the page reports no results,
            ``self.SEARCH_SUCCESS`` when at least one entry was collected, and
            ``self.SEARCH_ERROR`` otherwise (no page content, an exception, or
            no parsable entry).
        """
        param_list = []
        try:
            url = 'http://{host}/ztxy.do?method=index&random={rand}'.format(
                host=self.host, rand=util.get_time_stamp())
            content = self.get_captcha_geetest(url, '#entname', '#popup-submit', keyword,
                                               'p.result_desc', origin_session=session)
            if content is None:
                return param_list, self.SEARCH_ERROR

            jq = PyQuery(content, parser='html')
            if '您搜索的条件无查询结果' in jq.find('p.result_desc').text():
                return param_list, self.SEARCH_NOTHING_FIND

            # onclick looks like openView('6100000000020342','11','K','1'),
            # i.e. openView(pripid, enttype, zt, type).
            # Raw string: a plain literal containing "\(" is an invalid escape
            # sequence; the pattern value is unchanged.
            regex = re.compile(r"openView\('(.*?)','(.*?)','(.*?)','(.*?)'\)")
            for item in jq.find('.result_item').items():
                onclick = item.attr('onclick')
                if not onclick:
                    continue

                # Only the first openView(...) call of an onclick is used.
                match = regex.search(onclick)
                if match is None:
                    continue

                company = item.find('#mySpan').attr('title')
                if not company:
                    continue

                search_name = company.replace(' ', '')
                if not search_name:
                    continue

                # Prefer the ".diaoxiao" status label, fall back to ".cunxu".
                status = (item.find('.status.diaoxiao').text()
                          or item.find('.status.cunxu').text())

                pripid, enttype, zt, view_type = match.groups()
                entry = {
                    'pripid': pripid,
                    'enttype': enttype,
                    'zt': zt,
                    'type': view_type,
                    'search_name': search_name,
                }
                if status:
                    entry['status'] = status

                param_list.append(entry)
        except Exception as e:
            self.log.exception(e)
            return param_list, self.SEARCH_ERROR

        # An empty result list at this point is treated as an error.
        return param_list, (self.SEARCH_SUCCESS if param_list else self.SEARCH_ERROR)
Exemplo n.º 6
0
    def get_branch_info(self, session, pri_pid, data):
        """Request the branch listing of *pri_pid* and store the outcome on *data*.

        A missing response is stored with empty text and
        ``status=self.STATUS_FAIL``; otherwise the response text is stored.

        :return: None
        """
        branch_url = 'http://{host}/ztxy.do?method=showAllfzjg&maent.pripid={pripid}&random={rand}'.format(
            host=self.host, pripid=pri_pid, rand=util.get_time_stamp())

        response = self.task_request(session, session.get, branch_url)
        if response is not None:
            self.append_model(data, Model.branch_info, branch_url, response.text)
            return

        self.append_model(data, Model.branch_info, branch_url, '', status=self.STATUS_FAIL)
Exemplo n.º 7
0
    def get_annual_detail_info(self, session, i_d, anche_id, year, data):
        """Request every section of one annual report.

        Eight ``PubAnnualInfo`` endpoints are visited in a fixed order; each
        URL is handed to the private ``__get_annual_detail_info`` helper
        together with *year* and *data*. The first five URLs carry no
        cache-busting parameter, the last three append ``util.get_time_stamp()``.

        :return: None
        """
        plain_templates = (
            'http://{host}/api/PubAnnualInfo/Annual/{id}/{anche_id}',
            'http://{host}/api/PubAnnualInfo/AnWebSites/{id}/{anche_id}',
            'http://{host}/api/PubAnnualInfo/AnForInvestments/{id}/{anche_id}',
            'http://{host}/api/PubAnnualInfo/AnAsset/{id}/{anche_id}',
            'http://{host}/api/PubAnnualInfo/AnUpdates/{id}/{anche_id}',
        )
        for template in plain_templates:
            section_url = template.format(host=self.host, id=i_d, anche_id=anche_id)
            self.__get_annual_detail_info(session, section_url, year, data)

        stamped_templates = (
            'http://{host}/api/PubAnnualInfo/AnSubCapitals/{id}/{anche_id}?_={rand}',
            'http://{host}/api/PubAnnualInfo/AnForGuarantees/{id}/{anche_id}?_={rand}',
            'http://{host}/api/PubAnnualInfo/AnAlterStocks/{id}/{anche_id}?_={rand}',
        )
        for template in stamped_templates:
            # The timestamp is taken right before each request, one per URL.
            section_url = template.format(
                host=self.host,
                id=i_d,
                anche_id=anche_id,
                rand=util.get_time_stamp())
            self.__get_annual_detail_info(session, section_url, year, data)
Exemplo n.º 8
0
    def get_shareholder_info(self, session, i_d, data):
        """Request the ``InvDetails`` listing of *i_d* and store the outcome on *data*.

        A missing response is stored with empty text and
        ``status=self.STATUS_FAIL``; otherwise the response text is stored.

        :return: None
        """
        shareholder_url = 'http://{host}/api/PubSelfPubInfo/InvDetails/{id}?_={rand}'.format(
            host=self.host, id=i_d, rand=util.get_time_stamp())
        response = self.task_request(session, session.get, shareholder_url)
        if response is not None:
            # Store the fetched payload.
            self.append_model(data, Model.shareholder_info, shareholder_url, response.text)
            return

        self.append_model(data, Model.shareholder_info, shareholder_url, '',
                          status=self.STATUS_FAIL)
Exemplo n.º 9
0
    def get_change_info(self, session, i_d, data):
        """Request the ``BaseInfoAlters`` listing of *i_d* and store the outcome on *data*.

        A missing response is stored with empty text and
        ``status=self.STATUS_FAIL``; otherwise the response text is stored.

        :return: None
        """
        change_url = 'http://{host}/api/PubBaseInfo/BaseInfoAlters/{id}?_={rand}'.format(
            host=self.host, id=i_d, rand=util.get_time_stamp())
        response = self.task_request(session, session.get, change_url)
        if response is not None:
            # Store the fetched payload.
            self.append_model(data, Model.change_info, change_url, response.text)
            return

        self.append_model(data, Model.change_info, change_url, '',
                          status=self.STATUS_FAIL)
Exemplo n.º 10
0
    def get_contributive_info(self, session, i_d, data):
        """Fetch the investor listing of *i_d* plus the detail page of each investor.

        The listing response is stored on *data* as ``Model.contributive_info``.
        It is then parsed with ``util.json_loads``, and for every entry of its
        ``data`` list that has an ``invId`` the ``InvDetail`` endpoint is
        requested and stored as a ``Model.type_detail`` entry. Failed requests
        are stored with empty text and ``status=self.STATUS_FAIL``.

        :return: None
        """
        list_url = 'http://{host}/api/PubBaseInfo/Invs/{id}?_={rand}'.format(
            host=self.host, id=i_d, rand=util.get_time_stamp())
        listing = self.task_request(session, session.get, list_url)
        if listing is None:
            self.append_model(data, Model.contributive_info, list_url, '',
                              status=self.STATUS_FAIL)
            return

        # Store the raw listing before attempting to parse it.
        self.append_model(data, Model.contributive_info, list_url, listing.text)

        parsed = util.json_loads(listing.text)
        if parsed is None:
            # NOTE(review): the same payload is stored a second time here,
            # now flagged as failed - confirm the duplicate entry is intended.
            self.append_model(data, Model.contributive_info, list_url, listing.text,
                              status=self.STATUS_FAIL)
            return

        investors = parsed.get('data', None)
        if investors is None:
            return

        for investor in investors:
            inv_id = investor.get('invId', None)
            if inv_id is None:
                continue

            detail_url = 'http://{host}/api/PubBaseInfo/InvDetail/{id}/{invid}'.format(
                host=self.host, id=i_d, invid=inv_id)
            detail = self.task_request(session, session.get, detail_url)
            if detail is not None:
                self.append_model(data, Model.contributive_info, detail_url, detail.text,
                                  classify=Model.type_detail)
            else:
                self.append_model(data, Model.contributive_info, detail_url, '',
                                  status=self.STATUS_FAIL,
                                  classify=Model.type_detail)
Exemplo n.º 11
0
    def get_annual_info(self, session, text, data):
        """Fetch the annual-report page behind every ``showNbDetail(...)`` call in *text*.

        Each ``showNbDetail('<pripid>','<year>');`` occurrence triggers one GET
        of the ``qyinfo_nnbxx`` page. The outcome is stored on *data* via
        ``self.append_model`` as a ``Model.annual_info`` detail tagged with the
        year; a failed request is stored with empty text and
        ``status=self.STATUS_FAIL``. Nothing happens when *text* contains no
        such call.

        :param session: session whose ``get`` is handed to ``self.task_request``.
        :param text: page source to scan.
        :param data: model container handed to ``self.append_model``.
        :return: None
        """
        # Raw string: a plain literal containing "\(" is an invalid escape
        # sequence; the pattern value is unchanged.
        pattern = r"showNbDetail\('(.*?)','(.*?)'\);"
        for pripid, year in re.findall(pattern, text):
            url = 'http://{host}/ztxy.do?method=qyinfo_nnbxx&pripid={pripid}&nd={year}&random={rand}'.format(
                host=self.host, pripid=pripid, year=year, rand=util.get_time_stamp())
            r = self.task_request(session, session.get, url)
            if r is None:
                self.append_model(data, Model.annual_info, url, '',
                                  status=self.STATUS_FAIL,
                                  year=year, classify=Model.type_detail)
                continue
            self.append_model(data, Model.annual_info, url, r.text,
                              year=year,
                              classify=Model.type_detail)
Exemplo n.º 12
0
    def get_contributive_info(self, session, base_text, data):
        """Fetch the investor detail page behind every ``showRyxx(...)`` call in *base_text*.

        Each ``showRyxx('<xh>','<pripid>','<isck>')`` occurrence triggers one
        GET of the ``frInfoDetail`` page. The outcome is stored on *data* via
        ``self.append_model`` as a ``Model.contributive_info`` detail; a failed
        request is stored with empty text and ``status=self.STATUS_FAIL``.
        Nothing happens when *base_text* contains no such call.

        :param session: session whose ``get`` is handed to ``self.task_request``.
        :param base_text: page source to scan.
        :param data: model container handed to ``self.append_model``.
        :return: None
        """
        # Raw string: a plain literal containing "\(" is an invalid escape
        # sequence; the pattern value is unchanged.
        pattern = r"showRyxx\('(.*?)','(.*?)','(.*?)'\)"
        for xh, pripid, isck in re.findall(pattern, base_text):
            url = 'http://{host}/ztxy.do?method=frInfoDetail&maent.xh={xh}&maent.pripid={pripid}&isck={issck}&random={rand}'.format(
                host=self.host, xh=xh, pripid=pripid, issck=isck, rand=util.get_time_stamp())
            r = self.task_request(session, session.get, url)
            if r is None:
                self.append_model(data, Model.contributive_info, url, '',
                                  status=self.STATUS_FAIL,
                                  classify=Model.type_detail)
                continue

            self.append_model(data, Model.contributive_info, url, r.text,
                              classify=Model.type_detail)
Exemplo n.º 13
0
    def get_detail_html_list(self, seed, session, param_list):
        """Crawl all detail sections for every search hit and forward the results.

        For each entry of *param_list* (needs ``href``, ``Referer`` and
        ``search_name``) the base page is fetched, the ``#priPID`` value and
        the company name are extracted, and a model is created with
        ``self.get_model``. Six further sections are then fetched in a fixed
        order, each preceded by a 0.5 s pause. If any section fetch returns a
        falsy value a warning is logged and the company is dropped. Exceptions
        raised while handling one entry are logged and the loop moves on.

        :param seed: forwarded to ``self.get_model``.
        :param session: session used for all requests; its ``headers`` are
            replaced for every entry.
        :param param_list: iterable of dicts produced by the search step.
        :return: whatever ``self.sent_to_target`` returns for the collected models.
        """
        collected = []
        for param in param_list:
            try:
                href = param.get('href', None)
                referer = param.get('Referer', None)
                if href is None or referer is None:
                    self.log.error('参数存储异常: item = {item}'.format(item=param))
                    continue

                url = 'http://{host}/client/entsearch/{href}'.format(host=self.host, href=href)
                search_name = param.get('search_name', None)
                if search_name is None:
                    self.log.error('参数错误: item = {item}'.format(item=param))
                    continue

                session.headers = {
                    "Host": self.host,
                    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 Firefox/50.0",
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                    "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
                    "Accept-Encoding": "gzip, deflate",
                    "Connection": "keep-alive",
                    "Referer": referer,
                }

                # Base information page.
                base_text = self.get_base_info(session, url)
                if base_text is None or base_text.strip() == '':
                    continue

                # A page without #priPID is not a valid detail page.
                pri_pid = PyQuery(base_text, parser='html').find('#priPID').attr('value')
                if pri_pid is None:
                    continue

                # Company name.
                company = self.__get_company_name(base_text)
                if company is None or company == '':
                    self.log.error('公司名称解析失败..item = {item} {text}'.format(
                        text=base_text, item=param))
                    continue

                # Build the data model for this company.
                data = self.get_model(company, seed, search_name, self.province)

                # All section URLs and payloads are prepared up front, in the
                # same order in which the sections are fetched below.
                contributive_url = 'http://{host}/midinv/list.json?_t={rand}'.format(
                    host=self.host, rand=util.get_time_stamp())
                contributive_data = {
                    'params[priPID]': pri_pid,
                    'start': '0',
                    'length': '1000'
                }
                member_url = 'http://{host}/midmember/list.json?_t={rand}'.format(
                    host=self.host, rand=util.get_time_stamp())
                member_data = {'priPID': pri_pid}
                branch_url = 'http://{host}/midbranch/list.json?_t={rand}'.format(
                    host=self.host, rand=util.get_time_stamp())
                branch_data = {'priPID': pri_pid}
                change_url = 'http://{host}/midaltitem/list.json?_t={rand}'.format(
                    host=self.host, rand=util.get_time_stamp())
                change_data = {
                    'params[priPID]': pri_pid,
                    'start': '0',
                    'length': '1000'
                }
                shareholder_url = 'http://{host}/im/pub/investalter/investmentListJSON?_t={rand}&pageNum=0&' \
                                  'priPID={priPID}&length={length}&params%5BpageNum%5D=0'. \
                    format(host=self.host, rand=util.get_time_stamp(), priPID=pri_pid, length='1000')
                annual_url = 'http://{host}/entinfo/list.json?_t={rand}'.format(
                    host=self.host, rand=util.get_time_stamp())
                annual_data = {
                    'params[priPID]': pri_pid,
                    'start': '0',
                    'length': '10'
                }

                # Store the base page.
                self.append_model(data, Model.base_info, url, base_text)

                # (warning template, fetch call) per section, in crawl order:
                # contributions, key persons, branches, changes, shareholders,
                # annual reports.
                steps = (
                    ('出资信息抓取失败....pripid = {pripid}',
                     lambda: self.get_contributive_info(session, contributive_url, contributive_data, data)),
                    ('主要人员抓取失败....pripid = {pripid}',
                     lambda: self.get_key_person_info(session, member_url, member_data, data)),
                    ('分支机构抓取失败....pripid = {pripid}',
                     lambda: self.get_branch_info(session, branch_url, branch_data, data)),
                    ('变更信息抓取失败....pripid = {pripid}',
                     lambda: self.get_change_info(session, change_url, change_data, data)),
                    ('股东信息抓取失败....pripid = {pripid}',
                     lambda: self.get_shareholder_info(session, shareholder_url, data)),
                    ('年报信息抓取失败....pripid = {pripid}',
                     lambda: self.get_annual_info(session, href, annual_url, annual_data, data)),
                )
                for warning, fetch in steps:
                    time.sleep(0.5)
                    if not fetch():
                        self.log.warn(warning.format(pripid=pri_pid))
                        break
                else:
                    # Only companies whose every section succeeded are kept.
                    collected.append(data)
            except Exception as e:
                self.log.exception(e)

        return self.sent_to_target(collected)
Exemplo n.º 14
0
    def get_annual_info(self, session, href, annual_url, data, total_data):
        """Fetch the annual-report index and every section of each listed report.

        The encrypted company id is cut out of *href* (between ``docId=`` and
        ``&classFlag``). The index is POSTed to *annual_url* with *data*; for
        each index entry that has both ``year`` and ``anCheID`` the base page
        and six JSON sections are requested in a fixed order. Every outcome is
        stored on *total_data* via ``self.append_model`` as a
        ``Model.annual_info`` detail tagged with the year.

        :param session: session whose ``get``/``post`` are handed to
            ``self.filter_request``.
        :param href: detail link containing the ``docId`` parameter.
        :param annual_url: URL of the annual-report index.
        :param data: POST payload for the index request.
        :param total_data: model container handed to ``self.append_model``.
        :return: ``False`` when the id cannot be extracted, the index request
            or its parsing fails, the ``data`` key is missing, or any section
            request fails (the failure is stored first); ``True`` otherwise.
        """
        encry_pri_pid = util.get_match_value('docId=', '&classFlag', href)
        if encry_pri_pid is None:
            return False

        listing = self.filter_request(session, session.post, url=annual_url, data=data)
        if listing is None:
            return False

        parsed = util.json_loads(listing.text)
        if parsed is None:
            return False

        reports = parsed.get('data')
        if reports is None:
            return False

        for report in reports:
            year = report.get('year')
            year_id = report.get('anCheID')
            if year is None or year_id is None:
                continue

            id_payload = {'anCheID': year_id}
            paged_payload = {'start': '0', 'length': '100', 'params[anCheID]': year_id}

            # Base information (asset status is part of this page as well).
            base_info_url = 'http://{host}/entinfo/yrinfo?year={year}&encryPriPID={encry_pri_pid}&classFlag=1'.format(
                host=self.host, year=year, encry_pri_pid=encry_pri_pid)
            response = self.filter_request(session, session.get, base_info_url)
            if response is None:
                self.append_model(total_data, Model.annual_info, base_info_url, '',
                                  status=self.STATUS_FAIL,
                                  year=year,
                                  classify=Model.type_detail)
                return False
            self.append_model(total_data, Model.annual_info, base_info_url, response.text,
                              year=year,
                              classify=Model.type_detail)

            # Remaining sections, in crawl order: website info, shareholder
            # info, outbound investment, guarantee info, equity changes,
            # amendment records.
            sections = (
                ('http://{host}/pub/WebsiteInfo/publist.json?_t={rand}', id_payload),
                ('http://{host}/pub/subcapitalInfo/publist.json?_t={rand}', paged_payload),
                ('http://{host}/pub/forinvestMentInfo/publist.json?_t={rand}', id_payload),
                ('http://{host}/pub/GuaranteeInfo/publist.json?_t={rand}', paged_payload),
                ('http://{host}/pub/alterStockInfo/publist.json?_t={rand}', paged_payload),
                ('http://{host}/pub/updateinfo/publist.json?_t={rand}', paged_payload),
            )
            for template, payload in sections:
                section_url = template.format(host=self.host, rand=util.get_time_stamp())
                response = self.filter_request(session, session.post, section_url,
                                               data=payload)
                if response is None:
                    # Record the failure, then abort the whole crawl.
                    self.append_model(total_data, Model.annual_info, section_url, '', post_data=payload,
                                      status=self.STATUS_FAIL,
                                      year=year,
                                      classify=Model.type_detail)
                    return False
                self.append_model(total_data, Model.annual_info, section_url, response.text, post_data=payload,
                                  year=year,
                                  classify=Model.type_detail)
        return True
Exemplo n.º 15
0
    def get_annual_info(self, session, pripid, pritype, data):
        # 年报信息
        annual_info_url = 'http://{host}/gsxt/api/anbaseindex/queryList/{pripid}/{pritype}?currentpage=1&pagesize=100&t={rand}'.format(
            host=self.host, pripid=pripid, pritype=pritype, rand=util.get_time_stamp())
        r = self.task_request(session, session.get, annual_info_url)
        if r is None:
            return None

        json_data = util.json_loads(r.text)
        if json_data is None:
            return None

        for item in json_data:
            nb_list = item.get('list', None)
            if nb_list is None:
                continue

            for nb_item in nb_list:
                anche_id = nb_item.get('ancheid', None)
                anche_year = nb_item.get('ancheyear', None)
                if anche_id is None:
                    continue
                if anche_year is None:
                    continue
                # 基本信息
                base_info_url = 'http://{host}/gsxt/api/anbaseinfo/queryForm/{ancheid}?currentpage=1&pagesize=100&t={rand}'.format(
                    host=self.host, ancheid=anche_id, rand=util.get_time_stamp())
                r = self.task_request(session, session.get, base_info_url)
                if r is not None:
                    self.append_model(data, Model.annual_info, base_info_url, r.text,
                                      year=anche_year,
                                      classify=Model.type_detail)
                else:
                    self.append_model(data, Model.annual_info, base_info_url, '',
                                      status=self.STATUS_FAIL,
                                      year=anche_year,
                                      classify=Model.type_detail)

                # 网站信息
                web_info_url = 'http://{host}/gsxt/api/anwebsiteinfo/queryList/{ancheid}/{pripid}/{pritype}?currentpage=1&pagesize=100&t={rand}'.format(
                    host=self.host, ancheid=anche_id, pritype=pritype, rand=util.get_time_stamp(), pripid=pripid)
                r = self.task_request(session, session.get, web_info_url)
                if r is not None:
                    self.append_model(data, Model.annual_info, web_info_url, r.text,
                                      year=anche_year,
                                      classify=Model.type_detail)
                else:
                    self.append_model(data, Model.annual_info, web_info_url, '',
                                      status=self.STATUS_FAIL,
                                      year=anche_year,
                                      classify=Model.type_detail)

                # 股东信息
                shareholder_info_url = 'http://{host}/gsxt/api/ansubcapital/queryList/{ancheid}?currentpage=1&pagesize=100&t={rand}'.format(
                    host=self.host, ancheid=anche_id, rand=util.get_time_stamp())
                r = self.task_request(session, session.get, shareholder_info_url)
                if r is not None:
                    self.append_model(data, Model.annual_info, shareholder_info_url, r.text,
                                      year=anche_year,
                                      classify=Model.type_detail)
                else:
                    self.append_model(data, Model.annual_info, shareholder_info_url, '',
                                      status=self.STATUS_FAIL,
                                      year=anche_year,
                                      classify=Model.type_detail)

                # 对外投资
                investment_info_url = 'http://{host}/gsxt/api/anforinvestment/queryList/{ancheid}?currentpage=1&pagesize=100&t={rand}'.format(
                    host=self.host, ancheid=anche_id, rand=util.get_time_stamp())
                r = self.task_request(session, session.get, investment_info_url)
                if r is not None:
                    self.append_model(data, Model.annual_info, investment_info_url, r.text,
                                      year=anche_year,
                                      classify=Model.type_detail)
                else:
                    self.append_model(data, Model.annual_info, investment_info_url, '',
                                      status=self.STATUS_FAIL,
                                      year=anche_year,
                                      classify=Model.type_detail)

                # 资产状况
                assets_info_url = 'http://{host}/gsxt/api/anbaseinfo/queryForm/{ancheid}?currentpage=1&pagesize=100&t={rand}'.format(
                    host=self.host, ancheid=anche_id, rand=util.get_time_stamp())
                r = self.task_request(session, session.get, assets_info_url)
                if r is not None:
                    self.append_model(data, Model.annual_info, assets_info_url, r.text,
                                      year=anche_year,
                                      classify=Model.type_detail)
                else:
                    self.append_model(data, Model.annual_info, assets_info_url, '',
                                      status=self.STATUS_FAIL,
                                      year=anche_year,
                                      classify=Model.type_detail)

                # 担保信息
                assurance_info_url = 'http://{host}/gsxt/api/anforguaranteeinfo/queryList/{ancheid}?currentpage=1&pagesize=100&t={rand}'.format(
                    host=self.host, ancheid=anche_id, rand=util.get_time_stamp())
                r = self.task_request(session, session.get, assurance_info_url)
                if r is not None:
                    self.append_model(data, Model.annual_info, assurance_info_url, r.text,
                                      year=anche_year,
                                      classify=Model.type_detail)
                else:
                    self.append_model(data, Model.annual_info, assurance_info_url, '',
                                      status=self.STATUS_FAIL,
                                      year=anche_year,
                                      classify=Model.type_detail)

                # 社保信息
                social_security_info_url = 'http://{host}/gsxt/api/ansocialinsuinfo/queryForm/{ancheid}?currentpage=1&pagesize=100&t={rand}'.format(
                    host=self.host, ancheid=anche_id, rand=util.get_time_stamp())
                r = self.task_request(session, session.get, social_security_info_url)
                if r is not None:
                    self.append_model(data, Model.annual_info, social_security_info_url, r.text,
                                      year=anche_year,
                                      classify=Model.type_detail)
                else:
                    self.append_model(data, Model.annual_info, social_security_info_url, '',
                                      status=self.STATUS_FAIL,
                                      year=anche_year,
                                      classify=Model.type_detail)

                # 股权变更
                change_info_url = 'http://{host}/gsxt/api/analterstockinfo/queryList/{ancheid}?currentpage=1&pagesize=100&t={rand}'.format(
                    host=self.host, ancheid=anche_id, rand=util.get_time_stamp())
                r = self.task_request(session, session.get, change_info_url)
                if r is not None:
                    self.append_model(data, Model.annual_info, change_info_url, r.text,
                                      year=anche_year,
                                      classify=Model.type_detail)
                else:
                    self.append_model(data, Model.annual_info, change_info_url, '',
                                      status=self.STATUS_FAIL,
                                      year=anche_year,
                                      classify=Model.type_detail)

                # 修改记录
                amendant_info_url = 'http://{host}/gsxt/api/anupdateinfo/queryList/{ancheid}?currentpage=1&pagesize=100&t={rand}'.format(
                    host=self.host, ancheid=anche_id, rand=util.get_time_stamp())
                r = self.task_request(session, session.get, amendant_info_url)
                if r is not None:
                    self.append_model(data, Model.annual_info, amendant_info_url, r.text,
                                      year=anche_year,
                                      classify=Model.type_detail)
                else:
                    self.append_model(data, Model.annual_info, amendant_info_url, '',
                                      status=self.STATUS_FAIL,
                                      year=anche_year,
                                      classify=Model.type_detail)
Exemplo n.º 16
0
    def get_detail_html_list(self, seed, session, param_list):
        """Fetch the detail pages for every entry of ``param_list`` and forward them.

        For each entry the base-info endpoint is requested first (with a
        fallback endpoint when the first answer is very short).  Once a company
        name has been parsed from it, a model is created via ``self.get_model``
        and the liquidation, change, shareholder, contribution, key-person and
        branch endpoints are requested in that order; every outcome (response
        text, or an empty string with ``self.STATUS_FAIL``) is recorded through
        ``self.append_model``.  Finally ``self.get_annual_info`` is invoked.

        Any exception raised while handling one entry is logged and that entry
        is dropped; processing continues with the next entry.

        Args:
            seed: passed through unchanged to ``self.get_model``.
            session: HTTP session object; its ``headers`` attribute is replaced
                and its ``get`` method is handed to ``self.task_request``.
            param_list: iterable of dicts; each must provide ``pripid``,
                ``pritype`` and ``search_name`` or it is skipped with an error
                log.

        Returns:
            The result of ``self.sent_to_target`` called with the list of
            models that were built successfully.
        """
        data_list = []
        session.headers = {
            'Host': self.host,
            'Connection': 'keep-alive',
            'Pragma': 'no-cache',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/55.0.2883.95 Safari/537.36',
            'Accept': 'application/json, text/plain, */*',
            'Cache-Control': 'no-cache',
            'X-Requested-With': 'XMLHttpRequest',
            'appkey': '8dc7959eeee2792ac2eebb490e60deed',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        }

        # Every endpoint used here shares this URL layout; only the API path
        # and the page size vary.
        url_template = ('http://{host}/gsxt/api/{path}/{pripid}/{pritype}'
                        '?currentpage=1&pagesize={pagesize}&t={rand}')

        def build_url(path, pri_pid, pri_type, pagesize=100):
            return url_template.format(
                host=self.host, path=path, pripid=pri_pid, pritype=pri_type,
                pagesize=pagesize, rand=util.get_time_stamp())

        for item in param_list:
            try:
                pri_pid = item.get('pripid', None)
                pri_type = item.get('pritype', None)
                if pri_pid is None or pri_type is None:
                    self.log.error('参数信息错误...item = {item}'.format(item=item))
                    continue

                search_name = item.get('search_name', None)
                if search_name is None:
                    self.log.error('参数错误: item = {item}'.format(item=item))
                    continue

                # Base info
                base_info_url = build_url('ebaseinfo/queryForm', pri_pid, pri_type, pagesize=5)
                base_info = self.task_request(session, session.get, base_info_url)
                if base_info is None:
                    continue

                if len(base_info.text) <= 15:
                    # A body this short is treated as "no data" (presumably an
                    # empty JSON payload -- TODO confirm); retry on the
                    # alternative base-info endpoint.
                    base_info_url = build_url('ebaseindex/queryForm', pri_pid, pri_type, pagesize=5)
                    base_info = self.task_request(session, session.get, base_info_url)
                    if base_info is None:
                        # There is no response object here, so its text must
                        # not be dereferenced when logging the failure.
                        self.log.info('基本信息抓取失败: pripid = {pripid} text = {text}'.format(
                            pripid=pri_pid, text=None))
                        continue

                company = self.get_copmany_name(base_info.text)
                if company == '' or company is None:
                    self.log.error('公司名称信息解析错误..pripid = {pripid} {text}'.format(
                        pripid=pri_pid, text=base_info.text))
                    continue

                # Build the data model
                data = self.get_model(company, seed, search_name, self.province)

                # All URLs are built before the first follow-up request is sent.
                change_info_url = build_url('ealterrecoder/queryList', pri_pid, pri_type)        # change records
                contributive_info_url = build_url('einv/gdjczxxList', pri_pid, pri_type)         # contributions
                key_person_info_url = build_url('epriperson/queryList', pri_pid, pri_type)       # key personnel
                branch_info_url = build_url('ebrchinfo/queryList', pri_pid, pri_type)            # branches
                liquidation_info_url = build_url('eliqmbrn/queryList', pri_pid, pri_type)        # liquidation
                shareholder_info_url = build_url('eiminvupdate/queryList', pri_pid, pri_type)    # shareholders

                # Store the base info that was already fetched
                self.append_model(data, Model.base_info, base_info_url, base_info.text)

                # Liquidation info
                self._fetch_and_store(session, data, Model.liquidation_info, liquidation_info_url)

                # Change records
                self._fetch_and_store(session, data, Model.change_info, change_info_url)

                # Shareholder info
                self._fetch_and_store(session, data, Model.shareholder_info, shareholder_info_url)

                # Contribution info, followed by its detail pages when the list was fetched
                contributive_info = self._fetch_and_store(
                    session, data, Model.contributive_info, contributive_info_url)
                if contributive_info is not None:
                    # NOTE(review): arguments are passed as (session, text, data);
                    # confirm this matches the parameter order of this class's
                    # get_contributive_info_detail.
                    self.get_contributive_info_detail(session, contributive_info.text, data)

                # Key personnel
                self._fetch_and_store(session, data, Model.key_person_info, key_person_info_url)

                # Branches
                self._fetch_and_store(session, data, Model.branch_info, branch_info_url)

                # Annual report info
                self.get_annual_info(session, pri_pid, pri_type, data)

                data_list.append(data)
            except Exception as e:
                # Best-effort crawl: one failing entry must not abort the batch.
                self.log.exception(e)

        return self.sent_to_target(data_list)

    def _fetch_and_store(self, session, data, model, url):
        """GET ``url`` and record the outcome on ``data`` under ``model``.

        On success the response text is appended; when ``self.task_request``
        returns ``None`` an empty string is appended with
        ``status=self.STATUS_FAIL``.

        Returns:
            The response object returned by ``self.task_request`` (may be ``None``).
        """
        response = self.task_request(session, session.get, url)
        if response is not None:
            self.append_model(data, model, url, response.text)
        else:
            self.append_model(data, model, url, '', status=self.STATUS_FAIL)
        return response