Пример #1
0
    def list_parser(self, processor_type, current_processor, response, passvalue):
        """Parse a list page, route its rows to the detail point and queue the next page.

        Args:
            processor_type: Key of the running processor; used in the error
                log and forwarded to the next-page router.
            current_processor: Configuration mapping of this processor.
            response: Downloaded page; a falsy value aborts parsing.
            passvalue: Values carried over from the previous level.

        Returns:
            The first element returned by ``_get_column_value_mapping``, or
            ``None`` when ``response`` is falsy.
        """
        if not response:
            logger.error('processor_type:{}, passvalue:{}'.format(processor_type, passvalue))
            return

        # Locate the data table; the selector is configurable with a default.
        selector = current_processor.get('table_selector') or 'table#list'
        records = self._get_data_table(response, selector, current_processor)
        rows = self._get_list_values(records, current_processor)

        # With `tr_select` configured, a single row of a different length
        # discards every row; otherwise the rows are kept untouched.
        if current_processor.get('tr_select'):
            length_matches = [len(row) == current_processor.get('tr_select') for row in rows]
            if not all(length_matches):
                rows = []

        # Map the row values onto the columns to be stored.
        list_infos, next_param_list = self._get_column_value_mapping(current_processor, rows, passvalue)

        # Drill down into the detail pages.
        if current_processor.get('ignore_detail_urls'):
            detail_urls = ['']
        else:
            detail_urls = self._get_detail_urls(records)

        # Either hand this response itself to the next level, or the detail URLs.
        if current_processor.get('next_response'):
            self._route_many_points_many_urls(current_processor.get('detail_point'),
                                              [''], passvalue, next_param_list, response)
        else:
            self._route_many_points_many_urls(current_processor.get('detail_point'),
                                              detail_urls, passvalue, next_param_list)

        # Queue the next page of the list.
        self._route_next_page(response, processor_type, passvalue)

        return list_infos
Пример #2
0
 def get_query_reason(self):
     """Return the two-digit code of the configured credit-query reason.

     Reads the organisation code from the ``orgcode`` environment variable,
     looks up its ``QUERY_REASON`` in ``RISK_MNIT_QUERY_RH_LIST`` and maps
     the reason text to a code.

     :return: ``'01'``..``'04'`` for a known reason; ``''`` when no row is
         fetched, the reason is not recognised, or the lookup raises.
     """
     # Reason text as stored in the table -> code handed back to the caller.
     reason_codes = {
         '贷前调查': '01',
         '贷中操作': '02',
         '贷后管理': '03',
         '关联查询': '04',
     }
     try:
         orgcode = os.getenv('orgcode')
         # NOTE(review): the statement is assembled by string formatting;
         # confirm `orgcode` can never carry quote characters.
         sql = "select QUERY_REASON from {0}.RISK_MNIT_QUERY_RH_LIST where MID_SIGN_CODE='{1}'".format(
             base_config.tabschema, orgcode)
         result = ibm_db.exec_immediate(self.conn, sql)
         orgdetail = ibm_db.fetch_assoc(result)
         # A bool result (instead of a row mapping) is treated as "no row".
         if isinstance(orgdetail, bool):
             return ''
         return reason_codes.get(orgdetail.get('QUERY_REASON'), '')
     except Exception as e:
         # Format eagerly: passing `e` as an extra argument without a
         # placeholder breaks message formatting in the logger.
         logger.error("查询原因获取失败error: {}".format(e))
         return ''
Пример #3
0
 def get_processer(self, processor_type, *args):
     """Run the handler configured for ``processor_type`` and queue its successors.

     The handler name comes from the ``function`` entry of the processor
     configuration and is looked up first on ``self.download``, then on
     ``self.output``; the first object that has it is used. The handler's
     result is put on ``self.queue`` once per configured ``next_processor``.

     Args:
         processor_type: Key into ``processor_config.processor``.
         *args: Extra positional arguments forwarded to the handler.

     Returns:
         None.
     """
     current_processor = processor_config.processor.get(processor_type)
     if not current_processor:
         # `args` is passed as one value: unpacking it here raised IndexError
         # for an empty tuple and dropped everything after the first item.
         logger.error(
             "i can't found the processor, processor_type:{}, args:{}".
             format(processor_type, args))
         return

     function_name = current_processor.get('function')
     for obj in (self.download, self.output):
         try:
             # Resolve the parsing/output function for the current type.
             handler = getattr(obj, function_name)
         except AttributeError:
             # This object does not provide the handler; try the next one.
             continue

         # Run the handler and hand its result on for further processing.
         result = handler(processor_type, *args)
         next_processor = current_processor.get('next_processor')
         if next_processor:
             if isinstance(next_processor, str):
                 self.queue.put([next_processor, result])
             if isinstance(next_processor, list):
                 # Several successors: each non-empty entry gets the same result.
                 for p in next_processor:
                     if p:
                         self.queue.put([p, result])
         break
     else:
         # Neither object exposes the configured function.
         logger.error(
             "i can't found the function, processor_type:{}, args:{}".
             format(processor_type, args))
         return
Пример #4
0
 def _route_one_point_many_urls(self, point, urls, passvalue, other_passvalue_list, rsp):
     try:
         for index, url in enumerate(urls):
             new_passvalue = dict(passvalue)
             if other_passvalue_list and other_passvalue_list[index]:
                 for key in other_passvalue_list[index]:
                     new_passvalue[key] = other_passvalue_list[index].get(key)
             self._route_one_point_one_url(point, url, new_passvalue, rsp)
     except Exception as e:
         logger.error("超出数组范围!", e)
Пример #5
0
    def write_log_with_zhengxin(self):
        """Record one login to the credit-reference query page in the log table.

        Reads the organisation code from the ``orgcode`` environment variable,
        checks ``RH_CUST_QUEUE`` for an existing entry, loads the requesting
        user's details from ``RH.RISK_MNIT_QUERY_RH_LIST`` and inserts a row
        into ``RH.RH_LOGIN_ZHENGXIN_WEB_LOG``.

        :return: None. Any exception is logged and swallowed.
        """
        try:
            orgcode = os.getenv('orgcode')

            # Has this customer been crawled before? A row in RH_CUST_QUEUE
            # yields '0', no row yields '1'.
            query = "select distinct ORG_CODE,MIDSIGNCODE,CUS_NO from {0}.RH_CUST_QUEUE where MIDSIGNCODE='{1}';".format(
                base_config.tabschema, orgcode)
            queue_result = ibm_db.exec_immediate(self.conn, query)
            row = ibm_db.fetch_assoc(queue_result)
            status = '0' if row else '1'

            # Load the details that go into the login record:
            # ENT_NAME enterprise name, USER_NAME / USER_CODE requesting user,
            # USER_ORG_NAME user's organisation, USER_BRANCH_NAME user's branch.
            sql = "select ENT_NAME,USER_NAME,USER_CODE,USER_ORG_NAME,USER_BRANCH_NAME,QUERY_REASON from RH.RISK_MNIT_QUERY_RH_LIST where MID_SIGN_CODE='{0}'".format(
                orgcode)
            result = ibm_db.exec_immediate(self.conn, sql)
            orgdetail = ibm_db.fetch_assoc(result)
            if not isinstance(orgdetail, bool):
                current_data = time.strftime("%Y-%m-%d %H:%M:%S")
                # NOTE(review): by position, the `credit_status` environment
                # value fills the STATUS column and the local `status` flag
                # fills CLIENT_TYPE - confirm this mapping is intended.
                # NOTE(review): the statement is built by string formatting
                # and stores an unsalted MD5 of the login password.
                insertsql = "insert into RH.RH_LOGIN_ZHENGXIN_WEB_LOG" \
                            " (USER_CODE,CUST_NAME,ORGCODE,USERID,PASSWORD,STATUS,UPLOADTIME,CLIENT_TYPE,ORG_NAME,SEARCH_TYPE,USER_NAME,USER_BRANCH_NAME,CUST_CODE) " \
                            "values ('{0}','{1}','{2}','{3}','{4}','{5}','{6}','{7}','{8}','{9}','{10}','{11}','{12}')".format(
                    orgdetail.get('USER_CODE', ''),
                    orgdetail.get('ENT_NAME', ''), base_config.login_post_data.get('orgCode'),
                    base_config.login_post_data.get('userid'),
                    hashlib.md5(base_config.login_post_data.get('password').encode('utf8')).hexdigest(),
                    os.getenv('credit_status'), current_data, status,
                    orgdetail.get('USER_ORG_NAME', ''), orgdetail.get('QUERY_REASON', ''), orgdetail.get('USER_NAME', ''),
                    orgdetail.get('USER_BRANCH_NAME', ''), orgcode)
                ibm_db.exec_immediate(self.conn, insertsql)
                ibm_db.commit(self.conn)
            else:
                print("企业信息为空")
        except Exception as e:
            # Format eagerly: an extra positional argument without a
            # placeholder breaks message formatting in the logger.
            logger.error("添加登陆记录时出错error: {}".format(e))
Пример #6
0
 def save_dict_into_mysql(self, processor_type, data):
     """Insert one record into MySQL.

     Args:
         processor_type: Name of the running processor; only used in logs.
         data: Mapping of column name to value plus a ``'table'`` entry
             naming the target table. Anything that is not a dict is ignored.

     Raises:
         KeyError: If ``data`` has no ``'table'`` entry.

     A ``pymysql.Error`` raised by the insert is logged, not propagated; the
     connection is committed either way.
     """
     if not isinstance(data, dict):
         return

     # Work on a copy: popping 'table' from the caller's dict would break any
     # other consumer that receives the same object afterwards.
     row = dict(data)
     table = row.pop('table')
     values = tuple(row.values())
     placeholders = ','.join(['%s'] * len(row))
     columns = ','.join(row)
     # Table and column names cannot be bound as parameters, so they are
     # interpolated; the values themselves are passed to the driver.
     sql = "INSERT INTO {}({}) VALUES({})".format(table, columns,
                                                  placeholders)
     logger.info('processor_type:[{}], sql:[{} {}]'.format(
         processor_type, sql, values))
     try:
         self.cursor.execute(sql, values)
     except pymysql.Error as e:
         logger.error('Oops, i got the fault: {}, {} {}'.format(
             e, sql, values))
     self.conn.commit()
Пример #7
0
    def double_parser(self, processor_type, current_processor, response, passvalue):
        """Parse a table whose rows have to be split before they are stored.

        Args:
            processor_type: Key of the running processor; used in the error log.
            current_processor: Configuration mapping of this processor.
            response: Downloaded page; a falsy value aborts parsing.
            passvalue: Values carried over from the previous level.

        Returns:
            The first element returned by ``_get_column_value_mapping``, or
            ``None`` when ``response`` is falsy.
        """
        if not response:
            logger.error('processor_type:{}, passvalue:{}'.format(processor_type, passvalue))
            return

        # Locate the data table; the selector is configurable with a default.
        selector = current_processor.get('table_selector') or 'table#list'
        records = self._get_data_table(response, selector, current_processor)

        # Split the rows, then map the values onto the columns to be stored.
        split_rows = self._double_parser(records, current_processor)
        list_infos, _unused_next_params = self._get_column_value_mapping(
            current_processor, split_rows, passvalue)
        return list_infos
Пример #8
0
 def _get_data_table(response, table_selector, current_processor):
     """获取含数据的table"""
     if not response:
         return
     try:
         soup = BeautifulSoup(response.content, 'html.parser', from_encoding=page_encoding)
     except AttributeError as e:
         soup = BeautifulSoup(response, 'html.parser', from_encoding=page_encoding)
     records = soup.select(table_selector)
     table_number = current_processor.get('table_number')
     if table_number:
         try:
             records = [(records[table_number])]
         except IndexError as e:
             logger.error('{}'.format(e.args))
             return
     return records
Пример #9
0
    def update_spider_done(self, orgcode):
        """Record in the database that crawling of ``orgcode`` has finished.

        Marks the entry in ``RH_SPIDER_QUEUE_LIST`` with status ``'2'``, then
        inserts one ``RH_CUST_QUEUE`` row per customer found for the
        organisation in ``DESK_SXGL0431_D``.

        Args:
            orgcode: Organisation code of the enterprise that was crawled.

        Returns:
            None. Any exception is logged and swallowed.
        """
        try:
            # 1. Flag the entry in the to-crawl list as finished.
            update_spider_queue_table_sql = "update {0}.RH_SPIDER_QUEUE_LIST set STATUS='2' where ORGCODE='{1}' and SEARCHDATE='{2}';".format(
                base_config.tabschema.upper(), orgcode,
                os.getenv('last_quarter_end'))
            ibm_db.exec_immediate(self.conn, update_spider_queue_table_sql)
            ibm_db.commit(self.conn)

            # 2. Look for an error record of today's run.
            # NOTE(review): the outcome of this lookup is not used - customers
            # are recorded below whether or not an error row exists.
            error_spider_record_sql = "select ORGCODE from {0}.RH_ERROR_SPIDER where SEARCHDATE='{1}' and ORGCODE='{2}' and date(UPLOADTIME)='{3}';".format(
                base_config.tabschema.upper(),
                os.getenv('last_quarter_end'), orgcode,
                time.strftime('%Y-%m-%d'))
            error_spider_record = ibm_db.exec_immediate(
                self.conn, error_spider_record_sql)
            ibm_db.fetch_assoc(error_spider_record)

            # Success or not, add every customer of the organisation to RH_CUST_QUEUE.
            sql = "select distinct CUSTNAME,CUSTID from  {0}.DESK_SXGL0431_D where LNCARDNO='{1}';".format(
                base_config.hongduntabschema.upper(), orgcode)
            result_query = ibm_db.exec_immediate(self.conn, sql)
            row = ibm_db.fetch_assoc(result_query)
            while row:
                insert_sql = "insert into {0}.RH_CUST_QUEUE(ENT_NAME,CUS_NO,ORG_CODE,MIDSIGNCODE,SEARCHDATE,FINISH_DATE) values ('{1}','{2}','{3}','{4}','{5}','{6}')".format(
                    base_config.tabschema, row.get('CUSTNAME'),
                    row.get('CUSTID'), orgcode, os.getenv('midsigncode'),
                    os.getenv('last_quarter_end'),
                    time.strftime("%Y-%m-%d %H:%M:%S"))

                ibm_db.exec_immediate(self.conn, insert_sql)
                ibm_db.commit(self.conn)
                row = ibm_db.fetch_assoc(result_query)
        except Exception as e:
            # Format eagerly: an extra positional argument without a
            # placeholder breaks message formatting in the logger.
            logger.error("爬取完成的企业进行入库记录error: {}".format(e))
Пример #10
0
    def detail_parser(self, processor_type, current_processor, response, passvalue):
        """Parse a detail page and pass the same response on to the detail point.

        Args:
            processor_type: Key of the running processor; used in the error log.
            current_processor: Configuration mapping of this processor.
            response: Downloaded page; a falsy value aborts parsing.
            passvalue: Values carried over from the previous level.

        Returns:
            The first element returned by ``_get_column_value_mapping``, or
            ``None`` when ``response`` is falsy.
        """
        if not response:
            logger.error('processor_type:{}, passvalue:{}'.format(processor_type, passvalue))
            return

        # Locate the detail cells; the selector is configurable with a default.
        selector = current_processor.get('table_selector') or 'table#list > tbody > tr > td'
        cells = self._get_data_table(response, selector, current_processor)

        # A detail page yields a single record, wrapped in a list for the mapper.
        single_record = [self._get_detail_values(cells)]
        list_infos, next_param_list = self._get_column_value_mapping(
            current_processor, single_record, passvalue)

        # Drill further down, forwarding this response to the next level.
        target_point = current_processor.get('detail_point')
        self._route_many_points_many_urls(target_point, [''], passvalue, next_param_list, response)

        return list_infos
Пример #11
0
 def re_add_post__parser(self, processor_type, current_processor, response, passvalue):
     """Extract the POST form fields from the page text and route one request per loan id.

     Args:
         processor_type: Key of the running processor; used in the error log.
         current_processor: Configuration mapping; reads ``re_type``,
             ``label_type``, ``loandb_type`` and ``detail_point``.
         response: Object exposing the page as ``response.text``; a falsy
             value aborts parsing.
         passvalue: Values carried over from the previous level.

     Returns:
         ``''`` after routing, or ``None`` when ``response`` is falsy.
     """
     if not response:
         logger.error('processor_type:{}, passvalue:{}'.format(processor_type, passvalue))
         return
     import re
     content = response.text

     # Every ('<digits>') occurrence is a loan id. Capture the complete
     # number: indexing the match with [2] kept only its first digit.
     loanid = re.findall(r"\('(\d+)'\)", content)

     financecode = re.search(r'financecode=\d+', content)
     financecode = financecode.group().split('=')[1] if financecode else ''

     # The loan card code is the text enclosed between two '+' signs.
     loancardcode = re.search(r'\+.*\d+.?\+', content)
     loancardcode = loancardcode.group().split('+')[1].strip() if loancardcode else ''

     if current_processor.get("re_type"):
         contractcode = re.search(r'contractcode=[0-9A-Z]+', content)
         contractcode = contractcode.group().split('=')[1] if contractcode else ''
     else:
         # Taken from a quoted `constr ... = '...';` assignment; in this
         # branch the value is GBK-encoded bytes, not str.
         contractcode = re.search(r'constr .*= .*;', content)
         contractcode = contractcode.group().split("'")[1].encode('gbk') if contractcode else ''

     # loandb: 1 = guarantee given, 2 = guarantee received.
     # dzy: 1 = guarantee, 2 = mortgage, 3 = pledge.
     dzy = current_processor.get('label_type')
     loandb = current_processor.get('loandb_type')

     # Drill into the detail point once per loan id.
     for i in loanid:
         next_param_list = [
             {'loanid': i, "financecode": financecode, "loancardcode": loancardcode, "contractcode": contractcode,
              'dzy': dzy, 'loandb': loandb}]
         self._route_many_points_many_urls(current_processor.get('detail_point'),
                                           [''], passvalue, next_param_list)
     return ''
Пример #12
0
 redis_conn = RedisOp(**base_config.redis_config)
 # Consume organisation codes from the Redis queues: the high-priority queue
 # is listed first, the low-priority queue second. The timeout argument is 0.
 while 1:
     try:
         task_orgcode = redis_conn.run_redis_fun('brpop', [
             'high_level_spider_orgcode_queue',
             'low_level_spider_orgcode_queue'
         ], 0)
     except Exception as e:
         print(e)
     else:
         if task_orgcode:
             # The code is read from index 1 of the reply and decoded.
             orgcode = task_orgcode[1].decode()
             # `.decode()` never yields None, so test for an empty code.
             if not orgcode:
                 print("orgCode:", orgcode)
                 logger.error("待查询中征码为空,跳过本次查询!{}".format(orgcode))
             else:
                 # Publish the code to the rest of the process via the environment.
                 os.environ['orgcode'] = orgcode
                 rule_orgcode_filter(orgcode)
 # for i in ['71657512-5','17768539-1']:
 # import time
 # import os
 # os.environ['orgcode'] = '71657512-5'
 # starttime = time.time()
 # start_spider('55805200-2')
 # print('runtime=',time.time()-starttime)
 # time.sleep(3)
 '''
 江西赣锋锂业股份有限公司 71657512-5     3605030000000514    2016-12-01
Пример #13
0
                # print("orgcode>>>>>>>>>>>>>>>",orgcode)
                if orgcode:
                    #判断这个企业是否正在爬取
                    spider_doing = redis_conn.run_redis_fun(
                        'sismember', 'rh_spider_done_orgcode', orgcode)
                    if spider_doing:
                        print("正在爬取。。。")
                        redis_conn.run_redis_fun(
                            'lpush', 'request_url_failure_' + time.strftime(
                                '%Y%m%d', time.localtime(time.time())),
                            json.dumps(task))
                        redis_conn.run_redis_fun(
                            'expire', 'request_url_failure_' + time.strftime(
                                '%Y%m%d', time.localtime(time.time())), 604800)
                    else:
                        processor_type, *args = task.get(
                            'processor_type'), task.get('url'), task.get(
                                'payload')
                        q = Queue()
                        spider = SpiderByQueue(q)
                        spider.get_processer(processor_type, *args)
                        spider.run()
    except Exception as e:
        logger.error("重跑出错error", e)
    '''
江西赣锋锂业股份有限公司 71657512-5     3605030000000514    2016-12-01
中交第二航务工程局有限公司  17768539-1   4201010000043669    2016-12-01
贵州盛鑫矿业集团投资有限公司  56500196-7  5202010000176539   2016-12-01
临海市春风灯饰有限公司 70473721-1      3309060000195576    2016-12-01
    '''