Exemplo n.º 1
0
def cpca_addr(res):
    '''
    Resolve raw addresses and update the rows located in the target counties.

    Every non-empty address is passed (with spaces removed) to ``transform``
    together with ``myumap`` and ``cut=False``. When the resolved province is
    河北省 and the resolved county is one of 雄县 / 容城县 / 安新县 (or the
    short forms 容城 / 安新), the matching ``tyc_qybj_jbxx`` row is updated and
    a line is written to ``./addr_2.txt``. That report file is recreated on
    every call.

    :param res: iterable of rows; ``row[0]`` is the raw address text and
        ``row[1]`` the company name used in the WHERE clause
    :return: None
    '''
    # The original condition contained a stray comma that built a tuple, which
    # is always truthy, so every 河北省 address matched. A membership test
    # expresses the intended "any of these counties" check.
    target_counties = ('雄县', '容城县', '安新县', '容城', '安新')

    # Mode 'w' truncates an existing report, replacing the remove-then-append
    # sequence.
    with open('./addr_2.txt', 'w', encoding='utf-8') as f:
        for row in res:
            address = row[0]
            print(address)
            if not address:
                # Skip empty/None addresses before calling any str method.
                continue

            df = transform([address.replace(' ', '')], myumap, cut=False)
            for idx in df.index:
                resolved = df.loc[idx].values
                # NOTE(review): assumes the frame's columns are ordered
                # county, city, province - confirm against the cpca version.
                county = resolved[0]
                city = resolved[1]
                province = resolved[2]
                if province != '河北省' or county not in target_counties:
                    continue

                # Original note: "test 70" (presumably the test database).
                # The local variant of this statement targeted company_11315
                # with columns address_1 / address_2 / address_3.
                # Single quotes in the company name are doubled so the SQL
                # string literal stays well-formed.
                update_sql = "update tyc_qybj_jbxx set province ='{sheng}',city='{shi}',county='{xian}' where company_name='{comp_name}'".format(
                    sheng=province,
                    shi=city,
                    xian=county,
                    comp_name=str(row[1]).replace("'", "''"))
                single_oracle.execute(update_sql)
                f.write('{}   省:{},市:{},县:{}\n'.format(
                    address, province, city, county))
Exemplo n.º 2
0
    def put_11315_increased(self):
        '''
        Queue the next range of 11315 company numbers (newly added companies).

        Runs only while holding the ``11315_increased_s`` lock:

        * if the ``11315_increased`` list already holds more than 50000
          entries, nothing is queued;
        * otherwise the highest ``company_number`` matching the configured
          condition is looked up, rows of ``company_11315`` above it are
          deleted, and the numbers that follow it are pushed onto the list;
        * if no such number exists, the config row is marked ``over = 1``.

        Errors raised while querying or queueing are logged and swallowed.
        The lock is released on every exit path, including when the list
        length lookup itself raises.

        :return: None
        '''
        lock_uid = '11315_increased_s'
        status, sec = self.get_lock(lock_uid)
        if not status:
            return

        try:
            incre_length = self.server.llen('11315_increased')
            if incre_length > 50000:
                logger.debug('parses incre_length={}'.format(incre_length))
                return

            try:
                incre_sql = 'SELECT max(company_number) FROM company_11315 where {}'
                # e.g. select max(company_number) from company_11315
                #      where company_name != 'NA'

                # NOTE(review): indexes used below are [0] = WHERE condition,
                # [1] = a state compared against 0, [2] = config row id -
                # confirm against get_configs_by_tablename.
                tablename_condition = self.get_configs_by_tablename(
                    'put_11315_increased')
                query = incre_sql.format(tablename_condition[0])
                print(query)
                results = single_oracle.oracle_find_by_param_all(query)
                print(results)

                last_number = results[0][0]
                if last_number:
                    if tablename_condition[1] == 0:
                        single_oracle.oracle_update(
                            'update config set over = 2 where id = {}'.format(
                                tablename_condition[2]))

                    max_num = single_oracle.oracle_find_by_param_all(
                        'select max(company_number) from company_11315')

                    # Drop rows beyond the resume point so they are fetched
                    # again.
                    if max_num[0][0] > last_number:
                        single_oracle.execute(
                            'delete from company_11315 where company_number > {}'
                            .format(last_number))

                    # XXX: still need a way to decide when crawling is done
                    for company_number in range(last_number + 1,
                                                last_number + 2000000):
                        self.server.lpush('11315_increased', company_number)
                else:
                    single_oracle.oracle_update(
                        'update config set over = 1, over_time = SYSDATE where id = {}'
                        .format(tablename_condition[2]))
            except Exception as e:
                logger.exception(e)
        finally:
            # Single release point: covers the early return, the normal path,
            # logged failures and exceptions that propagate.
            self.del_lock(lock_uid, sec)
Exemplo n.º 3
0
def business_term(res):
    '''
    Split each company's business-term text and store its begin/end values.

    The term text is expected to look like ``'<begin>至<end>'``. An end of
    ``'无固定期限'`` is stored as ``'2999-12-31'``; a term of ``'-'`` is stored
    as ``'-'`` for both columns. Rows whose term is empty or ``None`` are
    skipped.

    :param res: iterable of rows; ``row[0]`` is the term text and ``row[1]``
        the company name used in the WHERE clause
    :return: None
    '''
    for row in res:
        term = row[0]
        if not term:
            # Previously this either raised (first row) or silently reused the
            # begin/end values of the preceding company.
            continue

        if term == '-':
            begin = end = '-'
        else:
            pieces = term.split('至')
            # Start of the business term
            begin = pieces[0]
            # End of the business term.
            # NOTE(review): a term without '至' used to raise IndexError; it
            # now falls back to the '-' placeholder - confirm this is the
            # desired stored value.
            end = pieces[1] if len(pieces) > 1 else '-'
            if end == '无固定期限':
                end = '2999-12-31'

        # Single quotes are doubled so the SQL string literals stay
        # well-formed.
        sql = "update tyc_qybj_jbxx set business_term_begin ='{begin}', business_term_end = '{end}' where company_name='{name}'".format(
            begin=begin.replace("'", "''"),
            end=end.replace("'", "''"),
            name=str(row[1]).replace("'", "''"))
        print(sql)
        single_oracle.execute(sql)
def main(args):
    '''
    Endlessly pop company numbers from Redis and crawl their 11315.com pages.

    For each number popped from the ``11315_increased`` list, any existing
    ``company_11315`` row with that number is deleted, then
    ``http://<number>.11315.com/`` is fetched with ``web_spider`` (at most 10
    attempts) and the page is handed to ``web_parse``. When the list is empty
    it is refilled via ``single_reids.put_11315_increased()``. While
    ``11315_NA_count`` is at or above ``11315_increased_flag`` the loop sleeps
    for 30 minutes between checks.

    :param args: only echoed in the start-up message
    :return: does not return
    '''
    print(u'启动', args)
    flag = single_reids.server.get('11315_increased_flag')

    if not flag or int(flag.decode()) == 0:
        single_reids.put_flag()
        # NOTE(review): assumes put_flag() stores the flag; if it is still
        # missing, flag.decode() below raises AttributeError.
        flag = single_reids.server.get('11315_increased_flag')

    # Page fragments that indicate the request was throttled or blocked.
    blocked_markers = ("系统检测到您的请求存在异常", "你可能访问的太快了", "正在 请求 请 稍候")

    while True:
        NA_count = single_reids.server.get('11315_NA_count')

        if NA_count and int(NA_count.decode()) >= int(flag.decode()):
            time.sleep(60 * 30)
            continue

        i = single_reids.server.rpop('11315_increased')

        print('-' * 20, i)
        if not i:
            # Queue exhausted: ask for the next batch of numbers.
            single_reids.put_11315_increased()
            continue
        i = i.decode()

        res = single_oracle.oracle_find_by_param_all(
            'select count(*) from company_11315 where company_number = {}'.
            format(i))
        if res[0][0] != 0:
            # Remove the stale row so the freshly crawled data replaces it.
            print('该id{}已存在'.format(i))
            single_oracle.execute(
                'delete from company_11315 where company_number = {}'.format(
                    i))

        url = "http://" + str(i) + ".11315.com/"
        print(url)

        # A company page is requested at most 10 times.
        Retry = 1
        while Retry <= 10:
            logger.info(url + "------%d", Retry)

            try:
                con = web_spider(url)
                # First-pass check of the response status and page content.
                if con.status_code != 200:
                    logger.info("访问状态码异常%s", str(con.status_code))
                    time.sleep(3)
                else:
                    html = con.text
                    if any(marker in html for marker in blocked_markers):
                        logger.info("IP 被封或者访问过快")
                    else:
                        try:
                            web_parse(html, i, url)
                        except Exception:
                            # NOTE(review): every parse failure is reported as
                            # "number already exists"; the real cause is not
                            # logged here.
                            logger.info("此number号已存在: %s", i)
                        break
            except AttributeError as w:
                # Must precede the generic handler; it was unreachable when
                # listed after `except Exception`.
                logger.exception(w)
                logger.info("无法获取二次连接url")
            except Exception as e:
                logger.info("访问超时或者连接出错")
                logger.exception(e)

            # Every non-success path falls through to here and uses up one
            # attempt.
            Retry += 1