Exemplo n.º 1
0
 def set_config(self):
     """Create the crawler, set its ``dst_topic`` and pick the source table.

     Sets ``self.searcher``, ``self.src_table`` and ``self.pk_name``.
     """
     crawler = GsCrawler()
     self.searcher = crawler
     crawler.dst_topic = "GsOnline"
     # Source table and the column used as its key.
     self.src_table, self.pk_name = (
         'enterprise_credit_info.dtjk_company_src',
         'mc',
     )
Exemplo n.º 2
0
class DtjkUpdateJob(UpdateFromTable):
    online_province = [
        u'上海市',
        u'北京市',
        u'广东省',
        u'江苏省',
        u'浙江省',
        u'山东省',
        u'河北省',
        u'福建省',
        u'天津市',
        u'湖北省',
        u'河南省',
        u'海南省',
        u'重庆市',
        u'贵州省',
        u'湖南省',
        u'陕西省',
        u'山西省',
        u'黑龙江省',
        u'吉林省',
        u'内蒙古自治区',
        u'广西壮族自治区',
        u'云南省',
        u'西藏自治区',
        u'工商总局',
        u'宁夏回族自治区',
        u'甘肃省',
        u'青海省',
        u'江西省',
        u'新疆维吾尔自治区',
        u'四川省',
        u'辽宁省',
        u'安徽省',
    ]

    def __init__(self):
        super(DtjkUpdateJob, self).__init__()

    # def set_config(self):
    #     self.searcher = GsCrawler()
    #     # self.searcher.dst_topic = 'GSCrawlerTest'
    #     self.src_table = 'enterprise_credit_info.dtjk_company_src'
    #     self.pk_name = 'mc'

    def set_config(self):
        self.searcher = GsCrawler()
        self.searcher.dst_topic = "GsCrawlerOnline"
        self.src_table = 'enterprise_credit_info.dtjk_company_src_renbao'
        self.pk_name = 'mc'

    def run(self):
        # cnt_0 = 0
        # cnt_1 = 0
        # cnt_2 = 0
        # cnt_999 = 0
        fail_dict = dict()
        update_result = {u'更新成功': 0, u'查无结果': 0, u'更新失败': 0, u'未上线': 0}

        while True:
            # print json.dumps(fail_dict, ensure_ascii=False)
            sql_1 = "select mc,province,xydm from " \
                    "(" \
                    "select * from %s where update_status=-1 order by last_update_time limit 30 " \
                    ") t " \
                    "order by RAND() limit 1" % self.src_table
            # print sql_1
            res_1 = MySQL.execute_query(sql_1)
            if len(res_1) > 0:
                mc = res_1[0][0]
                province = res_1[0][1]
                xydm = res_1[0][2]
                print mc, province
                self.info(mc + '|' + province)
                sql_2 = "update %s set update_status=-2,last_update_time=now() " \
                        "where mc='%s'" \
                        % (self.src_table, mc)
                MySQL.execute_update(sql_2)
                try:
                    if province in self.online_province:
                        if province in (
                                u'河北省',
                                u'宁夏回族自治区',
                                u'河南省',
                                u'海南省',
                                u'重庆市',
                                u'江西省',
                                u'贵州省',
                                u'湖南省',
                                u'陕西省',
                                u'山西省',
                                u'黑龙江省',
                                u'吉林省',
                                u'内蒙古自治区',
                                u'广西壮族自治区',
                                u'云南省',
                                u'西藏自治区',
                                u'青海省',
                                u'新疆维吾尔自治区',
                                u'甘肃省',
                                u'工商总局',
                                u'浙江省',
                                u'江苏省',
                                u'广东省',
                                u'上海市',
                                # u''
                        ) and check(xydm):
                            keyword = xydm
                        else:
                            keyword = mc
                        update_status = self.searcher.crawl(keyword=keyword,
                                                            province=province)
                    else:
                        update_status = 999
                    sql_3 = "update %s set update_status=%d, last_update_time=now() " \
                            "where mc='%s'" % \
                            (self.src_table, update_status, mc)
                    if mc in fail_dict:
                        fail_dict.pop(mc)
                except Exception, e:
                    # traceback.print_exc(e)
                    self.info(traceback.format_exc(e))
                    if fail_dict.get(mc, 0) > 10:
                        update_status = 3
                        if mc in fail_dict:
                            fail_dict.pop(mc)
                    else:
                        update_status = -1
                        fail_dict[mc] = fail_dict.get(mc, 0) + 1
                    # self.info(str(e))
                    sql_3 = "update %s set update_status=%d " \
                            "where mc='%s'" % \
                            (self.src_table, update_status, mc)
                    self.searcher.delete_tag_a_from_db(mc, province)
                MySQL.execute_update(sql_3)
                # print 'update_status', update_status
                if update_status == 0:
                    update_result[u'查无结果'] += 1
                elif update_status == 1:
                    update_result[u'更新成功'] += 1
                elif update_status == 999:
                    update_result[u'未上线'] += 1
                else:
                    update_result[u'更新失败'] += 1
                self.info(json.dumps(update_result, ensure_ascii=False))
            else:
                self.info(u'更新完毕')
                break
Exemplo n.º 3
0
 def set_config(self):
     """Create the crawler, set its ``dst_topic`` and pick the source table.

     Sets ``self.searcher``, ``self.src_table`` and ``self.pk_name``.
     """
     crawler = GsCrawler()
     self.searcher = crawler
     # An earlier revision used the topic 'GSCrawlerTest' here.
     crawler.dst_topic = 'GSCrawlerResultTest'
     # Source table and the column used as its key.
     self.src_table, self.pk_name = (
         'enterprise_credit_info.dtjk_company_src_old',
         'mc',
     )
Exemplo n.º 4
0
class DtjkUpdateJobOld(UpdateFromTable):
    """Re-crawl the pending companies of a single province.

    ``run`` repeatedly picks one row of ``self.province`` whose
    ``update_status`` is -1, marks it as in progress (-2), crawls it by
    company name and writes the resulting status back to the table.
    """

    # Province whose rows this job processes; set per instance in __init__.
    province = None

    def __init__(self, province):
        """
        :param province: value matched against the ``province`` column when
            selecting rows to update.
        """
        super(DtjkUpdateJobOld, self).__init__()
        self.province = province

    def set_config(self):
        """Create the crawler and select the source table and key column."""
        self.searcher = GsCrawler()
        self.searcher.dst_topic = 'GSCrawlerResultTest'
        self.src_table = 'enterprise_credit_info.dtjk_company_src_old'
        self.pk_name = 'mc'

    def run(self):
        """Process pending rows until none are left for ``self.province``.

        After every row the running totals (result found / no result /
        failed) are logged via ``self.info``.  Exceptions raised while
        crawling are printed and logged; they do not stop the loop.
        """
        cnt_0 = 0  # crawl returned 0 (no result)
        cnt_1 = 0  # crawl returned 1 (result found)
        cnt_2 = 0  # anything else, including failures

        # NOTE(review): values are interpolated straight into the SQL text; a
        # province or company name containing a quote breaks the statement.
        # Switch to parameterised queries if the MySQL helper supports them.
        #
        # Pick one random row out of (up to) 30 pending rows of the province.
        # The statement does not change between iterations.  A space was
        # added after the province literal so it no longer runs into
        # ``limit``.
        sql_1 = "select mc,province from " \
                "(" \
                "select * from %s where update_status=-1 " \
                "and province='%s' " \
                "limit 30 " \
                ") t " \
                "order by RAND() limit 1 " % (self.src_table, self.province)

        while True:
            res_1 = MySQL.execute_query(sql_1)
            if len(res_1) == 0:
                self.info(u'更新完毕')
                break

            mc = res_1[0][0]
            province = res_1[0][1]
            self.info(mc + '|' + province)

            # Mark the row as in progress before crawling.
            sql_2 = "update %s set update_status=-2 " \
                    "where mc='%s'" \
                    % (self.src_table, mc)
            MySQL.execute_update(sql_2)

            try:
                update_status = self.searcher.crawl(keyword=mc,
                                                    province=province)
                # Formatting stays inside the ``try`` so that a non-integer
                # status from the crawler is handled like any other failure.
                sql_3 = "update %s set update_status=%d, last_update_time=now() " \
                        "where mc='%s'" % \
                        (self.src_table, update_status, mc)
            except Exception as e:
                # print_exc() takes an optional *limit*, not the exception;
                # it reads the exception currently being handled by itself.
                traceback.print_exc()
                # NOTE(review): the row goes back to pending (-1) with no
                # retry cap, so a row that always fails is retried forever.
                update_status = -1
                self.info(str(e))
                sql_3 = "update %s set update_status=%d,last_update_time=now() " \
                        "where mc='%s'" % \
                        (self.src_table, update_status, mc)
                self.searcher.delete_tag_a_from_db(mc, province)
            MySQL.execute_update(sql_3)

            if update_status == 0:
                cnt_0 += 1
            elif update_status == 1:
                cnt_1 += 1
            else:
                cnt_2 += 1
            self.info(u'查询有结果: %d, 查询无结果: %d, 查询失败:%d' %
                      (cnt_1, cnt_0, cnt_2))
 def set_config(self):
     """Create the crawler, set both of its topics and pick the source table.

     Sets ``self.searcher``, ``self.src_table`` and ``self.pk_name``.
     """
     crawler = GsCrawler()
     self.searcher = crawler
     crawler.dst_topic = "GsCrawlerOnline"
     crawler.dst_topic2 = 'GSCrawlerResultTest'
     # Source table and the column used as its key.
     self.src_table, self.pk_name = (
         'enterprise_credit_info.changed_mc_src',
         'mc',
     )
Exemplo n.º 6
0
class UpdateNew(object):

    crawler = GsCrawler()
    kafka = None
    failed_times = 0
    failed_pool = []

    def __init__(self):
        self.init_kafka()
        self.crawler.set_app_key(app_key)  # 使用key1
        # self.crawler.crawler_class_dict[u'浙江省'] = ZheJiangSearcherQW
        # self.crawler.crawler_class_dict[u'北京市'] = BeiJingQW
        self.fill_failed_pool()

    def fill_failed_pool(self):
        sql = "select * from GsSrc.dbo.company_pool where datediff(second,add_time,getdate())>=60"
        res = MSSQL.execute_query(sql)
        for r in res:
            company_name = r[0]
            province = r[1]
            self.failed_pool.append((company_name, province))
            delete_from_company_pool(company_name, province)

    def init_kafka(self):
        """
        初始化kafka客户端
        :return:
        """
        self.kafka = KafkaAPI('NewRegisteredCompany')
        self.kafka.init_producer()
        self.kafka.init_consumer('Crawler')

    def get_company_province(self):
        """
        从队列中获取未消费的公司名
        :return: 队列中未消费的公司名
        :rtype: unicode
        """
        if len(self.failed_pool) > 0:
            ele = self.failed_pool.pop()
            return ele
        else:
            message = self.kafka.fetch_one()
            if message:
                json_text = message.value.decode('utf-8', 'ignore')
                partition = message.partition.id
                offset = message.offset
                json_obj = json.loads(json_text)
                company = json_obj['companyName']
                province = json_obj['province']
                print partition, offset, company
                return company, province
            else:
                return None, None

    def run(self):
        """
        执行更新任务
        :return:
        """
        company_name, province = self.get_company_province()
        while company_name:
            save_into_company_pool(company_name, province)
            print u'更新 %s %s' % (company_name, province)
            try:
                self.crawler.crawl(keyword=company_name, province=province)
                self.failed_times = 0
            except Exception, e:
                traceback.print_exc(e)
                print u'更新出错,放回更新队列'
                keyword = company_name.replace('(', u'`(').replace(')', u')')
                delete_tag_a_from_db(keyword)
                self.kafka.send(
                    json.dumps({
                        'companyName': company_name,
                        'province': province
                    }))

                self.failed_times += 1
                if self.failed_times == 50:
                    print u'连续失败50次,退出程序'
                    break
            delete_from_company_pool(company_name, province)
            company_name, province = self.get_company_province()
        print u'更新完毕'