def set_config(self):
    """Attach a fresh ``GsCrawler`` and record which table this job reads.

    Sets ``self.searcher`` (with its ``dst_topic``), ``self.src_table`` and
    ``self.pk_name``.
    """
    crawler = GsCrawler()
    crawler.dst_topic = "GsOnline"
    self.searcher = crawler

    self.src_table, self.pk_name = (
        'enterprise_credit_info.dtjk_company_src',
        'mc',
    )
class DtjkUpdateJob(UpdateFromTable):
    """Crawl every pending company found in ``self.src_table``.

    ``update_status`` values written or read by :meth:`run`:

    * ``-1``  -- pending; these are the rows the job picks up
    * ``-2``  -- claimed, crawl in progress
    * ``0``   -- counted under u'查无结果' (crawl returned no result)
    * ``1``   -- counted under u'更新成功' (crawl succeeded)
    * ``3``   -- given up after more than ``_max_failures`` failed attempts
    * ``999`` -- province is not in ``online_province``

    Any other value returned by the crawler is counted under u'更新失败'.
    """

    # Provinces that are crawled at all; rows from anywhere else get
    # status 999.
    online_province = [
        u'上海市',
        u'北京市',
        u'广东省',
        u'江苏省',
        u'浙江省',
        u'山东省',
        u'河北省',
        u'福建省',
        u'天津市',
        u'湖北省',
        u'河南省',
        u'海南省',
        u'重庆市',
        u'贵州省',
        u'湖南省',
        u'陕西省',
        u'山西省',
        u'黑龙江省',
        u'吉林省',
        u'内蒙古自治区',
        u'广西壮族自治区',
        u'云南省',
        u'西藏自治区',
        u'工商总局',
        u'宁夏回族自治区',
        u'甘肃省',
        u'青海省',
        u'江西省',
        u'新疆维吾尔自治区',
        u'四川省',
        u'辽宁省',
        u'安徽省',
    ]

    # Provinces that are searched by ``xydm`` instead of by company name,
    # provided ``check(xydm)`` accepts the value.
    # NOTE(review): xydm is presumably the unified credit code -- confirm.
    _xydm_province = (
        u'河北省',
        u'宁夏回族自治区',
        u'河南省',
        u'海南省',
        u'重庆市',
        u'江西省',
        u'贵州省',
        u'湖南省',
        u'陕西省',
        u'山西省',
        u'黑龙江省',
        u'吉林省',
        u'内蒙古自治区',
        u'广西壮族自治区',
        u'云南省',
        u'西藏自治区',
        u'青海省',
        u'新疆维吾尔自治区',
        u'甘肃省',
        u'工商总局',
        u'浙江省',
        u'江苏省',
        u'广东省',
        u'上海市',
    )

    # A company whose recorded failure count exceeds this is marked with
    # status 3 instead of being put back in the pending state.
    _max_failures = 10

    def __init__(self):
        super(DtjkUpdateJob, self).__init__()

    def set_config(self):
        """Create the crawler and point the job at its source table."""
        self.searcher = GsCrawler()
        self.searcher.dst_topic = "GsCrawlerOnline"
        self.src_table = 'enterprise_credit_info.dtjk_company_src_renbao'
        self.pk_name = 'mc'

    def _crawl_company(self, mc, province, xydm):
        """Crawl one company and return the resulting ``update_status``.

        :param mc: company name, used as the keyword by default
        :param province: province the company is registered in
        :param xydm: alternative keyword used for ``_xydm_province``
        :return: 999 when the province is not online, otherwise whatever
            ``self.searcher.crawl`` returns
        """
        if province not in self.online_province:
            return 999
        if province in self._xydm_province and check(xydm):
            keyword = xydm
        else:
            keyword = mc
        return self.searcher.crawl(keyword=keyword, province=province)

    def run(self):
        """Process pending rows until none are left.

        Each iteration picks one random row out of the 30 pending rows with
        the oldest ``last_update_time``, marks it as claimed (-2), crawls it
        and writes the final status back.  Running totals are logged after
        every company.
        """
        # NOTE(review): values are interpolated straight into the SQL text,
        # so a company name containing a quote breaks the statement.  Switch
        # to parameterised queries if the MySQL helper supports them.
        fail_dict = dict()  # company name -> failed attempts so far
        update_result = {u'更新成功': 0, u'查无结果': 0, u'更新失败': 0, u'未上线': 0}
        result_label = {0: u'查无结果', 1: u'更新成功', 999: u'未上线'}

        # The statement depends only on the table name, so build it once.
        sql_1 = "select mc,province,xydm from " \
                "(" \
                "select * from %s where update_status=-1 order by last_update_time limit 30 " \
                ") t " \
                "order by RAND() limit 1" % self.src_table

        while True:
            res_1 = MySQL.execute_query(sql_1)
            if len(res_1) == 0:
                self.info(u'更新完毕')
                break

            mc = res_1[0][0]
            province = res_1[0][1]
            xydm = res_1[0][2]
            print('%s %s' % (mc, province))
            self.info(mc + '|' + province)

            # Claim the row so that it is not selected again while the
            # crawl is running.
            sql_2 = "update %s set update_status=-2,last_update_time=now() " \
                    "where mc='%s'" \
                    % (self.src_table, mc)
            MySQL.execute_update(sql_2)

            try:
                update_status = self._crawl_company(mc, province, xydm)
                sql_3 = "update %s set update_status=%d, last_update_time=now() " \
                        "where mc='%s'" % \
                        (self.src_table, update_status, mc)
                fail_dict.pop(mc, None)
            except Exception:
                # format_exc() takes an optional *limit*, not the exception;
                # it always reports the exception currently being handled.
                self.info(traceback.format_exc())
                if fail_dict.get(mc, 0) > self._max_failures:
                    update_status = 3
                    fail_dict.pop(mc, None)
                else:
                    # Back to pending so a later iteration retries it.
                    update_status = -1
                    fail_dict[mc] = fail_dict.get(mc, 0) + 1
                sql_3 = "update %s set update_status=%d " \
                        "where mc='%s'" % \
                        (self.src_table, update_status, mc)
                self.searcher.delete_tag_a_from_db(mc, province)
            MySQL.execute_update(sql_3)

            update_result[result_label.get(update_status, u'更新失败')] += 1
            self.info(json.dumps(update_result, ensure_ascii=False))
def set_config(self):
    """Build the crawler and select the ``dtjk_company_src_old`` table.

    Sets ``self.searcher`` (with its ``dst_topic``), ``self.src_table`` and
    ``self.pk_name``.
    """
    searcher = GsCrawler()
    # 'GSCrawlerTest' was the topic used here previously.
    searcher.dst_topic = 'GSCrawlerResultTest'
    self.searcher = searcher

    self.src_table = 'enterprise_credit_info.dtjk_company_src_old'
    self.pk_name = 'mc'
class DtjkUpdateJobOld(UpdateFromTable):
    """Crawl the pending companies of a single province.

    Rows with ``update_status=-1`` are pending; a row is set to ``-2`` while
    it is being crawled and then receives the status returned by the
    crawler, or ``-1`` again when the crawl raised.
    """

    # Province handled by this job instance; set in ``__init__``.
    province = None

    def __init__(self, province):
        """
        :param province: only rows whose ``province`` column equals this
            value are processed
        """
        super(DtjkUpdateJobOld, self).__init__()
        self.province = province

    def set_config(self):
        """Create the crawler and point the job at its source table."""
        self.searcher = GsCrawler()
        self.searcher.dst_topic = 'GSCrawlerResultTest'
        self.src_table = 'enterprise_credit_info.dtjk_company_src_old'
        self.pk_name = 'mc'

    def run(self):
        """Process pending rows of ``self.province`` until none are left.

        Each iteration picks one random row out of up to 30 pending rows,
        crawls it by company name and writes the status back.  Running
        totals are logged after every company.
        """
        # NOTE(review): values are interpolated straight into the SQL text,
        # so a name containing a quote breaks the statement.  Switch to
        # parameterised queries if the MySQL helper supports them.
        cnt_0 = 0  # crawler returned 0
        cnt_1 = 0  # crawler returned 1
        cnt_2 = 0  # anything else, including crawls that raised

        # Built once: it depends only on the table and the province.  The
        # province clause now ends with a space so that it no longer runs
        # straight into the following ``limit``.
        sql_1 = "select mc,province from " \
                "(" \
                "select * from %s where update_status=-1 " \
                "and province='%s' " \
                "limit 30 " \
                ") t " \
                "order by RAND() limit 1 " % (self.src_table, self.province)

        while True:
            res_1 = MySQL.execute_query(sql_1)
            if len(res_1) == 0:
                self.info(u'更新完毕')
                break

            mc = res_1[0][0]
            province = res_1[0][1]
            self.info(mc + '|' + province)

            # Claim the row so that it is not selected again while the
            # crawl is running.
            sql_2 = "update %s set update_status=-2 " \
                    "where mc='%s'" \
                    % (self.src_table, mc)
            MySQL.execute_update(sql_2)

            try:
                update_status = self.searcher.crawl(keyword=mc, province=province)
                sql_3 = "update %s set update_status=%d, last_update_time=now() " \
                        "where mc='%s'" % \
                        (self.src_table, update_status, mc)
            except Exception as e:
                # print_exc() takes an optional *limit*, not the exception;
                # it always reports the exception currently being handled.
                traceback.print_exc()
                update_status = -1  # back to pending, retried later
                self.info(str(e))
                sql_3 = "update %s set update_status=%d,last_update_time=now() " \
                        "where mc='%s'" % \
                        (self.src_table, update_status, mc)
                self.searcher.delete_tag_a_from_db(mc, province)
            MySQL.execute_update(sql_3)

            if update_status == 0:
                cnt_0 += 1
            elif update_status == 1:
                cnt_1 += 1
            else:
                cnt_2 += 1
            self.info(u'查询有结果: %d, 查询无结果: %d, 查询失败:%d' % (cnt_1, cnt_0, cnt_2))
def set_config(self):
    """Set up the crawler with two destination topics and pick the table.

    Sets ``self.searcher`` (with ``dst_topic`` and ``dst_topic2``),
    ``self.src_table`` and ``self.pk_name``.
    """
    crawler = GsCrawler()
    crawler.dst_topic = "GsCrawlerOnline"
    crawler.dst_topic2 = 'GSCrawlerResultTest'
    self.searcher = crawler

    self.src_table, self.pk_name = (
        'enterprise_credit_info.changed_mc_src',
        'mc',
    )
class UpdateNew(object):
    """Crawl companies announced on the 'NewRegisteredCompany' Kafka topic.

    Companies left behind in ``GsSrc.dbo.company_pool`` are retried first;
    after that, names are consumed from Kafka one message at a time.
    """

    crawler = GsCrawler()
    kafka = None
    failed_times = 0  # consecutive failed crawls
    # Created per instance in ``__init__``; a list defined here would be
    # shared by every instance of the class.
    failed_pool = None

    def __init__(self):
        self.failed_pool = []
        self.init_kafka()
        self.crawler.set_app_key(app_key)  # use key1
        self.fill_failed_pool()

    def fill_failed_pool(self):
        """Move rows older than 60 seconds from company_pool into memory.

        The first two columns of each row are taken as company name and
        province.  Every row read is appended to ``self.failed_pool`` and
        then removed from the table.
        """
        sql = "select * from GsSrc.dbo.company_pool where datediff(second,add_time,getdate())>=60"
        for row in MSSQL.execute_query(sql):
            company_name = row[0]
            province = row[1]
            self.failed_pool.append((company_name, province))
            delete_from_company_pool(company_name, province)

    def init_kafka(self):
        """Initialise the Kafka client (producer and 'Crawler' consumer)."""
        self.kafka = KafkaAPI('NewRegisteredCompany')
        self.kafka.init_producer()
        self.kafka.init_consumer('Crawler')

    def get_company_province(self):
        """Fetch the next company that still has to be crawled.

        Entries in ``self.failed_pool`` are served first; once it is empty
        a single message is fetched from Kafka.

        :return: ``(company, province)``, or ``(None, None)`` when Kafka
            returned no message
        :rtype: tuple
        """
        if self.failed_pool:
            return self.failed_pool.pop()

        message = self.kafka.fetch_one()
        if not message:
            return None, None

        json_obj = json.loads(message.value.decode('utf-8', 'ignore'))
        company = json_obj['companyName']
        province = json_obj['province']
        print('%s %s %s' % (message.partition.id, message.offset, company))
        return company, province

    def run(self):
        """Run the update task.

        Loops until no company is available or 50 crawls in a row have
        failed.  A company is recorded in the pool before it is crawled and
        removed afterwards; a failed company is sent back to Kafka.
        """
        company_name, province = self.get_company_province()
        while company_name:
            save_into_company_pool(company_name, province)
            print(u'更新 %s %s' % (company_name, province))
            try:
                self.crawler.crawl(keyword=company_name, province=province)
                self.failed_times = 0
            except Exception:
                # print_exc() takes an optional *limit*, not the exception;
                # it always reports the exception currently being handled.
                traceback.print_exc()
                print(u'更新出错,放回更新队列')
                # NOTE(review): these replacement strings look garbled
                # (possibly full-width parentheses originally) -- confirm
                # against the upstream source before changing them.
                keyword = company_name.replace('(', u'`(').replace(')', u')')
                delete_tag_a_from_db(keyword)
                self.kafka.send(
                    json.dumps({
                        'companyName': company_name,
                        'province': province
                    }))
                self.failed_times += 1
                if self.failed_times == 50:
                    print(u'连续失败50次,退出程序')
                    break
            delete_from_company_pool(company_name, province)
            company_name, province = self.get_company_province()
        print(u'更新完毕')