def cracker_search_list_content(self, keyword, item=None):
    url = 'http://{host}/index.jspx'.format(host=self.host)
    json_data, content = self.get_captcha_geetest_full(
        url, '#searchText', '#click', keyword, '#searchtips')
    if content is None:
        return None

    # Persist the cookies returned by the captcha cracker
    cookie_list = json_data.get('cookies', None)
    if cookie_list is None:
        self.log.error('no cookie information, failed to save cookie..')
        return content

    # Serialise the cookies into a single 'name=value; name=value' header string
    cookie = '; '.join(it['name'] + '=' + it['value'] for it in cookie_list)

    if item is None:
        self.source_db.save(
            self.COOKIE_TABLE, {
                '_id': self.province,
                'Cookie': cookie,
                'in_time': util.get_now_time()
            })
    else:
        item['Cookie'] = cookie
        item['in_time'] = util.get_now_time()
        self.source_db.save(self.COOKIE_TABLE, item)
    return content

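# Hypothetical reuse of the cookie string saved above. The lookup mirrors how
# cracker_search_list_content stores it, keyed by province in COOKIE_TABLE;
# `requests`, `host` and `province` here are illustrative, not part of this
# module.
import requests

saved = source_db.find_one(COOKIE_TABLE, {'_id': province})  # assumed lookup
resp = requests.get('http://{host}/index.jspx'.format(host=host),
                    headers={'Cookie': saved['Cookie']})
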
def report_crawl_status(self, query_name, crawl_flag, start_schedule_time='',
                        detail_name='', cur_time=None):
    if is_debug:
        return

    # Current timestamp for the status record
    time_key = self.crawl_flag + '_time'
    if cur_time is None:
        cur_time = util.get_now_time()

    crawl_status = {
        'company': detail_name,
        self.crawl_flag: crawl_flag,
        time_key: cur_time,
        'query': query_name,
        'start_schedule_time': start_schedule_time,
    }
    parse_status = {
        'company': detail_name,
        'province': self.province,
    }

    # Report the crawl status to the status queue
    self.report_mq_thread.push_msg(json.dumps(crawl_status))

    # On success, also notify the parser module
    if crawl_flag == self.CRAWL_FINISH:
        self.parse_mq_thread.push_msg(json.dumps(parse_status))

def main():
    count = 0
    log.info('start reading data...')
    source_table = 'offline_all_list'
    with open("guangdong.txt") as p_file:
        result_list = list()
        province = "guangdong"
        for line in p_file:
            company_name = line.strip()
            count += 1
            data = {
                '_id': util.generator_id({}, company_name, province),
                'company_name': company_name,
                'province': province,
                'in_time': util.get_now_time(),
                'crawl_online': 0,
            }
            result_list.append(data)
            # Flush to the database in batches of 1000
            if len(result_list) >= 1000:
                source_db.insert_batch_data(source_table, result_list)
                del result_list[:]
        source_db.insert_batch_data(source_table, result_list)
    log.info("total records sent: {}".format(count))
    log.info('data sent, exiting')

def set_crawl_flag(self, item, flag, cur_time=None):
    '''
    {
        company: 'company name',
        crawl_online:
            -2  company name too short,
            -1  company name does not match the expected format,
             0  crawl failed,
             1  crawl finished,
             2  keyword already searched with no results, skip it,
             3  the keyword returned list results but no exact match was found,
        crawl_online_time: '2017-02-22 19:03:43',
        query: 'company name',
    }
    '''
    # Current timestamp
    time_key = self.crawl_flag + '_time'
    if cur_time is None:
        cur_time = util.get_now_time()

    # Update the flag fields
    item[time_key] = cur_time
    item[self.crawl_flag] = flag

    # Persist the item; skip the write in debug mode
    if not is_debug:
        self.source_db.save(self.source_table, item)

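# The flag values documented above correspond to class-level constants used
# elsewhere in this section (check_detail_crawl_flag references
# CRAWL_INVALID_NAME, CRAWL_SHORT_NAME, CRAWL_UN_FINISH, CRAWL_FINISH and
# CRAWL_NOTHING_FIND). A sketch of one plausible declaration; the exact
# value-to-name mapping for -2, -1 and 3 is an assumption from the docstring:
class CrawlFlagsSketch(object):
    CRAWL_SHORT_NAME = -2     # company name too short (assumed mapping)
    CRAWL_INVALID_NAME = -1   # company name malformed (assumed mapping)
    CRAWL_UN_FINISH = 0       # crawl failed, may retry
    CRAWL_FINISH = 1          # crawl finished
    CRAWL_NOTHING_FIND = 2    # keyword searched, nothing found
    CRAWL_NO_EXACT_MATCH = 3  # list results, no exact match (assumed name)
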
def query_company(self, item):
    # First drop any invalid captcha/intercept pages from the crawled data
    self.filter_captcha_page(item)

    # Company name
    company = item.get('_id')
    u_time = _in_time = util.get_now_time()
    province = self.province_py_to_zh[self.province]
    in_time = item.get('in_time')
    if in_time is None:
        in_time = _in_time
    # Convert the crawl time to a timestamp
    in_time = util.get_change_stamp(in_time)

    data_list = item.get('datalist')
    if data_list is None:
        self.log.info('no datalist: province = {province} company = {company}'.format(
            company=company, province=self.province))
        return self.CRAWL_NOTHING_FIND

    base_info = data_list.get(Model.base_info)
    if base_info is None:
        self.log.error('no base info: province = {province} company = {company}'.format(
            company=company, province=self.province))
        return self.CRAWL_UN_FINISH
    base_info_url = self.__get_base_info_url(base_info)

    # Parse the business registration (gongshang) information
    gs_flag = self.parse_gs_info(company, data_list, u_time, _in_time,
                                 province, in_time, base_info_url, item)
    # Parse the annual report (nianbao) information
    nb_flag = self.parse_nb_info(company, data_list, u_time, _in_time,
                                 province, in_time, base_info_url)

    if gs_flag and nb_flag:
        return self.success_flag
    if not gs_flag and not nb_flag:
        self.log.error('failed to parse both registration info and annual report: '
                       'province = {province} company = {company}'.format(
                           company=company, province=self.province))
        return self.CRAWL_UN_FINISH
    if not gs_flag:
        self.log.error('failed to parse registration info: province = {province} '
                       'company = {company}'.format(
                           company=company, province=self.province))
        return self.CRAWL_UN_FINISH
    if not nb_flag:
        self.log.error('failed to parse annual report: province = {province} '
                       'company = {company}'.format(
                           company=company, province=self.province))
        return self.CRAWL_UN_FINISH
    return self.success_flag

def main():
    log.info('start reading data...')
    source_table = 'offline_all_list'
    app_data_table = 'enterprise_data_gov'
    with open("company_expection_list") as p_file:
        result_list = list()
        for line in p_file:
            company_name = line.strip()
            province = 'gsxt'
            item = app_data_db.find_one(app_data_table, {'company': company_name})
            if item is not None and 'province' in item \
                    and item['province'] in province_zh_to_py:
                province = province_zh_to_py[item['province']]
            else:
                log.error("province lookup failed: {}".format(company_name))
            data = {
                '_id': util.generator_id({}, company_name, province),
                'company_name': company_name,
                'province': province,
                'in_time': util.get_now_time(),
            }
            result_list.append(data)
            if len(result_list) >= 1000:
                source_db.insert_batch_data(source_table, result_list)
                del result_list[:]
        source_db.insert_batch_data(source_table, result_list)
    log.info('data sent, exiting')

def main():
    start_time = time.time()
    # find_task(target_db_new, 'new')
    find_task(target_db_old, 'old')
    end_time = time.time()
    send_email(
        mail_from_addr, mail_password, mail_to_addrs,
        'Gansu equity pledge attribute lookup finished - %s' % get_now_time(),
        MIMEText('elapsed: {} s'.format(end_time - start_time),
                 'plain', 'utf-8'))

def get_model(self, _id, seed, search_name, province, data_list=None):
    if data_list is None:
        data_list = {}
    data = dict(_id=_id,
                seed=seed,
                search_name=search_name,
                province=province,
                datalist=data_list,
                in_time=util.get_now_time())
    data[self.crawl_flag] = self.CRAWL_UN_FINISH
    data[self.ERROR_TIMES] = 0
    return data

def main():
    count = 0
    log.info('start cleaning data')
    source_table = 'cs2_online_all_search'
    # Allow the table name to be passed on the command line
    if len(sys.argv) > 1:
        source_table = sys.argv[1]
    source_table_cursor = source_db.db[source_table].find(
        {'priority': {'$ne': 0}},
        no_cursor_timeout=True).batch_size(10000)
    for item in source_table_cursor:
        try:
            count += 1
            search_name = item.get('search_name', None)
            company_name = item.get('company_name', None)
            if search_name is None or company_name is None:
                log.error('bad record: item = {item}'.format(item=item))
                continue
            if 'priority' not in item:
                log.error('missing priority field: search_name = {name}'.format(
                    name=search_name))
                continue
            # Normalise full-width/half-width parentheses before comparing
            replace_name_1 = company_name.replace('(', '(').replace(')', ')')
            replace_name_2 = company_name.replace('(', '(').replace(')', ')')
            if search_name == replace_name_1 \
                    or search_name == replace_name_2 \
                    or search_name == company_name:
                item['priority'] = 0
                item['in_time'] = util.get_now_time()
                source_db.insert_batch_data(source_table, [item])
                # source_db.save(source_table, item)
                log.info(
                    'updated record: search_name = {search_name} '
                    'company_name = {company_name}'.format(
                        search_name=search_name, company_name=company_name))
            log.info('progress: count = {count}'.format(count=count))
        except Exception as e:
            log.exception(e)
    source_table_cursor.close()
    log.info('cleaning finished, exiting')

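# Illustration of the parenthesis normalisation above: Chinese registry data
# mixes ASCII and full-width parentheses, so both replace directions are
# needed before names can be compared. The company name below is made up.
half = u'某某科技(深圳)有限公司'   # ASCII parentheses
full = u'某某科技(深圳)有限公司'   # full-width parentheses
assert half.replace(u'(', u'(').replace(u')', u')') == full  # half -> full
assert full.replace(u'(', u'(').replace(u')', u')') == half  # full -> half
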
def check_detail_crawl_flag(self, item):
    # In debug mode always crawl; skip the state checks
    if is_debug:
        return True

    # No crawl record yet, so it needs crawling
    if self.crawl_flag not in item:
        return True

    # No time field either means it has never been crawled
    time_key = self.crawl_flag + '_time'
    pre_time = item.get(time_key, None)
    if pre_time is None:
        return True

    # Crawl flag
    flag = item.get(self.crawl_flag)

    # Invalid keywords never need searching again
    if flag == self.CRAWL_INVALID_NAME:
        return False
    # Malformed keywords never need searching again
    if flag == self.CRAWL_SHORT_NAME:
        return False

    # Recrawl once the record is older than the retention window
    cur_time = util.get_now_time()
    if util.sub_time(cur_time, pre_time) >= self.retention_time:
        self.log.info(
            'cur_time = {cur} pre_time = {pre} _id = {_id}'.format(
                cur=cur_time, pre=pre_time, _id=item.get('_id', '')))
        return True

    # Already finished within the retention window, skip
    if flag == self.CRAWL_FINISH:
        return False
    # Searched with no results within the retention window, skip
    if flag == self.CRAWL_NOTHING_FIND:
        return False
    # Failed crawls stop retrying once the error count hits the cap
    if flag == self.CRAWL_UN_FINISH:
        if self.ERROR_TIMES in item:
            if item[self.ERROR_TIMES] >= self.MAX_ERROR_TIMES:
                return False
    return True

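# A minimal sketch of how this check might gate a crawl loop; the driver,
# the `tasks` iterable and `crawl_company` are hypothetical, only
# check_detail_crawl_flag comes from this module.
for task_item in tasks:
    if not worker.check_detail_crawl_flag(task_item):
        # finished, empty result, invalid keyword, or retried out
        continue
    crawl_company(task_item)
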
def main():
    try:
        count = 0
        all_count = 0
        log.info('start reading data...')
        source_table = 'offline_all_list'
        cursor = db_query['offline_crawl_data_registration_company'].find({})
        result_list = list()
        for element in cursor:
            try:
                all_count += 1
                company_name = element['company']
                # Names containing spaces: only keep them if an existing
                # record is removed first
                if ' ' in company_name:
                    query_result = db_query_company_data[
                        'offline_all_list'].find_one(
                            {'company_name': company_name})
                    if query_result:
                        db_query_company_data['offline_all_list'].delete_one(
                            {'company_name': company_name})
                    else:
                        continue
                province = province_zh_to_py.get(str(element['province']), '')
                if province == '':
                    log.info("province error: {}".format(element['_id']))
                    continue
                count += 1
                company_name = str(company_name).strip()
                data = {
                    '_id': util.generator_id({}, company_name, province),
                    'company_name': company_name,
                    'province': province,
                    'in_time': util.get_now_time(),
                    'crawl_online': 0,
                }
                mod_value = {'deal_flag': 1}
                db_query['offline_crawl_data_registration_company'].update(
                    {'_id': element['_id']}, {"$set": mod_value})
                log.info("records processed: {}".format(all_count))
                result_list.append(data)
                if len(result_list) >= 100:
                    source_db.insert_batch_data(source_table, result_list)
                    del result_list[:]
            except Exception as e:
                log.exception(e)
        source_db.insert_batch_data(source_table, result_list)
        log.info("total records sent: {}".format(count))
        log.info('data sent, exiting')
        time.sleep(100)
    except Exception as e:
        log.exception(e)

def main():
    try:
        count = 0
        log.info('start reading data...')
        source_table = 'offline_all_list'
        cursor = db_query['guangdong_baseinfo_0912'].find({})
        result_list = list()
        for element in cursor:
            try:
                company_name = element['_id']
                # Skip names that do not contain '公司' (company)
                if '公司' not in company_name:
                    continue
                # Skip records already present in either destination table
                query = db_query_app_data['enterprise_data_gov'].find_one(
                    {'company': company_name})
                if query:
                    continue
                query = source_db.find_one('offline_all_list',
                                           {'company_name': company_name})
                if query:
                    continue
                province = 'guangdong'
                count += 1
                data = {
                    '_id': util.generator_id({}, company_name, province),
                    'company_name': company_name,
                    'province': province,
                    'in_time': util.get_now_time(),
                    'crawl_online': 0,
                }
                mod_value = {'deal_flag': 1}
                db_query['guangdong_baseinfo_0912'].update(
                    {'_id': element['_id']}, {"$set": mod_value})
                result_list.append(data)
                log.info("records sent so far: {}".format(count))
                if len(result_list) >= 100:
                    source_db.insert_batch_data(source_table, result_list)
                    del result_list[:]
            except Exception as e:
                log.exception(e)
        source_db.insert_batch_data(source_table, result_list)
        log.info("total records sent: {}".format(count))
        log.info('data sent, exiting')
        time.sleep(100)
    except Exception as e:
        log.exception(e)

def main():
    try:
        count = 0
        log.info('start reading data...')
        source_table = 'offline_all_list'
        cursor = db_query_app_data['enterprise_data_gov'].find({})
        result_list = list()
        for element in cursor:
            try:
                company_name = element['company']
                query = source_db.find_one('offline_all_list',
                                           {'company_name': company_name})
                if query:
                    continue
                # Skip names without '公司' (company) when the record carries
                # no enterprise_type field
                if '公司' not in company_name and 'enterprise_type' not in element:
                    continue
                province = province_zh_to_py.get(str(element['province']), '')
                if province == '':
                    log.info("province error: {}".format(element['_id']))
                    continue
                count += 1
                data = {
                    '_id': util.generator_id({}, company_name, province),
                    'company_name': company_name,
                    'province': province,
                    'in_time': util.get_now_time(),
                    'crawl_online': 0,
                }
                result_list.append(data)
                log.info("records sent so far: {}".format(count))
                if len(result_list) >= 100:
                    source_db.insert_batch_data(source_table, result_list)
                    del result_list[:]
            except Exception as e:
                log.exception(e)
        source_db.insert_batch_data(source_table, result_list)
        log.info("total records sent: {}".format(count))
        log.info('data sent, exiting')
        time.sleep(100)
    except Exception as e:
        log.exception(e)

def set_crawl_flag(self, item, crawl_flag, flag, choose_db_type):
    if is_debug:
        return
    _id = item.get('_id')
    # Current timestamp
    cur_time = util.get_now_time()
    time_key = crawl_flag + '_time'

    # On failure, bump the error counter
    if flag == self.success_flag:
        error_times = 0
    else:
        if self.ERROR_TIMES not in item:
            error_times = 1
        else:
            error_times = item[self.ERROR_TIMES] + 1
        # Mark items that have never succeeded once the retry cap is hit
        if error_times >= self.MAX_ERROR_TIMES:
            flag = self.never_success_flag

    # Write the update to the selected database
    update_doc = {"$set": {
        time_key: cur_time,
        crawl_flag: flag,
        self.ERROR_TIMES: error_times,
    }}
    if choose_db_type == CHOOSE_DB_OLD:
        # self.webpage_db_old.save(self.source_table, item)
        self.webpage_db_old.update(self.source_table, {'_id': _id}, update_doc)
    elif choose_db_type == CHOOSE_DB_NEW:
        # self.webpage_db_new.save(self.source_table, item)
        self.webpage_db_new.update(self.source_table, {'_id': _id}, update_doc)

def set_detail_crawl_flag(self, item, flag):
    # Current timestamp
    cur_time = util.get_now_time()
    time_key = self.crawl_flag + '_time'

    # Update the flag fields
    item[time_key] = cur_time
    item[self.crawl_flag] = flag

    # On failure, bump the error counter; reset it otherwise
    if flag == self.CRAWL_UN_FINISH:
        if self.ERROR_TIMES in item:
            item[self.ERROR_TIMES] += 1
        else:
            item[self.ERROR_TIMES] = 1
    else:
        item[self.ERROR_TIMES] = 0

    # Persist the item; skip the write in debug mode
    if not is_debug:
        self.source_db.save(self.source_table, item)

def main():
    while True:
        try:
            count = 0
            log.info('start reading data...')
            source_table = 'offline_all_list'
            cursor = db_query['offline_crawl_data_registration_company'].find(
                {'deal_flag': 0})
            result_list = list()
            for element in cursor:
                company_name = element['company']
                province = province_zh_to_py.get(str(element['province']), '')
                if province == '':
                    log.info("province error: {}".format(element['_id']))
                    continue
                count += 1
                data = {
                    '_id': util.generator_id({}, company_name, province),
                    'company_name': company_name,
                    'province': province,
                    'in_time': util.get_now_time(),
                    'crawl_online': 0,
                }
                mod_value = {'deal_flag': 1}
                db_query['offline_crawl_data_registration_company'].update(
                    {'_id': element['_id']}, {"$set": mod_value})
                result_list.append(data)
                if len(result_list) >= 100:
                    source_db.insert_batch_data(source_table, result_list)
                    del result_list[:]
            source_db.insert_batch_data(source_table, result_list)
            log.info("total records sent: {}".format(count))
            log.info('batch sent, sleeping')
            time.sleep(100)
        except Exception as e:
            log.exception(e)
            time.sleep(10)

def main():
    result_list = []
    source_table = 'enterprise_data_gov'
    count = 0
    deal_total = 0
    for item in source_db.traverse_batch(source_table):
        item['_utime'] = util.get_now_time()
        _record_id = item.get('_record_id', '')
        company = item.get('company', '')
        count += 1
        src_list = item.get('_src')
        if not isinstance(src_list, list):
            log.info("no _src {record} {company}".format(record=_record_id,
                                                         company=company))
            continue
        if len(src_list) <= 10:
            continue
        # Keep only the first 10 _src entries
        item['_src'] = src_list[0:10]
        deal_total += 1
        result_list.append(item)
        if len(result_list) >= 1000:
            source_db.insert_batch_data(source_table, result_list)
            del result_list[:]
            log.info("scanned: count = {}".format(count))
            log.info("processed: deal = {}".format(deal_total))
    source_db.insert_batch_data(source_table, result_list)
    log.info("scanned: count = {}".format(count))
    log.info("processed: deal = {}".format(deal_total))

def report_crawl_fail(self, item):
    _id = item.get('_id')
    search_name = item.get('search_name')
    if search_name is None:
        search_name = _id
        self.log.info('search_name is None: _id = {_id}'.format(_id=_id))

    # If permitted, report the failure back to the search list
    if (self.report_status & self.REPORT_SEARCH) > 0:
        result_item = self.company_data_db.find_one(
            self.online_all_search,
            {'search_name': search_name, 'province': self.province})
        if result_item is not None:
            result_item[self.crawl_flag] = 0
            self.company_data_db.save(self.online_all_search, result_item)
            self.log.info('save online_all_search success {com}'.format(com=search_name))
            return

    # If permitted, report the failure back to the seed list
    if (self.report_status & self.REPORT_SEED) > 0:
        result_item = self.company_data_db.find_one(
            self.offline_all_list,
            {'company_name': _id, 'province': self.province})
        if result_item is not None:
            result_item[self.crawl_flag] = 0
            self.company_data_db.save(self.offline_all_list, result_item)
            self.log.info('save offline_all_list success {com}'.format(com=_id))
            return

    # Otherwise insert a fresh seed record
    data = {
        '_id': util.generator_id({}, _id, self.province),
        'company_name': _id,
        'province': self.province,
        'in_time': util.get_now_time(),
        self.crawl_flag: 0,
    }
    self.company_data_db.insert_batch_data(self.offline_all_list, [data])
    self.log.info('insert new company = {company}'.format(company=_id))

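# report_crawl_fail treats report_status as a bitmask. A sketch of constants
# consistent with those checks; the concrete values are assumptions:
REPORT_SEARCH = 0x01  # allowed to reset flags in online_all_search
REPORT_SEED = 0x02    # allowed to reset flags in offline_all_list
report_status = REPORT_SEARCH | REPORT_SEED  # enable both channels
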
def task_run(self):
    result_list = []
    # Create the worker pool: gevent coroutines in production, a thread pool in debug
    if not is_debug:
        self.pool = gevent.pool.Pool(self.thread_num)
    else:
        self.pool = ThreadPool(processes=self.thread_num)
    self.log.info('coroutine pool size: thread_num = {num}'.format(num=self.thread_num))
    self.log.info('province: {province} service started, waiting for messages'.format(
        province=self.province))
    count = 0
    start_run_time = time.time()
    while True:
        if not is_running:
            break
        job = self.beanstalk.reserve(self.tube, 3)
        if job is not None:
            count += 1
            body = job.body
            job.delete()
            self.log.info('message index: {count}'.format(count=count))
            json_data = util.json_loads(body)
            if json_data is None:
                self.log.error('malformed message: msg = {msg}'.format(msg=body))
                time.sleep(5)
                continue
            province = json_data.get('province')
            if province is None or province == '':
                self.log.error('missing province: {msg}'.format(msg=body))
                continue
            company_name = json_data.get('company_name')
            unified_social_credit_code = json_data.get('unified_social_credit_code')
            start_schedule_time = json_data.get('start_schedule_time', '')
            if company_name is None and unified_social_credit_code is None:
                self.log.error('missing both company and unified_social_credit_code: '
                               '{msg}'.format(msg=body))
                continue
            if company_name is not None and company_name == '':
                self.log.error('company is an empty string, data = {data}'.format(
                    data=body))
                continue
            if unified_social_credit_code is not None and unified_social_credit_code == '':
                self.log.error('unified_social_credit_code is an empty string, '
                               'data = {data}'.format(data=body))
                continue
            if province != self.province:
                self.log.warn('province mismatch: province = {province} data = {body}'.format(
                    province=self.province, body=body))
                continue
            if company_name is not None:
                self.log.info('consuming: province = {province} company = {company}'.format(
                    province=province, company=company_name))
            elif unified_social_credit_code is not None:
                self.log.info('consuming: province = {province} '
                              'unified_social_credit_code = {code}'.format(
                                  province=province, code=unified_social_credit_code))
            # Prefer the company name over the credit code
            if company_name is not None:
                data = {
                    '_id': util.generator_id({}, company_name, province),
                    'company_name': company_name,
                    'province': province,
                    'in_time': util.get_now_time(),
                    'start_schedule_time': start_schedule_time,
                }
            else:
                data = {
                    '_id': util.generator_id({}, unified_social_credit_code, province),
                    'unified_social_credit_code': unified_social_credit_code.strip().upper(),
                    'province': province,
                    'in_time': util.get_now_time(),
                    'start_schedule_time': start_schedule_time,
                }
            pool_result = self.pool.apply_async(
                self.worker_list[self.province].query_online_task, args=(data,))
            result_list.append(pool_result)
            if len(result_list) >= 1000:
                for result in result_list:
                    result.get()
                del result_list[:]
        # Restart the service once the maximum run time is reached
        run_time = time.time()
        if int(run_time) - int(start_run_time) >= self.MAX_RUN_TIME:
            break
    if is_debug:
        self.pool.close()
        self.pool.join()
    for result in result_list:
        result.get()
    del result_list
    self.log.info('received exit signal, shutting down safely...')

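# Shape of a beanstalk message this consumer accepts (values illustrative).
# Either company_name or unified_social_credit_code must be present and
# non-empty, and province must match the worker's own province.
example_msg = {
    'province': 'guangdong',
    'company_name': u'某某科技有限公司',
    # 'unified_social_credit_code': '91440300XXXXXXXXXX',  # alternative key
    'start_schedule_time': '2017-02-22 19:03:43',  # optional
}
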
def main():
    log.info('start reading data...')
    source_table = 'zhuxiao_diaoxiao_company'
    target_table = 'offline_all_list'
    source_table_cursor = source_db.db[source_table].find(
        {},
        ['_id', 'province', 'registered_code', 'unified_social_credit_code'],
        no_cursor_timeout=True).batch_size(10000)
    cnt = 0
    insert_list = []
    count = 0
    real_insert_cnt = 0
    for item in source_table_cursor:
        count += 1
        company_name = item.get('_id')
        if company_name is None:
            continue
        province = item.get('province')
        registered_code = item.get('registered_code')
        unified_social_credit_code = item.get('unified_social_credit_code')
        # Derive the province from whichever identifier is available
        province = get_province(province, registered_code,
                                unified_social_credit_code)
        if province is None:
            log.error('failed to resolve province: company = {company}'.format(
                company=company_name))
            continue
        data = {
            '_id': util.generator_id({}, company_name, province),
            'company_name': company_name,
            'province': province,
            'in_time': util.get_now_time(),
        }
        insert_list.append(data)
        cnt += 1
        real_insert_cnt += 1
        if cnt >= 10000:
            target_db.insert_batch_data(target_table, insert_list, insert=True)
            cnt = 0
            del insert_list[:]
            log.info('insert 10000')
        log.info('progress: count = {count} company = {company}'.format(
            count=count, company=company_name))
    if len(insert_list) > 0:
        target_db.insert_batch_data(target_table, insert_list, insert=True)
        log.info('insert last data')
    source_table_cursor.close()
    log.info('total inserted: {cnt}'.format(cnt=real_insert_cnt))
    log.info('data sent, exiting')

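# get_province is not shown in this section; a sketch of one plausible
# strategy under stated assumptions: prefer the explicit province, otherwise
# fall back to the administrative-division digits embedded in the 18-character
# unified social credit code (characters 3-8) or at the start of the old
# 15-digit registration number. province_code_to_py is an illustrative stub,
# not the module's actual table.
province_code_to_py = {'44': 'guangdong', '62': 'gansu'}  # stub mapping

def get_province_sketch(province, registered_code, unified_social_credit_code):
    # Prefer an explicit province field when present
    if province:
        return province_zh_to_py.get(str(province), None)
    # Fall back to the region digits embedded in the credit code
    if unified_social_credit_code and len(unified_social_credit_code) == 18:
        return province_code_to_py.get(unified_social_credit_code[2:4])
    # Or to the leading digits of the old registration number
    if registered_code and len(registered_code) >= 2:
        return province_code_to_py.get(registered_code[:2])
    return None
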
def save_search_list(self, company, code, param_list):
    match_param = None
    if self.search_table is None:
        return param_list, match_param

    rank = 1
    data_list = []
    for param in param_list:
        # Only store entries that carry a list name
        search_name = param.get('search_name')
        if search_name is None:
            continue

        # Pull the parsed unified social credit code out of the params
        unified_social_credit_code = param.get('unified_social_credit_code')
        # The code is not kept inside the stored params
        if 'unified_social_credit_code' in param:
            unified_social_credit_code = unified_social_credit_code.strip().upper()
            param.pop('unified_social_credit_code')

        # Normalise full-width/half-width parentheses in the seed name
        if company is not None:
            replace_name_1 = company.replace('(', '(').replace(')', ')')
            replace_name_2 = company.replace('(', '(').replace(')', ')')
        else:
            replace_name_1 = ''
            replace_name_2 = ''

        # Priority 0 (highest) when the seed name or code matches exactly
        if search_name == company \
                or search_name == replace_name_1 \
                or search_name == replace_name_2 \
                or (code == unified_social_credit_code and code is not None):
            priority = 0
        else:
            priority = 1

        data = {
            # The search list name plus province form the unique primary key
            '_id': util.generator_id({'priority': priority}, search_name,
                                     self.province),
            'search_name': search_name,
            'province': self.province,
            'in_time': util.get_now_time(),
            'param': param,
            'rank': rank,
            'priority': priority,
            self.ERROR_TIMES: 0,
        }
        # Registration code, when present
        if unified_social_credit_code is not None:
            data['unified_social_credit_code'] = unified_social_credit_code
        # Seed information
        if company is not None:
            data['company_name'] = company
        if code is not None:
            data['seed_code'] = code
        # An exact match resets the crawl state
        if priority == 0:
            data[self.crawl_flag] = 0
            match_param = param.copy()
        data_list.append(data)
        rank += 1

    # In debug mode, skip the actual insert
    # if not is_debug:
    self.source_db.insert_batch_data(self.search_table, data_list)
    return param_list, match_param