def put(self, data, sleep_time=10, retry=9, email_list=functions.mailto_list_ourselves):
    """Pickle ``data`` and push it onto the SSDB queue, retrying until it succeeds.

    The first attempt goes through ``self.put_data``.  After any failure a
    replacement queue object is obtained from ``getSSDBQueuev2`` and used for
    the following attempts.  Every ``retry + 1`` consecutive failures an alert
    mail is sent and the loop pauses for 600 seconds before counting again.

    :param data: (dict) payload to enqueue; only dicts are ever written.
    :param sleep_time: (int) seconds to wait between ordinary retries.
    :param retry: (int) number of retries before an alert mail is sent.
    :param email_list: recipients handed to ``functions.send_mail_old``.
    :return: the result of ``put_data``, or None when ``data`` is not a dict.
    """
    # Only dicts are written.  Previously any other payload fell through both
    # branches without returning or raising, so the loop spun forever with no
    # sleep; reject such input immediately instead.
    if not isinstance(data, dict):
        return None

    failures = 0
    alert_threshold = retry + 1
    ssdb_queue = None  # replacement connection, created after the first failure
    while True:
        try:
            payload = pickle.dumps(data)
            if ssdb_queue is not None:
                return ssdb_queue.put_data(payload)
            return self.put_data(payload)
        except Exception as e:
            print(u'插入队列异常 %s' % exceputil.traceinfo(e))
            failures += 1
            if failures >= alert_threshold:
                # Send an alert mail, then back off for ten minutes.
                functions.send_mail_old(
                    email_list, u"ssdb队列更新异常",
                    u"错误信息%s \nqueue_name:%s" % (exceputil.traceinfo(e), self.queue_name))
                time.sleep(600)
                failures = 0
            else:
                time.sleep(sleep_time)
            # Reconnect before the next attempt.
            ssdb_queue = getSSDBQueuev2(self.queue_name, self.host, self.port,
                                        self.max_connections, self.timeout)
def get(self, sleep_time=10, retry=9, email_list=functions.mailto_list_ourselves):
    """Read one entry from the SSDB queue, retrying until a read succeeds.

    The first attempt uses ``self.get_data``; after a failure a replacement
    queue from ``getSSDBQueuev2`` is used instead.  Every ``retry + 1``
    consecutive failures an alert mail is sent and the loop pauses for 600
    seconds.

    :param sleep_time: (int) seconds to wait between ordinary retries.
    :param retry: (int) number of retries before an alert mail is sent.
    :param email_list: recipients handed to ``functions.send_mail_old``.
    :return: whatever ``get_data`` returns.
    """
    failed_attempts = 0
    mail_after = retry + 1
    fallback_queue = None
    while True:
        try:
            if fallback_queue is None:
                return self.get_data()
            return fallback_queue.get_data()
        except Exception as e:
            print(u'查询数据异常 %s' % exceputil.traceinfo(e))
            failed_attempts += 1
            if failed_attempts < mail_after:
                time.sleep(sleep_time)
            else:
                # Send an alert mail, then back off for ten minutes.
                functions.send_mail_old(email_list, u"ssdb队列更新异常",
                                        u"错误信息%s" % exceputil.traceinfo(e))
                time.sleep(600)
                failed_attempts = 0
            fallback_queue = getSSDBQueuev2(self.queue_name, self.host, self.port,
                                            self.max_connections, self.timeout)
def start(self):
    """Run the consumer loop; never returns.

    1. ``process_last_error`` handles whatever the previous run left behind.
    2. Fetch the next key with ``self.get``.
    3. Record the key in ``self.last_file`` and hand it to ``self.run``.
       On failure the key is put back with ``customer_queue_conn.putv2``.
    4. Clear ``self.last_file`` and go back to step 2.

    :return: (None)
    """
    self.process_last_error()
    while True:
        try:
            key = self.get()
            if key is None or len(key) < 1:
                self.customer_logging.error(u"队列内容为空")
                continue
        except Exception as read_error:
            self.customer_logging.error(u"读取redis集合错误,错误信息:%s" % (traceinfo(read_error)))
            time.sleep(10)
            continue

        # Remember the key on disk while it is being worked on.
        fileutil.write(self.last_file, key.encode("UTF-8"))
        try:
            self.run(key)
            fileutil.clear(self.last_file)
        except Exception as run_error:
            self.customer_logging.error(u"抓取异常,错误信息:%s" % (traceinfo(run_error)))
            # Hand the failed key back to the queue before clearing the marker file.
            self.customer_queue_conn.putv2(key, self.customer_queue_name)
            fileutil.clear(self.last_file)
            time.sleep(10)
        time.sleep(1)
def product(self, item, unique=None):
    """Produce ``item`` via ``product_real``, de-duplicating with the bloom filter.

    When ``self.bloomfilter_mode`` is false the item is produced directly.
    Otherwise ``unique`` (defaulting to ``item``) is inserted into the bloom
    filter and the item is produced only if the key was not present before.
    Bloom-filter errors are retried forever, rebuilding the client every
    5 seconds until a connection succeeds.

    :param item: (str) url or seed handed to ``product_real``.
    :param unique: key used for the uniqueness check; ``item`` when None.
    :return: (None)
    :raises Exception: anything raised by ``product_real`` propagates.
    """
    if not self.bloomfilter_mode:
        self.product_real(item)
        return

    if unique is None:
        unique = item

    # Only the bloom-filter call is retried.  ``product_real`` used to run
    # inside the same try block: if it failed after the key had been inserted,
    # the retry saw the key as "already processed" and the item was silently
    # dropped.
    while True:
        try:
            is_new = self.bloomFilterclient.insert_if_not_exists(unique)
            break
        except Exception as e:
            self.product_logging.error(u"访问布隆过滤器发生错误!,错误信息:%s" % traceinfo(e))
            # Rebuild the client until a connection succeeds, then retry.
            while True:
                try:
                    self.bloomFilterclient = BloomFilterClient(
                        self.bf_host, self.bf_port)
                    break
                except Exception as e1:
                    self.product_logging.error(
                        u"布隆过滤器连接发生错误!,错误信息:%s" % traceinfo(e1))
                    time.sleep(5)

    if is_new:
        self.product_logging.info(u"此元素未处理,item:%s" % unique)
        self.product_real(item)
    else:
        self.product_logging.info(u"此元素已处理,item:%s" % unique)
def getSSDBQueuev2(queue_name, host='127.0.0.1', port=8888, max_connections=1, timeout=30, retry=9, sleep_time=30, email_list=functions.mailto_list_ourselves):
    """Return a queue from ``getSSDBQueue``, retrying until the call succeeds.

    Every ``retry + 1`` consecutive failures an alert mail is sent and the
    loop pauses for 600 seconds before counting again.

    :param queue_name: name of the queue to open.
    :param host: server host.
    :param port: server port.
    :param max_connections: forwarded to ``getSSDBQueue``.
    :param timeout: forwarded to ``getSSDBQueue``.
    :param retry: (int) number of retries before an alert mail is sent.
    :param sleep_time: (int) seconds to wait between ordinary retries.
    :param email_list: recipients handed to ``functions.send_mail_old``.
    :return: the object returned by ``getSSDBQueue``.
    """
    failures = 0
    alert_threshold = retry + 1
    while True:
        try:
            return getSSDBQueue(queue_name, host=host, port=port,
                                max_connections=max_connections, timeout=timeout)
        except Exception as e:
            print(exceputil.traceinfo(e))
            failures += 1
            # ``>=`` matches the threshold used by the ``put``/``get`` retry
            # loops; the previous ``>`` allowed one extra attempt before alerting.
            if failures >= alert_threshold:
                # Send an alert mail, then back off for ten minutes.
                functions.send_mail_old(email_list, u"redis队列连接异常",
                                        u"错误信息%s" % exceputil.traceinfo(e))
                time.sleep(600)
                failures = 0
            else:
                time.sleep(sleep_time)
def main_():
    """Export a per-province sample of ``qyxx`` documents to a GBK text file.

    For every province in the quota table, up to the configured number of
    documents are read from the ``qyxx`` collection, a few HTML fields are
    dropped for some provinces, and each document is written as one JSON
    line.  A document that cannot be converted is reported and skipped.
    """
    # Province -> maximum number of documents to export.
    dict_ = {u"北京": 30000, u"上海": 30000, u"江苏": 20000, u"浙江": 20000}
    # Province -> fields removed before export.
    dropped_fields = {
        u"上海": ["other_html", "company_html"],
        u"浙江": ["baxx_html"],
    }
    # ``with`` guarantees the file is closed even if a query fails midway.
    with open("D:/hehongjing/xxxx.json", "w") as f:
        # The collection handle does not depend on the province; fetch it once.
        table = mongoutil.getmondbbyhost("bigdata_higgs", "qyxx").table
        for dd in dict_:
            print(dd)
            tables = table.find({"type": dd}).limit(dict_[dd])
            for tt in tables:
                try:
                    for _html in dropped_fields.get(dd, ()):
                        tt.pop(_html, None)
                    jj = json.dumps(tt)
                    jj = jj.replace(u"xa0", "").replace("ue001", "").replace(
                        "ue00b", "").replace("\\\\", "\\")
                    # Turn the \uXXXX escapes back into characters, then re-encode.
                    js = jj.decode('raw_unicode_escape').decode("UTF-8").encode(
                        "GBK")
                    f.write(js + "\n")
                except Exception as e:
                    # Report the failure instead of discarding the traceback text.
                    print(exceputil.traceinfo(e))
    print("end")
def put_proxy_into_queue_or_set(self, type='queue'):
    """Return ``self.proxy`` to the white-list queue or record it in the black-list zset.

    :param type: ``"queue"`` puts the proxy back into ``proxy_white_list_db``;
        any other value stores it in ``proxy_black_list_db`` scored with the
        current Unix time.
    :return: the result of the storage call, or None if it raised.
    """
    try:
        if type != "queue":
            return self.proxy_black_list_db.ssdb_put_zset(self.proxy, score=int(time.time()))
        return self.proxy_white_list_db.put_data_back(self.proxy)
    except Exception as e:
        exceputil.traceinfo(e)
def write_file(open_file, write_str, pattern):
    """Write ``str(write_str)`` plus a newline to ``open_file``, retrying until it works.

    :param open_file: path of the file to open.
    :param write_str: value to write; converted with ``str``.
    :param pattern: mode string passed to ``open`` (for example ``"a+"``).
    :return: (None)
    """
    while True:
        try:
            # ``with`` closes the handle even when the write itself fails;
            # previously a failed write leaked one handle per retry.
            with open(open_file, pattern) as f:
                f.write(str(write_str) + "\n")
            break
        except Exception as e:
            exceputil.traceinfo(e)
            time.sleep(1)
def get_proxy_qyxx(self, need_check=False, is_debug=False, area=u"电信"):
    """Return a proxy address for this site.

    In debug mode (``config.debug``) no proxy is used.  Otherwise the proxy
    service is queried with ``self.pinyin`` and ``self.proxy_typess``; if
    that request fails, ``proxyutils.choice_proxy`` is used as a fallback.

    :param need_check: not used by this method.
    :param is_debug: forwarded to ``proxyutils.choice_proxy`` on fallback.
    :param area: forwarded to ``proxyutils.choice_proxy`` on fallback.
    :return: None in debug mode, otherwise the proxy returned by the service
        or by the fallback.
    """
    try:
        if config.debug:
            return None
        url = "http://spider7:9876/qyxx?area=%s&type=%s" % (self.pinyin, self.proxy_typess)
        return get(url).text.strip()
    except Exception as e:
        exceputil.traceinfo(e)
        # Honour the caller's arguments.  The fallback used to hard-code
        # is_debug=False and area=u"电信", which are exactly the defaults, so
        # callers relying on the defaults see no change.
        return proxyutils.choice_proxy(is_debug=is_debug, area=area,
                                       host=config.proxy_host, port=config.proxy_port)
def get_exec_files(style, dir_path):
    """Recursively collect files under ``dir_path`` whose path ends with ``style``.

    Matching paths are appended to the module-level ``fil_tuples`` list.
    Paths ending in ``pyc`` and entries whose name is in ``not_tuples`` are
    skipped.  Errors while descending into a sub-entry are ignored.

    :param style: suffix a file path must end with.
    :param dir_path: directory to scan.
    :return: the module-level ``fil_tuples`` list.
    """
    global fil_tuples
    for entry in os.listdir(dir_path):
        full_path = os.path.join(dir_path, entry)
        if not os.path.isfile(full_path):
            # Anything that is not a regular file is treated as a directory.
            try:
                get_exec_files(style, full_path)
            except Exception as e:
                exceputil.traceinfo(e)
            continue
        if (str(full_path).endswith(style)
                and not str(full_path).endswith("pyc")
                and entry not in not_tuples):
            fil_tuples.append(full_path)
    return fil_tuples
def clear_log(root):
    """Print stale ``qyxx`` log files found under ``root``.

    Only entries whose name starts with ``qyxx_weixin`` are considered.
    Regular files matching ``qyxx.*?log`` that were modified more than three
    days before the module-level ``nowtime`` are printed (the actual removal
    is disabled).  Other matching entries are scanned recursively, and
    errors raised while doing so are ignored.

    :param root: directory to scan.
    :return: (None)
    """
    max_age = 3 * 24 * 3600
    for name in os.listdir(root):
        if not re.match(r"qyxx_weixin", name):
            continue
        path = os.path.join(root, name)
        if not os.path.isfile(path):
            try:
                clear_log(path)
            except Exception as e:
                exceputil.traceinfo(e)
            continue
        if re.search(r"qyxx.*?log", name):
            modified_at = os.path.getmtime(path)
            if nowtime - modified_at > max_age:
                # Deletion is intentionally switched off; only report the file.
                print(path)
def record_success(self, yzm, img_path, count=10000):
    """Archive a successfully solved captcha image together with its answer.

    The image is copied under a fresh UUID-based name into
    ``../yzm_success/<self.pinyin>`` and a line ``"<image name> <yzm>"`` is
    appended to ``ans.txt`` in the same directory.  Nothing is stored once
    the directory holds more than ``count`` files.  Any error is logged and
    swallowed.

    :param yzm: captcha text.
    :param img_path: path of the captcha image to copy.
    :param count: maximum number of stored files, 10000 by default.
    :return: (None)
    """
    try:
        dir_path = os.path.abspath('../')
        yzm_dir = os.path.join(dir_path, "yzm_success", self.pinyin)
        if not fileutil.isdir(yzm_dir):
            # Create the archive directory on first use.
            fileutil.mkdirs(yzm_dir)
        pics = sum(len(files) for _, _, files in os.walk(yzm_dir))
        self.logging.info(u"已存放%d张验证码图片" % (pics - 1))
        if pics > count:
            self.logging.warn(u"已存放超%d张验证码图片" % count)
            return
        # Unique file name for the captcha image.
        img = "%s.jpg" % str(uuid.uuid1())
        # Record which image belongs to which answer.  ``with`` closes the
        # handle even if the write fails.
        text_file_name = os.path.join(yzm_dir, "ans.txt")
        with open(text_file_name, "a") as answers:
            answers.write(img + ' ' + yzm + '\n')
        # Store the captcha image itself.
        img_name = os.path.join(yzm_dir, img)
        fileutil.copyfile(img_path, img_name)
    except Exception as e:
        self.logging.error(u"记录发生异常.错误信息:%s" % exceputil.traceinfo(e))
def getsheet(filename, sheetname):
    """Open an Excel workbook and wrap one of its sheets.

    :param filename: path of the Excel file.
    :param sheetname: name of the sheet to load.
    :return: ``sheet(...)`` wrapping the requested sheet, or None when the
        workbook or the sheet cannot be opened (the error is printed).
    """
    try:
        device_workbook = xlrd.open_workbook(filename)
        excel_sheet = device_workbook.sheet_by_name(sheetname)
        return sheet(excel_sheet)
    except Exception as e:
        # The arguments must be applied with ``%``; the previous comma printed
        # the raw format string followed by a tuple.
        print(u'文件名%s,错误信息:%s' % (filename, traceinfo(e)))
def get_index_and_other_list(self, key, data_list):
    """Find the entry whose ``_id`` equals ``key`` and return it with the entries after it.

    :param key: value compared against each entry's ``'_id'``.
    :param data_list: sequence of dict-like entries.
    :return: ``(entry, entries_after_it)`` for the first match; None when
        nothing matches or an error occurs (the error is logged).
    """
    try:
        for position, entry in enumerate(data_list, 1):
            if key == entry['_id']:
                return entry, data_list[position:]
    except Exception as e:
        self.logging.error(u'获取列表剩余异常 %s' % exceputil.traceinfo(e))
def json_to_dict(self, data):
    """Parse a JSON string.

    :param data: JSON text, or None.
    :return: the decoded value; None when ``data`` is None, when the JSON
        value is ``null``, or when decoding fails (the error is logged).
    """
    if data is None:
        return None
    try:
        return json.loads(data)
    except Exception as e:
        self.logging.error(u'转换dict异常 %s' % exceputil.traceinfo(e))
def exce_file_subprocess(exce_file):
    """Run a ``.py`` or ``.bat`` file in a child process and wait for it to finish.

    The child's pid is appended to ``./pid/<dir_last_file>_pid.txt``.  If
    anything fails, the child (when one was started) is killed and the pid is
    appended to ``./pid/<dir_last_file>_error.txt``.

    :param exce_file: path of the file to execute; must end with ``py`` or
        ``bat``, otherwise nothing is started.
    :return: (None)
    """
    # Pre-bind the names used by the error handler so that a failure inside
    # Popen itself cannot raise NameError there.
    child_sub = None
    pid_text = ""
    try:
        if exce_file.endswith("py"):
            # Python script: run it with the configured interpreter.
            child_sub = subprocess.Popen("%s %s" % (python_path, exce_file), shell=True)
        elif exce_file.endswith("bat"):
            # Windows batch file.
            child_sub = subprocess.Popen(exce_file, shell=True)
        else:
            # Message corrected: it used to say "neither py nor py".
            logging.info(u"启动的文件既不是py,也不是bat,启动文件错误:%s" % exce_file)
            return
        pid_text = str(child_sub.pid)
        write_file(u"./pid/%s_pid.txt" % dir_last_file, pid_text + "|" + exce_file, "a+")
        child_sub.wait()
        time.sleep(1)
    except Exception as e:
        exceputil.traceinfo(e)
        print(u"执行%s时候出错" % exce_file)
        # Kill the child if one was started; it may already have exited.
        if child_sub is not None:
            try:
                child_sub.kill()
            except OSError:
                pass
        write_file(u"./pid/%s_error.txt" % dir_last_file, pid_text + "|" + exce_file, "a+")
        time.sleep(3)
def __init__(self, queue_name):
    """Set up logging and connect to MongoDB and SSDB for ``queue_name``.

    Both connections are retried every 60 seconds until they succeed.

    :param queue_name: queue name; also used as the log directory and as
        the suffix of the database name.
    """
    fileutil.mkdirs(queue_name)
    self.logging = get_logger(queue_name + '/' + 'ssdb_save')
    self.queue_name = queue_name
    self.db_name = 'bigdata_higgs_' + queue_name
    self.logging.info(self.db_name)

    mongo_ready = False
    while not mongo_ready:
        try:
            self.logging.info(u'连接mongo')
            self.mongo = mongoutil.getmondbv2(
                config.mongo_host,
                config.mongo_port,
                self.db_name,
                config.table_name,
                username=config.mongo_username,
                password=config.mongo_passwd)
            mongo_ready = True
        except Exception as e:
            self.logging.error(u'连接mongo异常 %s' % exceputil.traceinfo(e))
            time.sleep(60)

    ssdb_ready = False
    while not ssdb_ready:
        try:
            self.logging.info(u'连接ssdb')
            self.ssdb = ssdbutil2.getSSDBQueuev2(self.queue_name,
                                                 host=config.ssdb_host,
                                                 port=config.ssdb_port)
            ssdb_ready = True
        except Exception as e:
            self.logging.error(u'连接ssdb异常 %s' % exceputil.traceinfo(e))
            time.sleep(60)
def get_all_from_ssdb(self):
    """Drain ``self.ssdb`` and return every entry, unpickled.

    Entries are read until ``self.ssdb.size()`` reports 0.  On any error the
    method logs it, waits 60 seconds, reconnects and carries on.

    :return: (list) unpickled entries in the order they were read.
    """
    drained = []
    while True:
        try:
            entry = self.ssdb.get()
            if entry is not None and len(entry) > 0:
                drained.append(pickle.loads(entry[0]))
            if self.ssdb.size() == 0:
                break
        except Exception as e:
            self.logging.error(u'获取剩余全部队列数据异常 %s' % exceputil.traceinfo(e))
            time.sleep(60)
            self.ssdb = ssdbutil2.getSSDBQueuev2(self.queue_name,
                                                 host=config.ssdb_host,
                                                 port=config.ssdb_port)
    return drained
def run(self):
    """Thread main loop: persist the bloom filter roughly once a day.

    When at least 24 hours have passed since ``self.store_last_time`` the
    filter is written to ``self.storefile`` and the thread sleeps for a day;
    otherwise it sleeps for an hour and checks again.

    :return: (None) -- loops until an exception outside ``Exception``
        (for example ``SystemExit``) ends the thread.
    """
    one_day = 24 * 3600
    while True:
        try:
            elapsed = time.time() - self.store_last_time
            if elapsed >= one_day:
                self.bloomfilter.write_file(self.storefile)
                self.store_last_time = time.time()
                time.sleep(one_day)
            else:
                time.sleep(3600)
        except Exception as e:
            # ``Exception`` rather than ``BaseException`` so that SystemExit
            # and KeyboardInterrupt can still stop the thread.
            logger.error(u"错误信息:%s" % traceinfo(e))
            # Pause before retrying so a persistent failure (for example a
            # full disk) does not turn into a tight error loop.
            time.sleep(60)
def back_money(self, recChar, code_id, yzm, img_path):
    """Request a refund for a wrongly solved captcha and archive the evidence.

    The captcha text and image are stored under
    ``./<self.pinyin>/<YYYY-MM-DD>/``.  When the refund call succeeds they go
    into the ``1`` sub-directory as ``<code_id>.txt`` / ``<code_id>.png``;
    when it fails they are stored as ``0_<code_id>.txt`` / ``0_<code_id>.png``.

    :param recChar: captcha-service client providing ``reportErrorID``.
    :param code_id: id assigned by the captcha service; ``"0"`` marks a
        manually solved captcha for which no refund is needed.
    :param yzm: captcha text.
    :param img_path: path of the captcha image.
    :return: (None)
    """
    if code_id == "0":
        self.logging.warning(u"手工打码,无需退钱")
        return
    if recChar is None:
        # ``Logger`` has no ``err`` method; the old call raised AttributeError.
        self.logging.error(u"退钱发生异常。recChar==None")
        return
    # Count the failed recognition.
    self.yzm_error += 1
    today = timeutil.format("%Y-%m-%d", time.time())
    dir_path = os.path.abspath('.')
    yzm_dir = os.path.join(dir_path, self.pinyin, today)
    if not fileutil.isdir(yzm_dir):
        # Create today's archive directory.
        fileutil.mkdirs(yzm_dir)
    try:
        # Ask the service for a refund using the code id.
        recChar.reportErrorID(code_id)
        # Successful refunds are archived under the "1" sub-directory.
        img_name = os.path.join(yzm_dir, str(1), "%s.png" % code_id)
        text_file_name = os.path.join(yzm_dir, str(1), "%s.txt" % code_id)
        fileutil.write(text_file_name, yzm.encode("UTF-8", "ignore"))
        fileutil.copyfile(img_path, img_name)
        self.logging.error(u"验证码没识别出来,退钱正常")
    except Exception as ee:
        # Failed refunds are archived with a "0_" prefix.  os.path.join gives
        # the same path as the former hard-coded "\\" on Windows and a valid
        # one elsewhere.
        img_name = os.path.join(yzm_dir, "%d_%s.png" % (0, code_id))
        text_file_name = os.path.join(yzm_dir, "%d_%s.txt" % (0, code_id))
        fileutil.write(text_file_name, yzm.encode("UTF-8", "ignore"))
        fileutil.copyfile(img_path, img_name)
        self.logging.error(u"验证码没识别出来,errorType=5 。退钱发生异常.error:%s" % exceputil.traceinfo(ee))
def get_logger(app, **kwargs):
    """Build and return the logger named ``app``.

    Settings start from ``DEFAULT_CONF`` and are overridden by ``kwargs``.
    Depending on the ``is_file``, ``is_console`` and ``is_queue`` switches a
    timed rotating file handler, a console handler and a ``QueueHandler``
    are attached.

    :param app: logger name; also stored under the ``'app'`` setting.
    :param kwargs: overrides for individual ``DEFAULT_CONF`` settings.
    :return: (logging.Logger) the configured logger.
    """
    # Work on a copy: assigning DEFAULT_CONF directly made every call write
    # its ``app`` and overrides into the shared defaults, so they leaked into
    # all later calls.
    dict_config = dict(DEFAULT_CONF)
    dict_config['app'] = app
    dict_config.update(kwargs)

    logger = logging.getLogger(app)
    logger.setLevel(dict_config['level'])

    if dict_config['is_file']:
        # The log file is rotated automatically by time.
        filehandler = TimedRotatingFileHandler(
            dict_config['filepath'] + dict_config['filename_prefix'],
            dict_config['when'],
            dict_config['interval'],
            dict_config['backup_count'])
        # Suffix appended to rotated files.
        filehandler.suffix = dict_config['filename_suffix']
        # Per-line format of the file log.
        formatter = logging.Formatter(fmt=dict_config['format_file'],
                                      datefmt=dict_config['datefmt_file'])
        filehandler.setFormatter(formatter)
        filehandler.setLevel(dict_config['level_file'])
        logger.addHandler(filehandler)

    # Console output.
    if dict_config['is_console']:
        console = logging.StreamHandler()
        console.setLevel(dict_config['level_console'])
        formatter_console = logging.Formatter(dict_config['format_console'])
        console.setFormatter(formatter_console)
        logger.addHandler(console)

    # Queue output; a failure here is reported but does not prevent the
    # logger from being returned.
    if dict_config['is_queue']:
        try:
            qh = QueueHandler(**dict_config)
            formatter_queue = logging.Formatter(dict_config['format_queue'])
            qh.setFormatter(formatter_queue)
            qh.setLevel(dict_config['level_queue'])
            logger.addHandler(qh)
        except Exception as e:
            print(u'启动kafka 出错:{}'.format(traceinfo(e)))
    return logger
def get_data_from_ssdb(self):
    """Read up to 1000 entries from ``self.ssdb`` and return them unpickled.

    The queue is polled 1000 times; empty reads are skipped and failed
    attempts count towards the 1000.  After a failure the method sleeps 60
    seconds and reconnects, but only for the first nine failures.

    :return: (list) unpickled entries in the order they were read.
    """
    collected = []
    reconnects_left = 10
    for _ in range(1000):
        try:
            entry = self.ssdb.get()
            if entry is not None and len(entry) > 0:
                collected.append(pickle.loads(entry[0]))
        except Exception as e:
            self.logging.error(u'从ssdb中弹出数据异常 %s' % exceputil.traceinfo(e))
            reconnects_left -= 1
            if reconnects_left > 0:
                time.sleep(60)
                self.ssdb = ssdbutil2.getSSDBQueuev2(self.queue_name,
                                                     host=config.ssdb_host,
                                                     port=config.ssdb_port)
    return collected
def save_data(self, last_failure_file='ssdb_mongo.data', wait_time=300):
    """Replay the records kept in ``last_failure_file`` into MongoDB.

    Nothing is done when the file does not exist.  Otherwise every line is
    parsed with ``self.json_to_dict`` and each resulting dict that carries an
    ``_id`` is upserted into ``self.mongo.table``; afterwards the file is
    removed.  On an unexpected error the method waits 5 seconds, reconnects
    to MongoDB and tries again.

    :param last_failure_file: (str) file holding one JSON document per line.
    :param wait_time: (int) not used in the visible part of this method.
    """
    if os.path.exists(last_failure_file) == True:
        failed_list = []
        count = 0
        with open(last_failure_file, 'rb') as f:
            for line in f:
                # Lines that fail to parse become None and are skipped below.
                failed_list.append(
                    self.json_to_dict(line.strip().strip('\n')))
                count += 1
        while True:
            try:
                self.logging.info('Last Failed File :%d' % len(failed_list))
                if failed_list != None and len(failed_list) > 0:
                    for data in failed_list:
                        if data == None:
                            continue
                        if '_id' in data.keys():
                            _id = data['_id']
                        else:
                            _id = None
                        if isinstance(data, dict) and _id != None:
                            try:
                                # Third positional argument True: upsert.
                                self.mongo.table.update({'_id': _id}, data, True)
                                self.logging.info(u'成功update一条数据:%s' % _id)
                            except Exception, e:
                                # A failed record is only logged; it is not retried.
                                self.logging.info(u'fail-update一条数据:%s' % _id)
                    os.remove(last_failure_file)
                    # An earlier bulk-insert variant
                    # (self.mongo.table.insert(failed_list, safe=True) followed
                    # by a count check) was left commented out here.
                break
            except pymongo.errors.OperationFailure as e:
                # NOTE(review): the per-record try/except above already catches
                # update errors, so this handler is presumably a leftover from
                # the bulk-insert variant -- confirm before relying on it.
                if e != None and e != '':
                    self.logging.info(e)
                    # Pull the duplicated _id out of the "dup key" error text.
                    _id = re.findall(
                        r'.*?dup key:.*?\{.*?:.*?\"(.*?)\".*?\}', str(e))
                    # NOTE(review): _id[0] is read before the length check
                    # below, so a message without a match raises IndexError.
                    self.logging.info('_id:%s' % _id[0])
                    if len(_id) > 0:
                        # NOTE(review): get_index_and_other_list returns None
                        # when nothing matches, which makes this unpacking fail.
                        update_data, other_list = self.get_index_and_other_list(
                            _id[0], failed_list)
                        if update_data != None:
                            self.mongo.table.update(
                                {'_id': update_data['_id']}, update_data, True)
                            self.logging.info(u'update data:%s 成功' % _id[0])
                        if other_list != None and len(other_list) > 0:
                            # Continue with the records after the duplicate.
                            failed_list = other_list
                        else:
                            break
                        continue
                    else:
                        break
                else:
                    break
            except Exception as e:
                self.logging.error(u'存mongo数据异常 %s' % exceputil.traceinfo(e))
                time.sleep(5)
                # Reconnect before the next attempt.
                self.mongo = mongoutil.getmondbv2(
                    config.mongo_host,
                    config.mongo_port,
                    self.db_name,
                    config.table_name,
                    username=config.mongo_username,
                    password=config.mongo_passwd)
sheet = book.sheet_by_name(sheet_) if len(sheet.row_values(0)) > 2 and len( sheet.col_values(m)) > 2: col_data = sheet.col_values(m) else: continue if re.search(u"(企业)?(更名后名称)?", col_data[0]): begin_num = 1 else: begin_num = 0 for col in range(begin_num, len(col_data)): f1.write( col_data[col].strip().replace("•", "").encode("GBK") + "\n") except Exception as e: logging.error(exceputil.traceinfo(e)) continue time.sleep(2) f1.close() # # encoding : utf-8 #设置编码方式 # # import xlrd #导入xlrd模块 # # #打开指定文件路径的excel文件 # # xlsfile = r'D:\AutoPlan\apisnew.xls' # book = xlrd.open_workbook(xlsfile) #获得excel的book对象 # # #获取sheet对象,方法有2种: # sheet_name=book.sheet_names()[0] #获得指定索引的sheet名字
def parse_yzm(self,img_url,img_src,typecode,yzm_max_len=4,type=None): """ 对验证码进行人工打码验证 :param img_url: 验证码图片地址 :param img_src: 验证码图片内容 :param typecode: :param yzm_max_len: 验证码最大长度 :return: (unicode,unicode,bool,RecChar,unicode)(验证码内容, 打码系统id, 是否正常,打码对象,验证码图片地址) """ try: dir_path=os.path.abspath('.') urlpret = urlparse.urlparse(img_url) img_path = os.path.join(dir_path,"%s_%s.png"%(urlpret.hostname,self.pinyin)) print "img_path:", img_path, "type:", type fileutil.write(img_path,img_src) self.logging.info(u"请求验证码") #发送给打码公司打码 或 机器打码 if type!=None and len(type)>0: if self.recChar == None: self.recChar=RecChar(type) ret=self.recChar.rec(img_path) if ret!=None and len(ret)>0: yzm= str(ret[0]) print "yzm:",yzm if chardet.detect(yzm)['encoding'] == "utf-8": yzm = yzm.decode("utf-8") if yzm!=None and yzm.lower()=="none": yzm=None return yzm,"0",False,self.recChar,img_path else: raise Exception(u"机器打码返回值为None或长度为0.") else: if self.recChar == None: self.recChar = RecChar() self.yzm_count+=1 (yzm, code_id, is_report_error,img_path)=self.bbd_yzm(img_src) #(yzm, code_id, is_report_error) = self.recChar.rec(img_path, typecode=typecode); # 手工打码,用于测试 # recChar="" # yzm=raw_input() # yzm= yzm.decode("UTF-8",'ignore') # code_id="asdfasdfasdf" # is_report_error=False # print "yzm:",yzm self.logging.info(u"验证码返回结果,yzm:%s,code_id:%s,is_report_error:%s"%(yzm, str(code_id), str(is_report_error))) #退钱需要用coid_id,如果coid为空则证明没有打码失败没有收费,所以不需要退钱 # if len(str(code_id))<4: # self.logging.error(u"验证码识别错误。errorType=1,coid_id为空") # self.yzm_error+=1 # raise self.ValidYzmException(u"验证码识别错误") # #验证码内容为空 # if len(yzm)<1 or len(yzm)>yzm_max_len: # self.logging.error(u"验证码识别错误。errorType=2,验证码长度不在正确范围") # if len(code_id)>=4: # self.back_money(self.recChar,code_id,yzm,img_path) # else: # self.yzm_error+=1 # time.sleep(0.1) # raise self.ValidYzmException(u"验证码识别错误") return (yzm, code_id, is_report_error,self.recChar,img_path) except self.ValidYzmException as e1: 
self.logging.error(u"验证码处理异常,error:%s"%exceputil.traceinfo(e1)) raise except Exception as yzmerror: self.logging.error(u"验证码处理异常,errorType=4,error:%s"%exceputil.traceinfo(yzmerror)) # self.back_money(self.recChar,code_id,yzm,img_path) raise
def process(self):
    """Main crawl loop for one site; never leaves the ``while True`` loop.

    1. ``process_last_error`` handles the company left over from the last run.
    2. Pop a company name and write it to ``self.last_company_key_file``.
    3. Pick or rotate the proxy, then call ``crawler_scheduler``.
    4. On failure the company is appended to the bottom of the queue and the
       local file is cleared.
    5. Go back to step 2.

    :return: (None)
    """
    self.logging.info(u"开始%s站内容抓取" % self.chinese)
    # Handle the company that was in progress when the previous run stopped.
    self.process_last_error()
    # Remaining budget of consecutive captcha failures; once it drops below
    # zero the loop sleeps for 30 minutes.
    fail_count = 10
    while True:
        try:
            # Read the next company name from the queue.
            company_name = self.pop_company().decode("UTF-8", "ignore")
            if len(company_name) < 1 or len(company_name) > 2000:
                raise Exception(u"公司长度不合理")
            self.keyword = company_name
        except Exception as e:
            self.logging.error(u"队列取值错误.error:%s" % exceputil.traceinfo(e))
            time.sleep(1)
            continue
        # Keep the value taken from the queue in a local file.
        fileutil.write(self.last_company_key_file, company_name.encode("UTF-8", "ignore"))
        try:
            # Decide whether to keep the current proxy or fetch a new one.
            self.logging.info(u"代理《%s》使用次数:%s"%(self.proxy,self.proxy_num))
            try:
                if self.proxy:
                    if self.pinyin not in proxy_series_configure :
                        # Proxies on ports 42271/42272 are always replaced.
                        if str(self.proxy.split(":")[-1]) in ["42271","42272"]:
                            self.proxy = self.get_useful_proxy()
                            self.logging.error(u"使用优化代理:%s开始抓取,公司名:%s" % (self.proxy, company_name))
                        # Replace the proxy once it reached this site's use limit (default 50).
                        elif self.proxy_num>=proxy_none_series_configure.get(self.pinyin,50):
                            self.proxy = self.get_useful_proxy()
                            self.logging.error(u"使用优化代理:%s开始抓取,公司名:%s" % (self.proxy, company_name))
                        else:
                            self.proxy_series_error=0
                            self.proxy_num+=1
                    # Sites listed in proxy_series_configure: use limit defaults to 2000.
                    elif self.proxy_num>=proxy_series_configure.get(self.pinyin,2000):
                        self.proxy = self.get_useful_proxy()
                    else:
                        self.proxy_series_error=0
                        self.proxy_num+=1
                else:
                    self.proxy = self.get_useful_proxy()
            except Exception as e:
                # A proxy-selection problem is logged and does not stop the crawl.
                self.logging.error(e)
            # Hand the company name to the crawl scheduler.
            ret = self.crawler_scheduler(company_name)
            fileutil.clear(self.last_company_key_file)
            # Success: reset the failure budget.
            fail_count = 10
        except self.ValidException as e1:
            # One more consecutive captcha failure.
            fail_count -= 1
            # Put the failed company back at the bottom of the queue.
            self.append_bottom(company_name)
            # Clear the file that remembers the company in progress.
            fileutil.clear(self.last_company_key_file)
            # Too many consecutive failures: pause before going on.
            if (fail_count < 0):
                # TODO: mail notification, currently disabled:
                # if hasattr(self,"kafka_mail"):
                #     self.kafka_mail.send_mail(self.mail_list, u'%s站验证码识别异常报告' % self.chinese, u'公司爬取失败数超过10个')
                self.logging.error(u"公司爬取失败数超过10个")
                # Shortened pause; machine recognition does not need a long one.
                time.sleep(60 * 30)
                # Awake again: reset the failure budget.
                fail_count = 10
            continue
        except Exception as e:
            self.logging.error(u"公司抓取异常。公司名:%s error:%s" % (company_name, exceputil.traceinfo(e)))
            # Put the failed company back at the bottom of the queue.
            self.append_bottom(company_name)
            # Clear the file that remembers the company in progress.
            fileutil.clear(self.last_company_key_file)
    # Not reached: the loop above has no break.
    self.logging.info(u"%s站内容抓取完成" % self.chinese)
JSONEncoder().encode(line) + '\n' for line in data_list ] f.writelines(other_list) self.logging.info(u'获取一千个数据') end_time = time.time() self.logging.info(u'获取一千条数据消耗时间为:%d' % (end_time - start_time)) break else: self.logging.info(u'休眠5s,等待数据') count_time += 1 time.sleep(5) continue except Exception as e: self.logging.error(u'取数据异常 %s' % exceputil.traceinfo(e)) time.sleep(30) self.ssdb = ssdbutil2.getSSDBQueuev2(self.queue_name, host=config.ssdb_host, port=config.ssdb_port) start_time = None end_time_1 = None end_time_2 = None while True: try: if data_list != None and len(data_list) > 0: count = len(data_list) start_time = time.time() for data in data_list: if data == None:
def crawler_scheduler(self, company_name):
    """Crawl one seed, trying several lookup keys in order.

    The seed may be a registration number (digits only), a JSON object, or a
    plain company name.  Lookups are attempted in this order, each only when
    the earlier ones found nothing: registration number (``zch``), company
    url, company name, credit code (``xydm``).

    :param company_name: (str) seed taken from the queue.
    :return: True when the crawl found the requested company;
        u"没有指定公司" when data was found but not the requested company;
        u"没有查询到企业信息" when nothing was found;
        ``(0, False)`` when the seed is a byte string detected as ascii.
    :raises Exception: when any lookup raised; the message lists the
        collected errors.  ``self.ValidException`` raised elsewhere in the
        method is logged and re-raised.
    """
    has_data = False
    try:
        self.logging.info(u"分词:%s"%company_name)
        company_dic = {}
        is_dic = False
        try:
            # Normalise the seed into company_dic.
            pattern_str = r"^\d{%d}" % len(company_name)
            if re.match(pattern_str, company_name):
                # Digits only: a registration number.
                company_dic[u"zch"] = company_name
            else:
                if company_name.startswith('{') and company_name.endswith('}'):
                    # JSON seed: its keys become the lookup keys.
                    company_name=json.loads(company_name)
                    company_dic.update(company_name)
                else:
                    if not isinstance(company_name,unicode):
                        encoding= chardet.detect(company_name).get("encoding")
                        if encoding:
                            if encoding=='ascii':
                                self.logging.error(u"关键字编码错误")
                                return 0,False
                            else:
                                company_name=company_name.decode(encoding,'ignore')
                    company_dic['name']=company_name
            # A nested 'keyword' dict carrying zch/name is merged into the top level.
            if 'keyword' in company_dic and ('zch' in company_dic.get('keyword') or 'name' in company_dic.get('keyword')):
                company_dic.update(company_dic.get('keyword'))
                is_dic = True
        except Exception as e:
            self.logging.error(u"种子队列转换出错:%s" % e)
            company_dic[u"name"] = company_name

        # Lookup priority: registration number -> company name -> credit code.
        company_count = 0
        inner_company = False
        is_Exception = []  # errors raised by the individual lookups

        # (1) registration number
        if company_dic.get(u"zch"):
            try:
                temp_key = company_dic.get(u"zch")
                self.logging.info(u"抓取(1):%s" % temp_key)
                company_count, inner_company = self.crawler(temp_key, temp_key)
            except Exception as e:
                is_Exception.append(e)

        # (2) company url
        if company_count <1 and company_dic.get(u"name") and company_dic.get(u"url"):
            try:
                company_name=company_dic.get(u"name")
                self.logging.info(u"抓取(2):%s" % company_name)
                company_url=company_dic.get(u"url")
                company_count,inner_company=self.crawler_url(company_url,company_name)
            except Exception as e:
                is_Exception.append(e)

        # (3) companies were found, but not the requested one: retry by name
        if company_count>=1 and inner_company==False and company_dic.get(u"name"):
            try:
                temp_key = company_dic.get(u"name")
                self.logging.info(u"抓取(3):%s" % temp_key)
                company_count, inner_company = self.crawler(temp_key, temp_key)
            except Exception as e:
                is_Exception.append(e)

        # (4) nothing found so far: company name
        if company_count < 1 and company_dic.get(u"name"):
            try:
                temp_key = company_dic.get(u"name", u"")
                self.logging.info(u"抓取(4):%s" % temp_key)
                company_count, inner_company = self.crawler(temp_key, temp_key)
            except Exception as e:
                is_Exception.append(e)

        # (5) still nothing: credit code
        if company_count < 1 and company_dic.get(u"xydm"):
            try:
                temp_key = company_dic.get(u"xydm", u"")
                self.logging.info(u"抓取(5):%s" % temp_key)
                company_count, inner_company = self.crawler(temp_key, temp_key)
            except Exception as e:
                is_Exception.append(e)

        # Dict seeds that produced nothing are recorded for later inspection.
        if company_count < 1 and is_dic:
            try:
                # NOTE(review): "no" is stored when errors occurred and "yes"
                # when none did -- looks inverted; confirm with the consumer
                # of the *_noncompany_dic queue before changing it.
                if is_Exception:
                    company_dic[u"exception"] = u"no"
                else:
                    company_dic[u"exception"] = u"yes"
                self.queue.select_queue(self.pinyin + '_noncompany_dic')
                self.queue.save(company_dic)
            except Exception:
                self.logging.error(u"保存为抓取的元祖队列失败:%s" % company_name)

        # Report lookup errors to the caller.  The old
        # ``raise json.dumps(is_Exception)`` could only ever produce a
        # TypeError (exceptions are not JSON-serialisable and a str cannot be
        # raised), which hid the real cause.
        if is_Exception:
            raise Exception("; ".join(repr(err) for err in is_Exception))

        if company_count < 1:
            # Nothing was crawled.
            self.set_black_keyword(company_dic)
        else:
            # Something was crawled.
            self.set_white_key(company_name)
            if inner_company == True:
                return True
            else:
                has_data = True
    except self.ValidException as e1:
        self.logging.error(u"验证码异常,关键字:%s,错误信息:%s" % (company_name, exceputil.traceinfo(e1)))
        raise
    except Exception as e:
        self.logging.error(u"关键字:%s,错误信息:%s" % (company_name, exceputil.traceinfo(e)))
        raise
    if has_data:
        return u"没有指定公司"
    else:
        return u"没有查询到企业信息"