class TaskManager(): IS_IN_TIME_RANGE = 1 # 在时间范围 NOT_REACH_TIME_RANGE = 2 # 没到达时间范围 OVER_MIN_TIME_RANGE = 3 # 超过时间范围 def __init__(self): self._mysqldb = MysqlDB(**config.get('mysqldb')) self._redis = RedisDB(**config.get('redisdb')) self._task_root_key = config.get('spider').get('redis_task_cache_root_key') self._account_task_key = self._task_root_key + ':z_account_task' self._article_task_key = self._task_root_key + ':z_article_task' self._last_article_publish_time = self._task_root_key + ':h_last_article_publish_time' self._new_last_article_publish_time = self._task_root_key + ':h_new_last_article_publish_time' self._ignore_haved_crawl_today_article_account = config.get('spider').get('ignore_haved_crawl_today_article_account') self._monitor_interval = config.get('spider').get('monitor_interval') self._zombie_account_not_publish_article_days = config.get('spider').get('zombie_account_not_publish_article_days') self._spider_interval_min = config.get('spider').get('spider_interval').get('min_sleep_time') self._spider_interval_max = config.get('spider').get('spider_interval').get('max_sleep_time') self._spider_interval_max = config.get('spider').get('spider_interval').get('max_sleep_time') self._crawl_time_range = (config.get("spider").get("crawl_time_range") or "~").split('~') def __get_task_from_redis(self, key): task = self._redis.zget(key, is_pop=True) if task: task = eval(task[0]) return task def __random_int(self, min, max): pass def get_account_task(self): """ 获取公众号任务 :return: {'__biz': 'Mjc1NjM3MjY2MA==', 'last_publish_time': None} 或 None """ task = self.__get_task_from_redis(self._account_task_key) if not task: publish_time_condition = "AND last_publish_time < '{today}'".format(today=tools.get_current_date(date_format='%Y-%m-%d' + ' 00:00:00')) if self._ignore_haved_crawl_today_article_account else '' sql = ''' SELECT __biz, last_publish_time FROM wechat_account_task WHERE `is_zombie` != 1 AND ( ( ( UNIX_TIMESTAMP(CURRENT_TIMESTAMP) - UNIX_TIMESTAMP(last_spider_time) ) > {monitor_interval} {publish_time_condition} ) OR (last_spider_time IS NULL) ) '''.format(monitor_interval=self._monitor_interval, publish_time_condition=publish_time_condition) tasks = self._mysqldb.find(sql, to_json=True) if tasks: self._redis.zadd(self._account_task_key, tasks) task = self.__get_task_from_redis(self._account_task_key) return task def get_article_task(self): """ 获取文章任务 :return: {'article_url': 'http://mp.weixin.qq.com/s?__biz=MzIxNzg1ODQ0MQ==&mid=2247485501&idx=1&sn=92721338ddbf7d907eaf03a70a0715bd&chksm=97f220dba085a9cd2b9a922fb174c767603203d6dbd2a7d3a6dc41b3400a0c477a8d62b96396&scene=27#wechat_redirect'} 或 None """ task = self.__get_task_from_redis(self._article_task_key) if not task: sql = 'select id, article_url from wechat_article_task where state = 0 limit 5000' tasks = self._mysqldb.find(sql) if tasks: # 更新任务状态 task_ids = str(tuple([task[0] for task in tasks])).replace(',)', ')') sql = 'update wechat_article_task set state = 2 where id in %s' % (task_ids) self._mysqldb.update(sql) else: sql = 'select id, article_url from wechat_article_task where state = 2 limit 5000' tasks = self._mysqldb.find(sql) if tasks: task_json = [ { 'article_url': article_url } for id, article_url in tasks ] self._redis.zadd(self._article_task_key, task_json) task = self.__get_task_from_redis(self._article_task_key) return task def update_article_task_state(self, sn, state=1): sql = 'update wechat_article_task set state = %s where sn = "%s"' % (state, sn) self._mysqldb.update(sql) def record_last_article_publish_time(self, __biz, last_publish_time): self._redis.hset(self._last_article_publish_time, __biz, last_publish_time or '') def is_reach_last_article_publish_time(self, __biz, publish_time): last_publish_time = self._redis.hget(self._last_article_publish_time, __biz) if not last_publish_time: # 查询mysql里是否有该任务 sql = "select last_publish_time from wechat_account_task where __biz = '%s'" % __biz data = self._mysqldb.find(sql) if data: # [(None,)] / [] last_publish_time = str(data[0][0] or '') self.record_last_article_publish_time(__biz, last_publish_time) if last_publish_time is None: return if publish_time < last_publish_time: return True return False def is_in_crawl_time_range(self, publish_time): """ 是否在时间范围 :param publish_time: :return: 是否达时间范围 """ if not publish_time or (not self._crawl_time_range[0] and not self._crawl_time_range[1]): return TaskManager.IS_IN_TIME_RANGE if self._crawl_time_range[0]: # 时间范围 上限 if publish_time > self._crawl_time_range[0]: return TaskManager.NOT_REACH_TIME_RANGE if publish_time <= self._crawl_time_range[0] and publish_time >= self._crawl_time_range[1]: return TaskManager.IS_IN_TIME_RANGE if publish_time < self._crawl_time_range[1]: # 下限 return TaskManager.OVER_MIN_TIME_RANGE return TaskManager.IS_IN_TIME_RANGE def record_new_last_article_publish_time(self, __biz, new_last_publish_time): self._redis.hset(self._new_last_article_publish_time, __biz, new_last_publish_time) def get_new_last_article_publish_time(self, __biz): return self._redis.hget(self._new_last_article_publish_time, __biz) def update_account_last_publish_time(self, __biz, last_publish_time): sql = 'update wechat_account_task set last_publish_time = "{}", last_spider_time="{}" where __biz="{}"'.format( last_publish_time, tools.get_current_date(), __biz ) self._mysqldb.update(sql) def is_zombie_account(self, last_publish_timestamp): if tools.get_current_timestamp() - last_publish_timestamp > self._zombie_account_not_publish_article_days * 86400: return True return False def sign_account_is_zombie(self, __biz, last_publish_time=None): if last_publish_time: sql = 'update wechat_account_task set last_publish_time = "{}", last_spider_time="{}", is_zombie=1 where __biz="{}"'.format( last_publish_time, tools.get_current_date(), __biz ) else: sql = 'update wechat_account_task set last_spider_time="{}", is_zombie=1 where __biz="{}"'.format( tools.get_current_date(), __biz ) self._mysqldb.update(sql) def get_task(self, url=None, tip=''): """ 获取任务 :param url: 指定url时,返回该url包装后的任务。否则先取公众号任务,无则取文章任务。若均无任务,则休眠一段时间之后再取 :return: """ sleep_time = random.randint(self._spider_interval_min, self._spider_interval_max) if not url: account_task = self.get_account_task() if account_task: __biz = account_task.get('__biz') last_publish_time = account_task.get('last_publish_time') self.record_last_article_publish_time(__biz, last_publish_time) tip = '正在抓取列表' url = 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz={}&scene=124#wechat_redirect'.format(__biz) else: article_task = self.get_article_task() if article_task: tip = '正在抓取详情' url = article_task.get('article_url') else: sleep_time = config.get('spider').get('no_task_sleep_time') log.info('暂无任务 休眠 {}s'.format(sleep_time)) tip = '暂无任务 ' if url: next_page = "{tip} 休眠 {sleep_time}s 下次刷新时间 {begin_spider_time} <script>setTimeout(function(){{window.location.href='{url}';}},{sleep_time_msec});</script>".format( tip=tip and tip + ' ', sleep_time=sleep_time, begin_spider_time=tools.timestamp_to_date(tools.get_current_timestamp() + sleep_time), url=url, sleep_time_msec=sleep_time * 1000 ) else: next_page = "{tip} 休眠 {sleep_time}s 下次刷新时间 {begin_spider_time} <script>setTimeout(function(){{window.location.reload();}},{sleep_time_msec});</script>".format( tip=tip and tip + ' ', sleep_time=sleep_time, begin_spider_time=tools.timestamp_to_date(tools.get_current_timestamp() + sleep_time), sleep_time_msec=sleep_time * 1000 ) return next_page def reset_task(self): # 清除redis缓存 keys = self._task_root_key + "*" keys = self._redis.getkeys(keys) if keys: for key in keys: self._redis.clear(key) # 重设任务 sql = "update wechat_article_task set state = 0 where state = 2" self._mysqldb.update(sql)
class ExportData(): INSERT = 1 UPDATE = 2 EXCEPTION = 3 def __init__(self, source_table='', aim_table='', key_map='', unique_key=None, unique_key_mapping_source_key=None, update_read_status=True, condition={'read_status': 0}, datas=[], callback='', sync_to_es=False): ''' @summary: 初始化 --------- @param source_table: 源table mongo数据库 @param aim_table: 目标table @param key_map: 目标table 和 源table 的键的映射 eg: key_map = { 'aim_key1' : 'str_source_key2', # 目标键 = 源键对应的值 类型为str 'aim_key2' : 'int_source_key3', # 目标键 = 源键对应的值 类型为int 'aim_key3' : 'date_source_key4', # 目标键 = 源键对应的值 类型为date 'aim_key4' : 'vint_id', # 目标键 = 值 类型为int 'aim_key5' : 'vstr_name', # 目标键 = 值 类型为str 'aim_key6' : 'vdate_name', # 目标键 = 值 类型为date 'aim_key7' : 'sint_select id from xxx' # 目标键 = 值为sql 查询出的结果 类型为int 'aim_key8' : 'sstr_select name from xxx' # 目标键 = 值为sql 查询出的结果 类型为str 'aim_key9' : 'clob_key8' # 目标键 = 源键对应的值 类型为clob 'aim_key10' : 'clob_key8' # 目标键 = 源键对应的值 类型为str } @param unique_key: 唯一的key 目标数据库根据该key去重 @param unique_key_mapping_source_key: 目标表中唯一的key所对应的源表中的key 该值不为空时 更新目标表中已有的数据 eg: unique_key_mapping_source_key = { 'url':'str_url' # 目标键 = 源键对应的值 类型为str } @param condition: 导出满足什么样条件的数据 默认是read_status = 0 的 @param datas: 要导出的数据,格式为[{...},{...}] 或者 {}用于直接将json数组导入到目标表,为空时默认导出mongodb的数据 @param callback 导出数据的回调,导出一组,执行一次,callback(execute_type, sql) execute_type为执行类型(ExportData.INSERT、ExportData.UPDATE、ExportData.EXCEPTION) sql 为执行的语句 --------- @result: ''' super(ExportData, self).__init__() self._source_table = source_table self._aim_table = aim_table self._key_map = key_map self._unique_key = unique_key self._update_read_status = update_read_status self._condition = condition self._mongodb = MongoDB() if self._source_table else '' self._datas = datas self._sync_to_es = sync_to_es self._callback = callback self._is_oracle = False self._is_set_unique_key = False self._is_set_unique_key = False self._export_count = 0 self._update_count = 0 self._unique_key_mapping_source_key = unique_key_mapping_source_key def export_to_oracle(self, source_table='', aim_table='', key_map='', unique_key=None, unique_key_mapping_source_key=None, update_read_status=True, condition={'read_status': 0}, datas=[], callback='', sync_to_es=False): if aim_table: if self._aim_table != aim_table: self._is_set_unique_key = False self._es = ES() if sync_to_es else '' self._mongodb = MongoDB() if source_table else '' self._source_table = source_table self._aim_table = aim_table self._key_map = key_map self._unique_key = unique_key self._export_count = 0 self._update_count = 0 self._unique_key_mapping_source_key = unique_key_mapping_source_key self._update_read_status = update_read_status if not datas else False self._condition = condition self._datas = datas self._callback = callback self._sync_to_es = sync_to_es self._es = None self._aim_db = OracleDB() self._is_oracle = True return self.__export() def export_to_mysql(self, source_table='', aim_table='', key_map='', unique_key=None, unique_key_mapping_source_key=None, update_read_status=True, condition={'read_status': 0}, datas=[], callback=''): if self._aim_table != aim_table: self._is_set_unique_key = False self._source_table = source_table self._aim_table = aim_table self._key_map = key_map self._unique_key = unique_key self._export_count = 0 self._update_count = 0 self._unique_key_mapping_source_key = unique_key_mapping_source_key self._update_read_status = update_read_status if not datas else False self._condition = condition self._datas = datas self._callback = callback self._aim_db = MysqlDB() return self.__export() def make_sql(self, data): ''' @summary: --------- @param data: 数据字典 --------- @result: 当unique_key_mapping_source_key不为空时返回insert_sql, update_sql 否则返回insert_sql ''' aim_keys = tuple(self._key_map.keys()) source_keys = tuple(self._key_map.values()) # 取源key值 对应的type 和 key (源key包含type 和 key 信息) keys = [] value_types = [] for source_key in source_keys: temp_var = source_key.split('_', 1) value_types.append(temp_var[0]) keys.append(temp_var[1]) insert_sql = 'insert into ' + self._aim_table + " (" + ', '.join( aim_keys) + ") values (" update_sql = 'update ' + self._aim_table + " set " data_json = {} # 导入到es中用 values = [] for i in range(len(keys)): if (value_types[i] != 'vint' and value_types[i] != 'vstr' and value_types[i] != 'vdate' and value_types[i] != 'sint' and value_types[i] != 'sstr') and (not data[keys[i]] and data[keys[i]] != 0): values.append('null') insert_sql += '%s, ' update_sql += aim_keys[i] + " = %s, " % values[-1] data_json[aim_keys[i].upper()] = None elif value_types[i] == 'str': values.append( str(data[keys[i]]).replace("'", "''") ) # if isinstance(data[keys[i]], str) else data[keys[i]]) # 将单引号替换成两个单引号 否者insert_sql语句语法出错 insert_sql += "'%s', " update_sql += aim_keys[i] + " = '%s', " % values[-1] data_json[aim_keys[i].upper()] = values[-1] elif value_types[i] == 'clob': text = str(data[keys[i]]).replace("'", "''") if not text: insert_sql += "'%s', " values.append(text) update_sql += aim_keys[i] + " = '%s', " % values[-1] data_json[aim_keys[i].upper()] = None else: values_ = tools.cut_string(text, 1000) clob_text = '' for value in values_: clob_text += "to_clob('%s') || " % value clob_text = clob_text[:-len(' || ')] values.append(clob_text) insert_sql += "%s, " update_sql += aim_keys[i] + " = %s, " % values[-1] data_json[aim_keys[i].upper()] = data[keys[i]] elif value_types[i] == 'int': if isinstance(data[keys[i]], int) or isinstance( data[keys[i]], float) or isinstance( data[keys[i]], str): values.append(data[keys[i]]) elif isinstance(data[keys[i]], bool): values.append(data[keys[i]] and 1 or 0) else: # _id values.append(int(str(data[keys[i]])[-6:], 16)) insert_sql += '%s, ' update_sql += aim_keys[i] + " = %s, " % values[-1] data_json[aim_keys[i].upper()] = eval( values[-1]) if isinstance(values[-1], str) else values[-1] elif value_types[i] == 'date': values.append(data[keys[i]].replace('年', '-').replace( '月', '-').replace('日', '')) if self._is_oracle: format_date = 'yyyy-mm-dd hh24:mi:ss'[:len( values[-1]) if len(values[-1]) <= 10 else None] insert_sql += "to_date('%s','{}'), ".format(format_date) update_sql += aim_keys[i] + "= to_date('%s','%s'), " % ( values[-1], format_date) data_json[aim_keys[i].upper()] = values[-1] else: insert_sql += "'%s', " update_sql += aim_keys[i] + " = '%s', " % values[-1] data_json[aim_keys[i].upper()] = values[-1] elif value_types[i] == 'vint': if tools.get_english_words(keys[i]): sql = 'select %s from dual' % keys[i] value = self._aim_db.find(sql)[0][0] values.append(value) data_json[aim_keys[i].upper()] = values[-1] else: values.append(keys[i]) data_json[aim_keys[i].upper()] = eval(values[-1]) insert_sql += '%s, ' update_sql += aim_keys[i] + " = %s, " % values[-1] elif value_types[i] == 'vstr': values.append(keys[i]) insert_sql += "'%s', " update_sql += aim_keys[i] + " = '%s', " % values[-1] data_json[aim_keys[i].upper()] = values[-1] elif value_types[i] == 'vdate': values.append(keys[i]) if self._is_oracle: format_date = 'yyyy-mm-dd hh24:mi:ss'[:len( values[-1]) if len(values[-1]) <= 10 else None] insert_sql += "to_date('%s','{}'), ".format(format_date) update_sql += aim_keys[i] + "= to_date('%s','%s'), " % ( values[-1], format_date) data_json[aim_keys[i].upper()] = values[-1] else: insert_sql += "'%s', " update_sql += aim_keys[i] + " = '%s', " % values[-1] data_json[aim_keys[i].upper()] = values[-1] elif value_types[i] == 'sint': value = self._aim_db.find(keys[i], fetch_one=True)[0] values.append(value) insert_sql += '%s, ' update_sql += aim_keys[i] + " = %s, " % value data_json[aim_keys[i].upper()] = values[-1] elif value_types[i] == 'sstr': value = self._aim_db.find(keys[i], fetch_one=True)[0] values.append(value) insert_sql += "'%s', " update_sql += aim_keys[i] + " = '%s', " % value data_json[aim_keys[i].upper()] = values[-1] else: error_msg = '%s不符合key_map规定格式' % value_types[i] raise (Exception(error_msg)) insert_sql = insert_sql[:-2] + ")" insert_sql = insert_sql % tuple(values) # tools.print(data_json) # log.debug(insert_sql) if self._unique_key_mapping_source_key: # aim_key = tuple(self._unique_key_mapping_source_key.keys())[0] # value = tuple(self._unique_key_mapping_source_key.values())[0] # temp_var = value.split('_', 1) # source_key_types = temp_var[0] # source_key = temp_var[1] # if source_key_types == 'str': # update_sql = update_sql[:-2] + " where %s = '%s'" %(aim_key, data[source_key]) # elif source_key_types == 'int': # update_sql = update_sql[:-2] + " where %s = %s" %(aim_key, data[source_key]) # # log.debug(update_sql) return insert_sql, update_sql[:-2], data_json else: return insert_sql, data_json # @tools.run_safe_model(__name__) def __export(self): if self._unique_key and not self._is_set_unique_key: self._aim_db.set_unique_key(self._aim_table, self._unique_key) self._is_set_unique_key = True datas = self._mongodb.find( self._source_table, condition=self._condition) if self._mongodb else ( self._datas if isinstance(self._datas, list) else [self._datas]) for data in datas: if self._unique_key_mapping_source_key: insert_sql, update_sql, data_json = self.make_sql(data) else: insert_sql, data_json = self.make_sql(data) # tools.write_file(self._aim_table + '.txt', insert_sql, 'w+') def exception_callfunc(e): if 'ORA-00001' in str(e): if self._update_read_status: self._mongodb.update(self._source_table, data, {'read_status': 1}) else: log.error(insert_sql) execute_type = ExportData.EXCEPTION sql = '' # log.debug(insert_sql) if self._aim_db.add(insert_sql, exception_callfunc): self._export_count += 1 sql = insert_sql execute_type = ExportData.INSERT if self._update_read_status: self._mongodb.update(self._source_table, data, {'read_status': 1}) elif self._unique_key_mapping_source_key: # 取id字段 aim_key = tuple(self._unique_key_mapping_source_key.keys())[0] value = tuple(self._unique_key_mapping_source_key.values())[0] temp_var = value.split('_', 1) source_key_types = temp_var[0] source_key = temp_var[1] select_sql = 'select id from ' + self._aim_table if source_key_types == 'str': select_sql = select_sql + " where %s = '%s'" % ( aim_key, data[source_key]) elif source_key_types == 'int': select_sql = select_sql + " where %s = %s" % ( aim_key, data[source_key]) data_id = self._aim_db.find(select_sql) if data_id: data_id = data_id[0][0] else: continue #拼接update语句 update_sql += " where id = %s" % data_id log.debug(update_sql) # 删除 update 里面 id= xxx 的条件,保证更新后的数据 ID不变 id_info = ''.join( tools.get_info(update_sql, [' id .*?,', ' ID .*?,'])) update_sql = update_sql.replace(id_info, '') # 修改data_json 里的ID if "ID" in data_json.keys(): data_json["ID"] = data_id # 更新 if self._aim_db.update(update_sql): self._update_count += 1 sql = update_sql execute_type = ExportData.UPDATE if self._update_read_status: self._mongodb.update(self._source_table, data, {'read_status': 1}) # 同步到ES if self._sync_to_es and execute_type != ExportData.EXCEPTION: self._es.add(table=self._aim_table, data=data_json, data_id=data_json.get('ID')) if self._callback: self._callback(execute_type, sql, data_json) log.debug(''' 共导出%s条数据 共更新%s条数据 ''' % (self._export_count, self._update_count)) return self._export_count + self._update_count def close(self): self._aim_db.close()