class Collector(threading.Thread):
    """Background thread that pulls TODO urls from MongoDB into a fixed-size
    ring buffer, from which worker threads fetch batches via get_urls().

    The buffer is the list ``self._urls`` indexed by ``_read_pos`` /
    ``_write_pos``; one slot is always kept free so a full buffer can be
    distinguished from an empty one (hence the ``- 1`` in the size helpers).
    """

    def __init__(self, tab_urls):
        super(Collector, self).__init__()
        self._lock = threading.RLock()  # guards the buffer and both cursors
        self._db = MongoDB()
        self._thread_stop = False
        self._urls = []
        self._null_times = 0  # consecutive polls that found the buffer empty
        # Ring-buffer cursors; -1 means "nothing read / written yet".
        self._read_pos = -1
        self._write_pos = -1
        self._tab_urls = tab_urls
        self._depth = int(
            tools.get_conf_value('config.conf', "collector", "depth"))
        self._max_size = int(
            tools.get_conf_value('config.conf', "collector", "max_size"))
        self._interval = int(
            tools.get_conf_value('config.conf', "collector", "sleep_time"))
        self._allowed_null_times = int(
            tools.get_conf_value('config.conf', "collector",
                                 'allowed_null_times'))
        self._url_count = int(
            tools.get_conf_value('config.conf', "collector", "url_count"))

        # On start-up, move any url stuck in DOING (e.g. after a crash)
        # back to TODO so it gets picked up again.
        self._db.update(self._tab_urls, {'status': Constance.DOING},
                        {'status': Constance.TODO})

        self._finished_callback = None

    def run(self):
        # Poll the database until stop() is called.
        while not self._thread_stop:
            self.__input_data()
            time.sleep(self._interval)

    def stop(self):
        self._thread_stop = True
        if self._finished_callback:
            self._finished_callback()

    @tools.log_function_time
    def __input_data(self):
        """Fetch up to _url_count TODO urls from MongoDB, mark them DOING
        and push them into the ring buffer."""
        log.debug('read_pos %d, write_pos %d buffer size %d' %
                  (self._read_pos, self._write_pos, self.get_max_read_size()))
        log.debug('buffer can write size = %d' % self.get_max_write_size())
        if self.get_max_write_size() == 0:
            log.debug("collector 已满 size = %d" % self.get_max_read_size())
            return

        # Never fetch more than the buffer can currently hold.
        url_count = self._url_count if self._url_count <= self.get_max_write_size(
        ) else self.get_max_write_size()

        urls_list = []
        if self._depth:
            urls_list = self._db.find(self._tab_urls, {
                "status": Constance.TODO,
                "depth": {
                    "$lte": self._depth
                }
            },
                                      limit=url_count)
        else:
            urls_list = self._db.find(self._tab_urls,
                                      {"status": Constance.TODO},
                                      limit=url_count)

        # Mark the fetched urls as DOING so they are not handed out twice.
        for url in urls_list:
            self._db.update(self._tab_urls, url, {'status': Constance.DOING})

        self.put_urls(urls_list)

        if self.is_all_have_done():
            self.stop()

    def is_finished(self):
        return self._thread_stop

    def add_finished_callback(self, callback):
        # Called once when the collector stops.
        self._finished_callback = callback

    def is_all_have_done(self):
        """Return True when the buffer has been empty for _allowed_null_times
        consecutive polls and no url is left in DOING state in the DB."""
        if self.get_max_read_size() == 0:
            self._null_times += 1
            if self._null_times >= self._allowed_null_times:
                # Check whether any url is still being worked on.
                urls_doing = self._db.find(self._tab_urls,
                                           {'status': Constance.DOING})
                if urls_doing:
                    self._null_times = 0
                    return False
                else:
                    return True
            # Below the threshold: fall through (None is falsy to the caller).
        else:
            self._null_times = 0
            return False

    def get_max_write_size(self):
        """Number of free slots in the ring buffer (one slot stays reserved)."""
        size = 0
        if self._read_pos == self._write_pos:
            size = self._max_size
        elif self._read_pos < self._write_pos:
            size = self._max_size - (self._write_pos - self._read_pos)
        else:
            size = self._read_pos - self._write_pos
        return size - 1

    def get_max_read_size(self):
        """Number of urls currently stored in the ring buffer."""
        return self._max_size - 1 - self.get_max_write_size()

    @tools.log_function_time
    def put_urls(self, urls_list):
        """Append urls_list into the ring buffer, wrapping past the end.

        Callers must check get_max_write_size() first; no overflow check is
        done here.
        """
        if urls_list == []:
            return

        url_count = len(urls_list)
        end_pos = url_count + self._write_pos + 1
        # Part that would run past the physical end wraps to the front.
        overflow_end_pos = end_pos - self._max_size
        in_pos = end_pos if end_pos <= self._max_size else self._max_size
        # Number of items that fit before the wrap point.
        urls_list_cut_pos = in_pos - self._write_pos - 1

        # BUGFIX: the write-cursor update must happen under the same lock as
        # the buffer mutation; the original released the lock first, so a
        # concurrent get_urls() could observe a stale _write_pos.
        with self._lock:
            self._urls[self._write_pos + 1:in_pos] = \
                urls_list[:urls_list_cut_pos]
            if overflow_end_pos > 0:
                self._urls[:overflow_end_pos] = urls_list[urls_list_cut_pos:]
            self._write_pos += url_count
            self._write_pos %= self._max_size
            # NOTE: -1 % _max_size would be _max_size - 1 and make the
            # writable size 0; the early return on an empty urls_list is what
            # keeps that case from ever happening.

    @tools.log_function_time
    def get_urls(self, count):
        """Pop up to ``count`` urls from the ring buffer (thread-safe)."""
        with self._lock:
            urls = []
            count = count if count <= self.get_max_read_size(
            ) else self.get_max_read_size()
            end_pos = self._read_pos + count + 1
            if end_pos > self._max_size:
                # Read wraps around the physical end of the buffer.
                urls.extend(self._urls[self._read_pos + 1:])
                urls.extend(self._urls[:end_pos % self._max_size])
            else:
                urls.extend(self._urls[self._read_pos + 1:end_pos])
            if urls:
                self._read_pos += len(urls)
                self._read_pos %= self._max_size
        return urls
class ExportData():
    """Export rows from a MongoDB collection into a relational table
    (Oracle or MySQL), mapping source keys onto target columns via key_map."""

    def __init__(self, source_table, aim_table, key_map, unique_key=None):
        '''
        @summary: initialise the exporter
        ---------
        @param source_table: source (MongoDB) table
        @param aim_table: target table
        @param key_map: mapping from target-table keys to source-table keys, e.g.
            key_map = {
                'aim_key1' : 'str_source_key2',  # target = source value, as str
                'aim_key2' : 'int_source_key3',  # target = source value, as int
                'aim_key3' : 'date_source_key4', # target = source value, as date
                'aim_key4' : 'vint_id',          # target = literal value, as int
                'aim_key5' : 'vstr_name',        # target = literal value, as str
                'aim_key6' : 'sint_select id from xxx'   # target = sql result, as int
                'aim_key7' : 'sstr_select name from xxx' # target = sql result, as str
            }
        @param unique_key: key the target database uses for deduplication
        ---------
        @result:
        '''
        super(ExportData, self).__init__()
        self._source_table = source_table
        self._aim_table = aim_table
        self._key_map = key_map
        self._unique_key = unique_key
        self._mongodb = MongoDB()
        self._is_oracle = False
        self._export_count = 0

    def export_to_oracle(self):
        self._aim_db = OracleDB()
        self._is_oracle = True
        self.__export()

    def export_to_mysql(self):
        self._aim_db = MysqlDB()
        self.__export()

    # @tools.run_safe_model(__name__)
    def __export(self):
        """Read unread rows (read_status == 0) from MongoDB, build and run one
        INSERT per row, then flag the exported row as read."""
        if self._unique_key:
            self._aim_db.set_unique_key(self._aim_table, self._unique_key)

        aim_keys = tuple(self._key_map.keys())
        source_keys = tuple(self._key_map.values())

        # Split each source key into its type prefix and the actual key.
        keys = []
        value_types = []
        for source_key in source_keys:
            temp_var = source_key.split('_', 1)
            value_types.append(temp_var[0])
            keys.append(temp_var[1])

        datas = self._mongodb.find(self._source_table, {'read_status': 0})
        for data in datas:
            # SECURITY NOTE: the statement is built by string interpolation,
            # not bound parameters; values are only single-quote escaped, so
            # this is not safe against arbitrary/untrusted input.
            sql = 'insert into ' + self._aim_table + " (" + ', '.join(
                aim_keys) + ") values ("
            values = []
            for i in range(len(keys)):
                if value_types[i] == 'str':
                    # Double the single quotes to keep the sql literal valid.
                    values.append(data[keys[i]].replace("'", "''"))
                    sql += "'%s', "
                elif value_types[i] == 'int':
                    values.append(data[keys[i]])
                    if isinstance(data[keys[i]], int):
                        sql += '%d, '
                    else:
                        sql += '%s, '
                elif value_types[i] == 'date':
                    values.append(data[keys[i]])
                    if self._is_oracle:
                        sql += "to_date('%s','yyyy-mm-dd hh24:mi:ss'), "
                    else:
                        sql += "'%s', "
                elif value_types[i] == 'vint':
                    values.append(keys[i])
                    sql += '%s, '
                elif value_types[i] == 'vstr':
                    values.append(keys[i])
                    sql += "'%s', "
                elif value_types[i] == 'sint':
                    # BUGFIX: self._oracledb was never assigned anywhere in
                    # this class; the target handle is self._aim_db (set by
                    # export_to_oracle / export_to_mysql).
                    value = self._aim_db.find(keys[i], fetch_one=True)
                    values.append(value)
                    sql += '%d, '
                elif value_types[i] == 'sstr':
                    # BUGFIX: same _oracledb -> _aim_db correction as above.
                    value = self._aim_db.find(keys[i], fetch_one=True)
                    values.append(value)
                    sql += "'%s', "
                else:
                    log.error('%s不符合key_map规定格式' % value_types[i])
                    return

            sql = sql[:-2] + ")"
            sql = sql % tuple(values)
            log.debug(sql)

            if self._aim_db.add(sql):
                self._export_count += 1
                # Mark the source row as read so it is not exported twice.
                self._mongodb.update(self._source_table, data,
                                     {'read_status': 1})

        # self._aim_db.close()
        log.debug('共导出%d条数据' % self._export_count)
def download(title=None):
    """Crawl the novel named *title* from the ZhuiShu site into MongoDB."""
    if title is None:
        print('请输入要下载的小说名')
        return
    # Pass whichever site spider you want to crawl with; a ZhuiShu instance
    # is used here (BaBaDuShuSpider / LieWenSpider are alternatives).
    site_spider = ZhuiShuSpider()
    store = MongoDB('zhuishu')
    crawler = spider.Spider(site_spider, store)
    crawler.search(title)


def make_txt(book):
    """Render *book* (a MongoDB document) as a plain-text file."""
    txt_converter = ConvertToTxt(book['title'])
    generator.Generator(txt_converter).make(book)


def make_epub(book):
    """Render *book* (a MongoDB document) as an EPUB file."""
    ConvertToEpub(book['title'], book['author'], book['cover'],
                  book['intro'], book['chapters']).make()


if __name__ == '__main__':
    store = MongoDB('zhuishu')
    book = store.find(title='天将夜').next()
    make_epub(book)
# -*- coding: utf-8 -*- """ ------------------------------------------------- File Name: dbshot Description : Author : Lychlov date: 2018/5/24 ------------------------------------------------- Change Activity: 2018/5/24: ------------------------------------------------- """ from db.mongodb import MongoDB temp_dict = {"title": "政变四周年,曼谷反军方大示威今日正式爆发!", "summary": 'jianjie', "cover": "http://sdfsdf", "receive_time": "2018-05-23 23:23:23", "account": '泰国网'} temp_dict2 = {"title": "政变四周年,曼谷反军方大示威今日正式爆发!", "summary": 'jianjie', "account": '泰国网'} mongodb = MongoDB() add_res = mongodb.add('wechat_article', temp_dict) res = mongodb.find('wechat_article', temp_dict2) print(res)
class ExportData():
    """Export data (rows from MongoDB, or an in-memory json array) into a
    relational table (Oracle or MySQL), with optional update-on-duplicate and
    optional synchronisation to Elasticsearch."""

    # execute types reported to the callback
    INSERT = 1
    UPDATE = 2
    EXCEPTION = 3

    def __init__(self,
                 source_table='',
                 aim_table='',
                 key_map='',
                 unique_key=None,
                 unique_key_mapping_source_key=None,
                 update_read_status=True,
                 condition=None,
                 datas=None,
                 callback='',
                 sync_to_es=False):
        '''
        @summary: initialise the exporter
        ---------
        @param source_table: source table (MongoDB)
        @param aim_table: target table
        @param key_map: mapping from target-table keys to source-table keys, e.g.
            key_map = {
                'aim_key1' : 'str_source_key2',  # target = source value, as str
                'aim_key2' : 'int_source_key3',  # target = source value, as int
                'aim_key3' : 'date_source_key4', # target = source value, as date
                'aim_key4' : 'vint_id',          # target = literal value, as int
                'aim_key5' : 'vstr_name',        # target = literal value, as str
                'aim_key6' : 'vdate_name',       # target = literal value, as date
                'aim_key7' : 'sint_select id from xxx'   # target = sql result, as int
                'aim_key8' : 'sstr_select name from xxx' # target = sql result, as str
                'aim_key9' : 'clob_key8'         # target = source value, as clob
                'aim_key10' : 'clob_key8'        # target = source value, as str
            }
        @param unique_key: key the target database uses for deduplication
        @param unique_key_mapping_source_key: source key that corresponds to
            the target table's unique key; when given, existing target rows are
            updated on duplicate insert, e.g.
            unique_key_mapping_source_key = {
                'url':'str_url' # target key = source value, as str
            }
        @param condition: which source rows to export; defaults to
            {'read_status': 0} (None-sentinel instead of a mutable default)
        @param datas: data to export, [{...},{...}] or {}; used to import a
            json array directly; when empty the MongoDB source is used
        @param callback: invoked once per exported row as
            callback(execute_type, sql, data_json), where execute_type is one
            of ExportData.INSERT / UPDATE / EXCEPTION and sql is the statement
        ---------
        @result:
        '''
        super(ExportData, self).__init__()
        self._source_table = source_table
        self._aim_table = aim_table
        self._key_map = key_map
        self._unique_key = unique_key
        self._update_read_status = update_read_status
        # FIX: mutable default arguments replaced by None-sentinels.
        self._condition = condition if condition is not None else {
            'read_status': 0
        }
        self._mongodb = MongoDB() if self._source_table else ''
        self._datas = datas if datas is not None else []
        self._sync_to_es = sync_to_es
        self._es = None  # created by export_to_oracle when sync_to_es is set
        self._callback = callback
        self._is_oracle = False
        # FIX: this flag was assigned twice in the original; once is enough.
        self._is_set_unique_key = False
        self._export_count = 0
        self._update_count = 0
        self._unique_key_mapping_source_key = unique_key_mapping_source_key

    def export_to_oracle(self,
                         source_table='',
                         aim_table='',
                         key_map='',
                         unique_key=None,
                         unique_key_mapping_source_key=None,
                         update_read_status=True,
                         condition=None,
                         datas=None,
                         callback='',
                         sync_to_es=False):
        """Re-configure the exporter (parameters as in __init__) and export
        to Oracle.

        @result: number of rows inserted + updated
        """
        condition = condition if condition is not None else {'read_status': 0}
        datas = datas if datas is not None else []

        if aim_table:
            if self._aim_table != aim_table:
                # New target table: its unique key has not been set up yet.
                self._is_set_unique_key = False
            self._es = ES() if sync_to_es else ''
            self._mongodb = MongoDB() if source_table else ''
            self._source_table = source_table
            self._aim_table = aim_table
            self._key_map = key_map
            self._unique_key = unique_key
            self._export_count = 0
            self._update_count = 0
            self._unique_key_mapping_source_key = unique_key_mapping_source_key
            # When raw datas are passed in there is no MongoDB row whose
            # read_status could be flagged.
            self._update_read_status = update_read_status if not datas else False
            self._condition = condition
            self._datas = datas
            self._callback = callback
            self._sync_to_es = sync_to_es
            # BUGFIX: the original unconditionally reset self._es = None here,
            # clobbering the ES() handle created above and making
            # sync_to_es=True crash in __export.

        self._aim_db = OracleDB()
        self._is_oracle = True
        return self.__export()

    def export_to_mysql(self,
                        source_table='',
                        aim_table='',
                        key_map='',
                        unique_key=None,
                        unique_key_mapping_source_key=None,
                        update_read_status=True,
                        condition=None,
                        datas=None,
                        callback=''):
        """Re-configure the exporter (parameters as in __init__) and export
        to MySQL.

        @result: number of rows inserted + updated
        """
        condition = condition if condition is not None else {'read_status': 0}
        datas = datas if datas is not None else []

        if self._aim_table != aim_table:
            self._is_set_unique_key = False
        self._source_table = source_table
        self._aim_table = aim_table
        self._key_map = key_map
        self._unique_key = unique_key
        self._export_count = 0
        self._update_count = 0
        self._unique_key_mapping_source_key = unique_key_mapping_source_key
        self._update_read_status = update_read_status if not datas else False
        self._condition = condition
        self._datas = datas
        self._callback = callback

        self._aim_db = MysqlDB()
        return self.__export()

    def make_sql(self, data):
        '''
        @summary: build the sql statement(s) for one source row
        ---------
        @param data: source data dict
        ---------
        @result: (insert_sql, update_sql, data_json) when
            unique_key_mapping_source_key is set, otherwise
            (insert_sql, data_json); data_json mirrors the row for the ES sync

        SECURITY NOTE: values are interpolated into the sql text (with single
        quotes escaped), not bound as parameters — not safe for untrusted input.
        '''
        aim_keys = tuple(self._key_map.keys())
        source_keys = tuple(self._key_map.values())

        # Split each source key into its type prefix and the actual key.
        keys = []
        value_types = []
        for source_key in source_keys:
            temp_var = source_key.split('_', 1)
            value_types.append(temp_var[0])
            keys.append(temp_var[1])

        insert_sql = 'insert into ' + self._aim_table + " (" + ', '.join(
            aim_keys) + ") values ("
        update_sql = 'update ' + self._aim_table + " set "

        data_json = {}  # row mirror used when syncing to Elasticsearch
        values = []
        for i in range(len(keys)):
            # Empty source value (but not a literal 0) -> SQL NULL; the
            # literal ('v*') and sql ('s*') types carry no source key and are
            # excluded from this check.
            if (value_types[i] != 'vint' and value_types[i] != 'vstr' and
                    value_types[i] != 'vdate' and value_types[i] != 'sint' and
                    value_types[i] != 'sstr') and (not data[keys[i]] and
                                                   data[keys[i]] != 0):
                values.append('null')
                insert_sql += '%s, '
                update_sql += aim_keys[i] + " = %s, " % values[-1]
                data_json[aim_keys[i].upper()] = None

            elif value_types[i] == 'str':
                # Double the single quotes so the sql literal stays valid.
                values.append(str(data[keys[i]]).replace("'", "''"))
                insert_sql += "'%s', "
                update_sql += aim_keys[i] + " = '%s', " % values[-1]
                data_json[aim_keys[i].upper()] = values[-1]

            elif value_types[i] == 'clob':
                text = str(data[keys[i]]).replace("'", "''")
                if not text:
                    insert_sql += "'%s', "
                    values.append(text)
                    update_sql += aim_keys[i] + " = '%s', " % values[-1]
                    data_json[aim_keys[i].upper()] = None
                else:
                    # Oracle string literals are length-limited: cut the text
                    # into chunks and concatenate to_clob() pieces with ||.
                    values_ = tools.cut_string(text, 1000)
                    clob_text = ''
                    for value in values_:
                        clob_text += "to_clob('%s') || " % value
                    clob_text = clob_text[:-len(' || ')]
                    values.append(clob_text)
                    insert_sql += "%s, "
                    update_sql += aim_keys[i] + " = %s, " % values[-1]
                    data_json[aim_keys[i].upper()] = data[keys[i]]

            elif value_types[i] == 'int':
                if isinstance(data[keys[i]], int) or isinstance(
                        data[keys[i]], float) or isinstance(
                            data[keys[i]], str):
                    # NOTE(review): bool is a subclass of int, so booleans are
                    # caught here and the branch below is effectively dead.
                    values.append(data[keys[i]])
                elif isinstance(data[keys[i]], bool):
                    values.append(data[keys[i]] and 1 or 0)
                else:
                    # e.g. a bson ObjectId (_id): derive an int from its tail.
                    values.append(int(str(data[keys[i]])[-6:], 16))
                insert_sql += '%s, '
                update_sql += aim_keys[i] + " = %s, " % values[-1]
                data_json[aim_keys[i].upper()] = eval(
                    values[-1]) if isinstance(values[-1], str) else values[-1]

            elif value_types[i] == 'date':
                # Normalise Chinese date formats like 2018年5月1日.
                values.append(data[keys[i]].replace('年', '-').replace(
                    '月', '-').replace('日', ''))
                if self._is_oracle:
                    # Values of <= 10 chars are date-only: trim the format
                    # mask down to 'yyyy-mm-dd'.
                    format_date = 'yyyy-mm-dd hh24:mi:ss'[:len(
                        values[-1]) if len(values[-1]) <= 10 else None]
                    insert_sql += "to_date('%s','{}'), ".format(format_date)
                    update_sql += aim_keys[i] + "= to_date('%s','%s'), " % (
                        values[-1], format_date)
                    data_json[aim_keys[i].upper()] = values[-1]
                else:
                    insert_sql += "'%s', "
                    update_sql += aim_keys[i] + " = '%s', " % values[-1]
                    data_json[aim_keys[i].upper()] = values[-1]

            elif value_types[i] == 'vint':
                if tools.get_english_words(keys[i]):
                    # The literal is an expression (e.g. a sequence call):
                    # evaluate it against the target database.
                    sql = 'select %s from dual' % keys[i]
                    value = self._aim_db.find(sql)[0][0]
                    values.append(value)
                    data_json[aim_keys[i].upper()] = values[-1]
                else:
                    values.append(keys[i])
                    data_json[aim_keys[i].upper()] = eval(values[-1])
                insert_sql += '%s, '
                update_sql += aim_keys[i] + " = %s, " % values[-1]

            elif value_types[i] == 'vstr':
                values.append(keys[i])
                insert_sql += "'%s', "
                update_sql += aim_keys[i] + " = '%s', " % values[-1]
                data_json[aim_keys[i].upper()] = values[-1]

            elif value_types[i] == 'vdate':
                values.append(keys[i])
                if self._is_oracle:
                    format_date = 'yyyy-mm-dd hh24:mi:ss'[:len(
                        values[-1]) if len(values[-1]) <= 10 else None]
                    insert_sql += "to_date('%s','{}'), ".format(format_date)
                    update_sql += aim_keys[i] + "= to_date('%s','%s'), " % (
                        values[-1], format_date)
                    data_json[aim_keys[i].upper()] = values[-1]
                else:
                    insert_sql += "'%s', "
                    update_sql += aim_keys[i] + " = '%s', " % values[-1]
                    data_json[aim_keys[i].upper()] = values[-1]

            elif value_types[i] == 'sint':
                value = self._aim_db.find(keys[i], fetch_one=True)[0]
                values.append(value)
                insert_sql += '%s, '
                update_sql += aim_keys[i] + " = %s, " % value
                data_json[aim_keys[i].upper()] = values[-1]

            elif value_types[i] == 'sstr':
                value = self._aim_db.find(keys[i], fetch_one=True)[0]
                values.append(value)
                insert_sql += "'%s', "
                update_sql += aim_keys[i] + " = '%s', " % value
                data_json[aim_keys[i].upper()] = values[-1]

            else:
                error_msg = '%s不符合key_map规定格式' % value_types[i]
                raise (Exception(error_msg))

        insert_sql = insert_sql[:-2] + ")"
        insert_sql = insert_sql % tuple(values)

        if self._unique_key_mapping_source_key:
            # update_sql keeps its trailing ", " until here; strip it.
            return insert_sql, update_sql[:-2], data_json
        else:
            return insert_sql, data_json

    # @tools.run_safe_model(__name__)
    def __export(self):
        """Export each matching row: try insert first; on failure fall back to
        an update (when unique_key_mapping_source_key is set); then optionally
        sync to Elasticsearch and invoke the callback.

        @result: number of rows inserted + updated
        """
        if self._unique_key and not self._is_set_unique_key:
            self._aim_db.set_unique_key(self._aim_table, self._unique_key)
            self._is_set_unique_key = True

        # Rows come either from MongoDB or from the datas passed in directly.
        datas = self._mongodb.find(
            self._source_table,
            condition=self._condition) if self._mongodb else (
                self._datas
                if isinstance(self._datas, list) else [self._datas])

        for data in datas:
            if self._unique_key_mapping_source_key:
                insert_sql, update_sql, data_json = self.make_sql(data)
            else:
                insert_sql, data_json = self.make_sql(data)

            def exception_callfunc(e):
                # ORA-00001 = unique-constraint violation: the row already
                # exists, so mark it read; anything else is a real error.
                if 'ORA-00001' in str(e):
                    if self._update_read_status:
                        self._mongodb.update(self._source_table, data,
                                             {'read_status': 1})
                else:
                    log.error(insert_sql)

            execute_type = ExportData.EXCEPTION
            sql = ''

            if self._aim_db.add(insert_sql, exception_callfunc):
                self._export_count += 1
                sql = insert_sql
                execute_type = ExportData.INSERT
                if self._update_read_status:
                    self._mongodb.update(self._source_table, data,
                                         {'read_status': 1})
            elif self._unique_key_mapping_source_key:
                # Insert failed (most likely a duplicate): locate the existing
                # row's id and update it in place.
                aim_key = tuple(
                    self._unique_key_mapping_source_key.keys())[0]
                value = tuple(
                    self._unique_key_mapping_source_key.values())[0]
                temp_var = value.split('_', 1)
                source_key_types = temp_var[0]
                source_key = temp_var[1]
                select_sql = 'select id from ' + self._aim_table
                if source_key_types == 'str':
                    select_sql = select_sql + " where %s = '%s'" % (
                        aim_key, data[source_key])
                elif source_key_types == 'int':
                    select_sql = select_sql + " where %s = %s" % (
                        aim_key, data[source_key])

                data_id = self._aim_db.find(select_sql)
                if data_id:
                    data_id = data_id[0][0]
                else:
                    continue

                update_sql += " where id = %s" % data_id
                log.debug(update_sql)

                # Strip any "id = xxx," assignment from the SET clause so the
                # existing row keeps its id after the update.
                id_info = ''.join(
                    tools.get_info(update_sql, [' id .*?,', ' ID .*?,']))
                update_sql = update_sql.replace(id_info, '')

                # Mirror the preserved id into data_json for the ES sync.
                if "ID" in data_json.keys():
                    data_json["ID"] = data_id

                if self._aim_db.update(update_sql):
                    self._update_count += 1
                    sql = update_sql
                    execute_type = ExportData.UPDATE
                    if self._update_read_status:
                        self._mongodb.update(self._source_table, data,
                                             {'read_status': 1})

            # Sync the row to Elasticsearch when configured and not failed.
            if self._sync_to_es and execute_type != ExportData.EXCEPTION:
                self._es.add(table=self._aim_table,
                             data=data_json,
                             data_id=data_json.get('ID'))

            if self._callback:
                self._callback(execute_type, sql, data_json)

        log.debug('''
            共导出%s条数据
            共更新%s条数据
            ''' % (self._export_count, self._update_count))

        return self._export_count + self._update_count

    def close(self):
        self._aim_db.close()
class Collector(threading.Thread):
    """Background thread that pulls TODO urls from MongoDB into an in-memory
    list (self._urls), from which worker threads take batches via get_urls().
    """

    def __init__(self, tab_urls):
        super(Collector, self).__init__()
        self._lock = threading.RLock()  # guards self._urls
        self._db = MongoDB()
        self._thread_stop = False
        self._urls = []
        self._null_times = 0  # consecutive polls that found the list empty
        self._tab_urls = tab_urls
        self._depth = int(
            tools.get_conf_value('config.conf', "collector", "depth"))
        self._max_size = int(
            tools.get_conf_value('config.conf', "collector", "max_size"))
        self._interval = int(
            tools.get_conf_value('config.conf', "collector", "sleep_time"))
        self._allowed_null_times = int(
            tools.get_conf_value('config.conf', "collector",
                                 'allowed_null_times'))
        self._url_count = int(
            tools.get_conf_value('config.conf', "collector", "url_count"))

        # On start-up, move any url stuck in DOING (e.g. after a crash)
        # back to TODO so it is picked up again.
        self._db.update(self._tab_urls, {'status': Constance.DOING},
                        {'status': Constance.TODO})

        self._finished_callback = None

    def run(self):
        # Poll the database until stop() is called.
        while not self._thread_stop:
            self.__input_data()
            time.sleep(self._interval)

    def stop(self):
        self._thread_stop = True
        if self._finished_callback:
            self._finished_callback()

    # @tools.log_function_time
    def __input_data(self):
        """Fetch a batch of TODO urls, mark them DOING and buffer them."""
        if len(self._urls) > self._url_count:
            # Enough work is buffered already.
            return

        urls_list = []
        if self._depth:
            urls_list = self._db.find(self._tab_urls, {
                "status": Constance.TODO,
                "depth": {
                    "$lte": self._depth
                }
            },
                                      limit=self._url_count)
        else:
            urls_list = self._db.find(self._tab_urls,
                                      {"status": Constance.TODO},
                                      limit=self._url_count)

        # Mark the fetched urls as DOING so they are not handed out twice.
        for url in urls_list:
            self._db.update(self._tab_urls, url, {'status': Constance.DOING})

        self.put_urls(urls_list)

        if self.is_all_have_done():
            print('is_all_have_done')
            self.stop()

    def is_finished(self):
        return self._thread_stop

    def add_finished_callback(self, callback):
        # Called once when the collector stops.
        self._finished_callback = callback

    def is_all_have_done(self):
        """Return True when the buffer has been empty for _allowed_null_times
        consecutive polls and nothing is left DOING in the DB."""
        print('判断是否有未做的url ')
        if len(self._urls) == 0:
            self._null_times += 1
            if self._null_times >= self._allowed_null_times:
                # Check whether any url is still being worked on.
                urls_doing = self._db.find(self._tab_urls,
                                           {'status': Constance.DOING})
                if urls_doing:
                    # Urls are still in flight, so the pipeline is not stuck.
                    print('有未做的url %s' % len(urls_doing))
                    self._null_times = 0
                    return False
                else:
                    return True
            else:
                return False
        else:
            self._null_times = 0
            return False

    # @tools.log_function_time
    def put_urls(self, urls_list):
        # BUGFIX: extend under the same lock that get_urls uses; the original
        # appended without locking, so a concurrent slice + del in get_urls
        # could interleave with the extend and lose or duplicate urls.
        with self._lock:
            self._urls.extend(urls_list)

    # @tools.log_function_time
    def get_urls(self, count):
        """Pop up to ``count`` urls from the buffer (thread-safe)."""
        with self._lock:
            urls = self._urls[:count]
            del self._urls[:count]
        return urls