def buckets2db(self, bucketsize=100, spider_name=''):
    """
    Flush every bucket that has accumulated at least *bucketsize* rows
    into its MySQL table, then empty the bucket.

    :param bucketsize: per-bucket flush threshold
    :param spider_name: spider name stamped onto every row
    :return: None
    """
    for tablename, items in self.buckets_map.items():
        if len(items) < bucketsize:
            continue  # bucket not full yet — leave it alone
        cols, col_default, col_type = self.table_cols_map.get(tablename)
        rows = []
        for item in items:
            row = {'keyid': rowkey()}
            for field in cols:
                row[field] = str(item.get(field, col_default.get(field)))
            # Non-business audit columns.
            row['bizdate'] = self.bizdate
            row['ctime'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            row['spider'] = spider_name
            rows.append(row)
        try:
            pd.DataFrame(rows).to_sql(tablename,
                                      con=self.engine,
                                      index=False,
                                      if_exists='append',
                                      dtype=col_type)
            logger.info(f"入库成功 <= 表名:{tablename} 记录数:{len(items)}")
            items.clear()  # empty the bucket only after a successful insert
        except Exception as e:
            logger.error(f"入库失败 <= 表名:{tablename} 错误原因:{e}")
def buckets2db(self, bucketsize=100, spider_name=''):
    """
    Flush every bucket that has accumulated at least *bucketsize* rows
    into its SQL table (schema inferred via get_struct), then empty it.

    :param bucketsize: per-bucket flush threshold
    :param spider_name: spider name stamped onto every row
    :return: None
    """
    for tablename, items in self.buckets_map.items():
        if len(items) < bucketsize:
            continue  # bucket not full yet — leave it alone
        rows = []
        for item in items:
            # keyid first, then all business fields from the item.
            row = {'keyid': rowkey(), **item}
            # Non-business audit columns.
            row['bizdate'] = self.bizdate
            row['ctime'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            row['spider'] = spider_name
            rows.append(row)
        try:
            dtypedict, columns = self.get_struct(rows[0], self.col_type)
            frame = pd.DataFrame(rows, columns=columns)
            frame.to_sql(tablename,
                         con=self.engine,
                         index=False,
                         if_exists='append',
                         dtype=dtypedict)
            logger.info(f"入库成功 <= 表名:{tablename} 记录数:{len(items)}")
            items.clear()  # empty the bucket only after a successful insert
        except Exception as e:
            logger.error(f"入库失败 <= 表名:{tablename} 错误原因:{e}")
def buckets2db(self, bucketsize=100, spider_name=''):
    """
    Flush every bucket that has accumulated at least *bucketsize* rows
    into its HBase table via a batch, then empty the bucket.

    :param bucketsize: per-bucket flush threshold
    :param spider_name: spider name stamped onto every row
    :return: None
    """
    for tablename, items in self.buckets_map.items():
        if len(items) < bucketsize:
            continue  # bucket not full yet — leave it alone
        connection = self.get_connect()
        try:
            table = connection.table(tablename)
            bat = table.batch()
            for item in items:
                keyid = rowkey()
                values = {'cf:' + key: value for key, value in item.items()}
                # Non-business audit columns.
                values['cf:bizdate'] = self.bizdate
                values['cf:ctime'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                                   time.localtime())
                values['cf:spider'] = spider_name
                bat.put(keyid, values)  # stage the cleaned row in the batch
            try:
                bat.send()  # one RPC for the whole batch
                logger.info(f"入库成功 <= 表名:{tablename} 记录数:{len(items)}")
                items.clear()  # empty the bucket only after a successful send
            except Exception as e:
                logger.error(f"入库失败 <= 表名:{tablename} 错误原因:{e}")
        finally:
            # BUGFIX: the original only closed the connection in the inner
            # try/finally around send(); an exception raised while building
            # the batch (table(), batch(), put()) leaked the connection.
            connection.close()
def buckets2db(self, bucketsize=100, spider_name=''):
    """
    Flush every bucket that has accumulated at least *bucketsize* rows
    into its MongoDB collection, then empty the bucket.

    :param bucketsize: per-bucket flush threshold
    :param spider_name: spider name stamped onto every document
    :return: None
    """
    for tablename, items in self.buckets_map.items():
        if len(items) < bucketsize:
            continue  # bucket not full yet — leave it alone
        docs = []
        for item in items:
            # _id first, then all business fields from the item.
            doc = {'_id': rowkey(), **item}
            # Non-business audit fields.
            doc['bizdate'] = self.bizdate
            doc['ctime'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            doc['spider'] = spider_name
            docs.append(doc)
        try:
            self.mongodb[tablename].insert_many(docs)
            logger.info(f"入库成功 <= 表名:{tablename} 记录数:{len(items)}")
            items.clear()  # empty the bucket only after a successful insert
        except Exception as e:
            logger.error(f"入库失败 <= 表名:{tablename} 错误原因:{e}")
def buckets2db(self, bucketsize=None):
    """
    Flush every bucket that has reached the threshold into its HBase
    table via a batch, then empty the bucket.

    :param bucketsize: per-bucket flush threshold; defaults to self.bucketsize
    :return: None
    """
    if bucketsize is None:
        bucketsize = self.bucketsize
    for tablename, items in self.buckets_map.items():
        if len(items) < bucketsize:
            continue  # bucket not full yet — leave it alone
        cols, col_default = self.table_cols_map.get(tablename)
        connection = self.get_connect()
        try:
            table = connection.table(tablename)
            bat = table.batch()
            for item in items:
                keyid = rowkey()
                values = {}
                for field in cols:
                    value = item.get(field, col_default.get(field))
                    values['cf:' + field] = str(value)
                # Non-business audit columns.
                values['cf:bizdate'] = self.bizdate
                values['cf:ctime'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                                   time.localtime())
                values['cf:spider'] = self.name
                bat.put(keyid, values)  # stage the cleaned row in the batch
            try:
                bat.send()  # one RPC for the whole batch
                logger.info(f"入库成功 <= 表名:{tablename} 记录数:{len(items)}")
                items.clear()  # empty the bucket only after a successful send
            except Exception as e:
                logger.error(f"入库失败 <= 表名:{tablename} 错误原因:{e}")
        finally:
            # BUGFIX: the original only closed the connection in the inner
            # try/finally around send(); an exception raised while building
            # the batch (table(), batch(), put()) leaked the connection.
            connection.close()
def buckets2db(self, bucketsize=None):
    """
    Flush every bucket that has reached the threshold into its MongoDB
    collection, then empty the bucket.

    :param bucketsize: per-bucket flush threshold; defaults to self.bucketsize
    :return: None
    """
    if bucketsize is None:
        bucketsize = self.bucketsize
    for tablename, items in self.buckets_map.items():
        if len(items) < bucketsize:
            continue  # bucket not full yet — leave it alone
        cols, col_default = self.table_cols_map.get(tablename)
        docs = []
        for item in items:
            doc = {'_id': rowkey()}
            for field in cols:
                doc[field] = str(item.get(field, col_default.get(field)))
            # Non-business audit fields.
            doc['bizdate'] = self.bizdate
            doc['ctime'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            doc['spider'] = self.name
            docs.append(doc)
        try:
            self.mongodb[tablename].insert_many(docs)
            logger.info(f"入库成功 <= 表名:{tablename} 记录数:{len(items)}")
            items.clear()  # empty the bucket only after a successful insert
        except Exception as e:
            logger.error(f"入库失败 <= 表名:{tablename} 错误原因:{e}")
def buckets2db(self, bucketsize=None):
    """
    Flush every bucket that has reached the threshold into a local
    delimited text file (one file per table under self.dir/self.name),
    falling back to line-by-line writes if the batch write fails.

    :param bucketsize: per-bucket flush threshold; defaults to self.bucketsize
    :return: None
    """
    if bucketsize is None:
        bucketsize = self.bucketsize
    for tablename, items in self.buckets_map.items():
        if len(items) < bucketsize:
            continue  # bucket not full yet — leave it alone
        header = ''
        lines = []
        cols, col_default = self.table_cols_map.get(tablename)
        for item in items:
            new_item = {'keyid': rowkey()}
            for field in cols:
                value = item.get(field, col_default.get(field))
                # Strip the delimiter and newlines so each record stays
                # on exactly one line of the output file.
                new_item[field] = str(value).replace(self.delimiter,
                                                     '').replace('\n', '')
            # Non-business audit columns.
            new_item['bizdate'] = self.bizdate
            new_item['ctime'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                              time.localtime())
            new_item['spider'] = self.name
            if not header:
                # Field order is identical for every record in the bucket,
                # so build the header once from the first record.
                header = self.delimiter.join(new_item.keys())
            lines.append(self.delimiter.join(new_item.values()))
        folder = f"{self.dir}/{self.name}"  # was misspelled 'fielder'
        os.makedirs(folder, exist_ok=True)
        filename = f"{folder}/{tablename}.{self.type}"
        if self.writeheader:
            # Write the header only for a new/empty file.
            if not os.path.exists(filename) or os.path.getsize(filename) == 0:
                with open(filename, 'w', encoding=self.encoding) as f:
                    f.writelines(header + '\n')
        try:
            with open(filename, 'a', encoding=self.encoding) as f:
                f.write('\n'.join(lines) + '\n')
            # BUGFIX: the batch-save log messages printed the literal text
            # "(unknown)" instead of the actual file name.
            logger.info(f"保存成功 <= 文件名:{filename} 记录数:{len(items)}")
        except Exception as e:
            logger.error(f"保存失败 <= 文件名:{filename} 错误原因:{e}")
            logger.warning(
                f"重新保存 <= 文件名:{tablename} 当前批次保存异常, 自动切换成逐行保存...")
            for line in lines:
                try:
                    with open(filename, 'a', encoding=self.encoding) as f:
                        f.write(line + '\n')
                    logger.info(f"保存成功 <= 文件名:{tablename} 记录数:1")
                except Exception as e:
                    logger.error(f"丢弃 <= 表名:{tablename} 丢弃原因:{e}")
        finally:
            items.clear()  # bucket is emptied even if some rows were dropped
def buckets2db(self, bucketsize=None):
    """
    Flush every bucket that has reached the threshold into Elasticsearch
    with one bulk request, falling back to document-by-document indexing
    if the bulk request fails.

    :param bucketsize: per-bucket flush threshold; defaults to self.bucketsize
    :return: None
    """
    if bucketsize is None:
        bucketsize = self.bucketsize
    for tablename, items in self.buckets_map.items():
        if len(items) < bucketsize:
            continue  # bucket not full yet — leave it alone
        cols, col_default = self.table_cols_map.get(tablename)
        actions = []
        for item in items:
            source = {}
            for field in cols:
                source[field] = item.get(field, col_default.get(field))
            # Non-business audit fields.
            source['bizdate'] = self.bizdate
            source['ctime'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                            time.localtime())
            source['spider'] = self.name
            actions.append({
                '_op_type': 'index',  # one of: index / update / create / delete
                '_index': self.name,
                '_id': rowkey(),
                '_type': tablename,
                '_source': source,
            })
        try:
            helpers.bulk(self.ES, actions=actions)
            logger.info(
                f"入库成功 <= 索引:{self.name}, 类型:{tablename} 记录数:{len(items)}"
            )
        except Exception as e:
            logger.error(
                f"入库失败 <= 索引:{self.name}, 类型:{tablename} 错误原因:{e}")
            logger.warning(
                f"重新入库 <= 表名:{tablename} 当前批次入库异常, 自动切换成逐行入库...")
            for action in actions:
                try:
                    helpers.bulk(self.ES, actions=[action])
                    logger.info(
                        f"入库成功 <= 索引:{self.name}, 类型:{tablename} 记录数:1"
                    )
                except Exception as e:
                    logger.error(
                        f"丢弃 <= 索引:{self.name}, 类型:{tablename} 丢弃原因:{e}"
                    )
        finally:
            items.clear()  # bucket is emptied even if some docs were dropped
def buckets2db(self, bucketsize=None):
    """
    Flush every bucket that has reached the threshold into its SQL table
    via pandas, falling back to row-by-row inserts if the batch fails.

    :param bucketsize: per-bucket flush threshold; defaults to self.bucketsize
    :return: None
    """
    if bucketsize is None:
        bucketsize = self.bucketsize
    for tablename, items in self.buckets_map.items():
        if len(items) < bucketsize:
            continue  # bucket not full yet — leave it alone
        cols, col_default, col_type = self.table_cols_map.get(tablename)
        rows = []
        for item in items:
            row = {'keyid': rowkey()}
            for field in cols:
                row[field] = str(item.get(field, col_default.get(field)))
            # Non-business audit columns.
            row['bizdate'] = self.bizdate
            row['ctime'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            row['spider'] = self.name
            rows.append(row)
        try:
            pd.DataFrame(rows).to_sql(tablename,
                                      con=self.engine,
                                      index=False,
                                      if_exists='append',
                                      dtype=col_type)
            logger.info(f"入库成功 <= 表名:{tablename} 记录数:{len(items)}")
        except Exception as e:
            logger.error(f"入库失败 <= 表名:{tablename} 错误原因:{e}")
            logger.warning(
                f"重新入库 <= 表名:{tablename} 当前批次入库异常, 自动切换成逐行入库...")
            for row in rows:
                try:
                    pd.DataFrame([row]).to_sql(tablename,
                                               con=self.engine,
                                               index=False,
                                               if_exists='append',
                                               dtype=col_type)
                    logger.info(f"入库成功 <= 表名:{tablename} 记录数:1")
                except Exception as e:
                    logger.error(f"丢弃 <= 表名:{tablename} 丢弃原因:{e}")
        finally:
            items.clear()  # bucket is emptied even if some rows were dropped
def buckets2db(self, bucketsize=None):
    """
    Flush every bucket that has reached the threshold to HDFS: one folder
    per table, records appended to that folder's data.txt.

    :param bucketsize: per-bucket flush threshold; defaults to self.bucketsize
    :return: None
    """
    if bucketsize is None:
        bucketsize = self.bucketsize
    for tablename, items in self.buckets_map.items():
        if len(items) < bucketsize:
            continue  # bucket not full yet — leave it alone
        lines = []
        cols, col_default = self.table_cols_map.get(tablename)
        for item in items:
            new_item = {'keyid': rowkey()}
            for field in cols:
                value = item.get(field, col_default.get(field))
                # Strip the delimiter and newlines so each record stays
                # on exactly one line of the output file.
                new_item[field] = str(value).replace(self.delimiter,
                                                     '').replace('\n', '')
            # Non-business audit columns.
            new_item['bizdate'] = self.bizdate
            new_item['ctime'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                              time.localtime())
            new_item['spider'] = self.name
            lines.append(self.delimiter.join(new_item.values()))
        # Each table gets its own folder on HDFS.
        folder = f"{self.dir}/{tablename}"
        self.client.makedirs(folder)
        filename = f"{folder}/data.txt"
        info = self.client.status(filename, strict=False)
        if not info:
            # Create an empty file first: appending requires an existing file.
            self.client.write(filename,
                              data='',
                              overwrite=True,
                              encoding=self.encoding)
        try:
            content = '\n'.join(lines) + '\n'
            self.client.write(filename,
                              data=content,
                              overwrite=False,
                              append=True,
                              encoding=self.encoding)
            # BUGFIX: the log messages printed the literal text "(unknown)"
            # instead of the actual file name.
            logger.info(f"保存成功 <= 文件名:{filename} 记录数:{len(items)}")
            items.clear()  # empty the bucket only after a successful write
        except Exception as e:
            logger.error(f"保存失败 <= 文件名:{filename} 错误原因:{e}")