from hdfs import Client


class BiliaprioriPipeline(object):

    def __init__(self):
        self.client = Client("http://fantome:50070")

    def process_item(self, item, spider):
        print("Get Tag", " tag:", item['tagInfo'])
        # Keep appending each tag to the same HDFS file
        self.client.write('/bili_3-7day/tagInfo.txt', item['tagInfo'] + "\n",
                          overwrite=False, append=True, encoding="utf-8")
        # return the item so any later pipelines keep receiving it
        return item
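To actually run this pipeline, Scrapy needs it registered in the project settings. A minimal sketch, assuming a hypothetical module path (replace bili.pipelines with wherever BiliaprioriPipeline is defined):

# settings.py (sketch) -- 'bili.pipelines' is a hypothetical module path
ITEM_PIPELINES = {
    'bili.pipelines.BiliaprioriPipeline': 300,
}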
from hdfs import Client, HdfsError


class HDFSUtil:

    def __init__(self, url):
        self._client = Client(url)

    def make_dir(self, hdfs_path):
        """Create a directory; intermediate directories are created recursively."""
        self._client.makedirs(hdfs_path)

    def delete_hdfs_file(self, hdfs_path):
        """Delete an HDFS file. If the path is a directory, it must be empty."""
        self._client.delete(hdfs_path)

    def delete_hdfs_dir(self, hdfs_dir):
        """Delete an HDFS file or directory; non-empty directories are deleted recursively."""
        dir_list = self.hdfs_dir_list(hdfs_dir)
        if dir_list is None or len(dir_list) == 0:
            print('Delete File: {0}'.format(hdfs_dir))
            self._client.delete(hdfs_dir)
        else:
            for file_name in dir_list:
                self.delete_hdfs_dir(hdfs_dir + '/' + file_name)
            print('Delete Dir: {0}'.format(hdfs_dir))
            self._client.delete(hdfs_dir)

    def upload_to_hdfs(self, local_path, hdfs_path):
        """Upload a local file or directory to HDFS; missing target directories are created automatically."""
        self._client.upload(hdfs_path, local_path, cleanup=True)

    def download_from_hdfs(self, hdfs_path, local_path):
        """Download an HDFS file or directory to the local filesystem."""
        self._client.download(hdfs_path, local_path, overwrite=True)

    def write_to_hdfs(self, hdfs_path, data, overwrite=False, append=True):
        """Write data to an HDFS file.

        Append:    overwrite=False, append=True  (default)
        Overwrite: overwrite=True,  append=False
        overwrite and append are mutually exclusive.
        """
        if not self._client.content(hdfs_path, strict=False):
            print('File does not exist in HDFS')
        self._client.write(hdfs_path, data, overwrite=overwrite, append=append)

    def move_or_rename(self, hdfs_src_path, hdfs_dst_path):
        """Move or rename a file."""
        self._client.rename(hdfs_src_path, hdfs_dst_path)

    def hdfs_dir_list(self, hdfs_path):
        """List the files under hdfs_path.

        If hdfs_path is not a directory, catch the exception and return None.
        :return: List[filename] or None
        """
        try:
            return self._client.list(hdfs_path, status=False)
        except HdfsError:
            return None
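A short usage sketch of the wrapper above. The WebHDFS address reuses the one from the first pipeline; the /demo paths are purely illustrative:

util = HDFSUtil('http://fantome:50070')   # WebHDFS address, as in BiliaprioriPipeline
util.make_dir('/demo/dir')                                                           # recursive mkdir
util.write_to_hdfs('/demo/dir/data.txt', 'line 1\n', overwrite=True, append=False)   # create / overwrite
util.write_to_hdfs('/demo/dir/data.txt', 'line 2\n')                                 # append (the default)
print(util.hdfs_dir_list('/demo/dir'))                                               # ['data.txt']
util.move_or_rename('/demo/dir/data.txt', '/demo/dir/data_old.txt')
util.delete_hdfs_dir('/demo/dir')                                                    # recursive delete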
import time
import logging

from hdfs import Client

# bizdate, rowkey and CtrlHive are project-level helpers (business date,
# row-key generator and a thin Hive wrapper) assumed to be imported from
# elsewhere in the project.

logger = logging.getLogger(__name__)


class HdfsPipeline(object):

    def __init__(self, **kwargs):
        self.table_cols_map = {}  # column order per table: {table: (cols, col_default)}
        self.bizdate = bizdate  # business date = the date the spider was started
        self.buckets_map = {}  # buckets: {table: items}
        self.bucketsize = kwargs.get('BUCKETSIZE')
        self.client = Client(kwargs.get('HDFS_URLS'))
        self.dir = kwargs.get('HDFS_FOLDER')  # target folder
        self.delimiter = kwargs.get('HDFS_DELIMITER')  # column delimiter, defaults to Hive's default delimiter
        self.encoding = kwargs.get('HDFS_ENCODING')  # file encoding, default 'utf-8'
        self.hive_host = kwargs.get('HIVE_HOST')
        self.hive_port = kwargs.get('HIVE_PORT')
        self.hive_dbname = kwargs.get('HIVE_DBNAME')  # database name
        self.hive_auto_create = kwargs.get('HIVE_AUTO_CREATE', False)  # auto-create Hive tables, default False
        self.client.makedirs(self.dir)

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        return cls(**settings)

    def process_item(self, item, spider):
        """Route each item into a per-table bucket and flush full buckets to HDFS."""
        if item.tablename in self.buckets_map:
            self.buckets_map[item.tablename].append(item)
        else:
            cols, col_default = [], {}
            for field, value in item.fields.items():
                cols.append(field)
                col_default[field] = item.fields[field].get('default', '')
            cols.sort(key=lambda x: item.fields[x].get('idx', 1))
            self.table_cols_map.setdefault(item.tablename, (cols, col_default))  # record schema: column order and defaults
            self.buckets_map.setdefault(item.tablename, [item])
            if self.hive_auto_create:
                self.checktable(item.tablename, cols)  # create the Hive table
        self.buckets2db(bucketsize=self.bucketsize, spider_name=spider.name)  # flush buckets that reached the threshold
        return item

    def close_spider(self, spider):
        """When the spider closes, flush whatever is left in the buckets."""
        self.buckets2db(bucketsize=1, spider_name=spider.name)

    def checktable(self, tbname, cols):
        """Create the Hive table if it does not exist."""
        hive = CtrlHive(self.hive_host, self.hive_port, self.hive_dbname)
        cols = ['keyid'] + cols + ['bizdate', 'ctime', 'spider']
        create_sql = f"create table if not exists {tbname}({' string,'.join(cols)} string)"
        hive.execute(create_sql)
        logger.info(f"Table created <= table name: {tbname}")

    def buckets2db(self, bucketsize=100, spider_name=''):
        """Walk every bucket; write buckets holding at least `bucketsize` items to HDFS, then clear them.

        :param bucketsize: bucket threshold
        :param spider_name: spider name
        """
        for tablename, items in self.buckets_map.items():
            if len(items) >= bucketsize:
                new_items = []
                cols, col_default = self.table_cols_map.get(tablename)
                for item in items:
                    keyid = rowkey()
                    new_item = {'keyid': keyid}
                    for field in cols:
                        value = item.get(field, col_default.get(field))
                        new_item[field] = str(value).replace(self.delimiter, '').replace('\n', '')
                    new_item['bizdate'] = self.bizdate  # add non-business (audit) fields
                    new_item['ctime'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    new_item['spider'] = spider_name
                    value = self.delimiter.join(new_item.values())
                    new_items.append(value)

                # each table gets its own folder
                folder = f"{self.dir}/{tablename}"
                self.client.makedirs(folder)
                filename = f"{folder}/data.txt"
                info = self.client.status(filename, strict=False)
                if not info:
                    self.client.write(filename, data='', overwrite=True, encoding=self.encoding)

                try:
                    content = '\n'.join(new_items) + '\n'
                    self.client.write(filename, data=content, overwrite=False,
                                      append=True, encoding=self.encoding)
                    logger.info(f"Saved <= file: {filename}, records: {len(items)}")
                    items.clear()  # empty the bucket
                except Exception as e:
                    logger.error(f"Save failed <= file: {filename}, reason: {e}")
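HdfsPipeline pulls all of its configuration from crawler.settings in from_crawler, so every key it looks up has to exist in the project settings. A sketch of the corresponding settings.py entries; the module path, target folder and Hive host are illustrative assumptions, not values from the source:

# settings.py (sketch) -- values are examples only
ITEM_PIPELINES = {
    'myproject.pipelines.HdfsPipeline': 300,   # hypothetical module path
}
BUCKETSIZE = 100                      # flush a table's bucket once it holds this many items
HDFS_URLS = 'http://fantome:50070'    # WebHDFS address (NameNode used in the examples above)
HDFS_FOLDER = '/user/scrapy/data'     # hypothetical HDFS target folder
HDFS_DELIMITER = '\001'               # Hive's default field delimiter (Ctrl-A)
HDFS_ENCODING = 'utf-8'
HIVE_HOST = 'fantome'                 # hypothetical HiveServer2 host
HIVE_PORT = 10000
HIVE_DBNAME = 'default'
HIVE_AUTO_CREATE = True               # let checktable() create missing Hive tables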
from hdfs import Client

HDFS_CLIENT = 'http://fantome:50070'  # WebHDFS address, same NameNode as the pipelines above

file_dir = '/tmp/way'
file_name = '/tmp/way/test.txt'
file_name2 = '/tmp/way/test123.txt'
local_file_name = 'test.txt'

client = Client(HDFS_CLIENT)

# Create the directory
client.makedirs(file_dir)

# Get status information for the target (None if it does not exist)
info = client.status(file_name, strict=False)
print(info)

# Write to a file (overwrite)
client.write(file_name, data="hello hdfs !", overwrite=True)

# Write to a file (append)
client.write(file_name, data="hello way !", overwrite=False, append=True)

# Read the file contents
with client.read(file_name, encoding='utf-8') as f:
    print(f.read())

# Download a file
client.download(file_name, local_file_name, overwrite=True)

# Upload a file
client.upload(file_name + '111', local_file_name, cleanup=True)

# Delete a file
client.delete(file_name)