import os
import time

from hdfs import Client


def upload(name, file_path, config):
    env_prefix = config.get("prefix", None)
    hdfs_client = Client(url=config["hdfs"]["name_node"])
    hdfs_hosts = []
    hdfs_http_host = config["hdfs"]["name_node"]
    hdfs_hosts.append(hdfs_http_host.replace("http://", ""))
    # root folder on HDFS, optionally prefixed per environment
    hdfs_data_service_root = "/data_service"
    if env_prefix is not None:
        hdfs_data_service_root = "/{0}_data_service".format(env_prefix)
    hdfs_client.makedirs(hdfs_data_service_root)
    # timestamped target folder: <root>/<name>/<timestamp>/
    timestamp = int(round(time.time() * 1000))
    target_file_name = "{2}/{0}/{1}/{0}_{1}.py".format(name, str(timestamp), hdfs_data_service_root)
    hdfs_client.makedirs("{2}/{0}/{1}".format(name, str(timestamp), hdfs_data_service_root))
    print("hdfs file name: {0}".format(target_file_name))
    hdfs_client.upload(target_file_name, file_path)
    zip_path = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "joowing.zip")
    target_zp_file_name = "{2}/{0}/{1}/joowing.zip".format(name, str(timestamp), hdfs_data_service_root)
    # hdfs_client.upload(target_zp_file_name, zip_path)
    # return target_file_name, target_zp_file_name
    return target_file_name
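A minimal call sketch for the upload() helper above; the config layout mirrors the keys it reads ("prefix" and hdfs.name_node), while the job name, URL, and paths are placeholders, not part of the original code.

# Hypothetical call; values are placeholders.
config = {
    "prefix": "dev",  # optional; when omitted, files land under /data_service
    "hdfs": {"name_node": "http://namenode:50070"},
}
# would create /dev_data_service/my_job/<timestamp>/my_job_<timestamp>.py on HDFS and return that path
hdfs_path = upload("my_job", "/tmp/my_job.py", config)
print(hdfs_path)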
import time

from hdfs import Client

# bizdate, CtrlHive, rowkey and logger are assumed to be provided elsewhere in the project.


class HdfsPipeline(object):

    def __init__(self, **kwargs):
        self.table_cols_map = {}  # column order per table {table: (cols, col_default)}
        self.bizdate = bizdate    # business date = the date the spider was started
        self.buckets_map = {}     # buckets {table: items}
        self.bucketsize = kwargs.get('BUCKETSIZE')
        self.client = Client(kwargs.get('HDFS_URLS'))
        self.dir = kwargs.get('HDFS_FOLDER')           # target folder path
        self.delimiter = kwargs.get('HDFS_DELIMITER')  # column delimiter, defaults to the Hive default
        self.encoding = kwargs.get('HDFS_ENCODING')    # file encoding, defaults to 'utf-8'
        self.hive_host = kwargs.get('HIVE_HOST')
        self.hive_port = kwargs.get('HIVE_PORT')
        self.hive_dbname = kwargs.get('HIVE_DBNAME')   # database name
        self.hive_auto_create = kwargs.get('HIVE_AUTO_CREATE', False)  # auto-create Hive tables, defaults to False
        self.client.makedirs(self.dir)

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        return cls(**settings)

    def process_item(self, item, spider):
        """
        :param item:
        :param spider:
        :return: route each item into the bucket for its table
        """
        if item.tablename in self.buckets_map:
            self.buckets_map[item.tablename].append(item)
        else:
            cols, col_default = [], {}
            for field, value in item.fields.items():
                cols.append(field)
                col_default[field] = item.fields[field].get('default', '')
            cols.sort(key=lambda x: item.fields[x].get('idx', 1))
            self.table_cols_map.setdefault(item.tablename, (cols, col_default))  # record schema: column order and defaults
            self.buckets_map.setdefault(item.tablename, [item])
            if self.hive_auto_create:
                self.checktable(item.tablename, cols)  # create the Hive table
        self.buckets2db(bucketsize=self.bucketsize, spider_name=spider.name)  # flush buckets that are full enough
        return item

    def close_spider(self, spider):
        """
        :param spider:
        :return: when the spider closes, flush whatever is left in the buckets
        """
        self.buckets2db(bucketsize=1, spider_name=spider.name)

    def checktable(self, tbname, cols):
        """
        :return: create the Hive table if it does not exist
        """
        hive = CtrlHive(self.hive_host, self.hive_port, self.hive_dbname)
        cols = ['keyid'] + cols + ['bizdate', 'ctime', 'spider']
        create_sql = f"create table if not exists {tbname}({' string,'.join(cols)} string)"
        hive.execute(create_sql)
        logger.info(f"table created <= table: {tbname}")

    def buckets2db(self, bucketsize=100, spider_name=''):
        """
        :param bucketsize: bucket size threshold
        :param spider_name: spider name
        :return: walk every bucket; write out and clear the ones that reached the threshold
        """
        for tablename, items in self.buckets_map.items():
            if len(items) >= bucketsize:
                new_items = []
                cols, col_default = self.table_cols_map.get(tablename)
                for item in items:
                    keyid = rowkey()
                    new_item = {'keyid': keyid}
                    for field in cols:
                        value = item.get(field, col_default.get(field))
                        new_item[field] = str(value).replace(self.delimiter, '').replace('\n', '')
                    new_item['bizdate'] = self.bizdate  # add the non-business columns
                    new_item['ctime'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    new_item['spider'] = spider_name
                    value = self.delimiter.join(new_item.values())
                    new_items.append(value)

                # every table gets its own folder
                folder = f"{self.dir}/{tablename}"
                self.client.makedirs(folder)
                filename = f"{folder}/data.txt"
                info = self.client.status(filename, strict=False)
                if not info:
                    self.client.write(filename, data='', overwrite=True, encoding=self.encoding)

                try:
                    content = '\n'.join(new_items) + '\n'
                    self.client.write(filename, data=content, overwrite=False, append=True, encoding=self.encoding)
                    logger.info(f"write succeeded <= file: {filename} records: {len(items)}")
                    items.clear()  # empty the bucket
                except Exception as e:
                    logger.error(f"write failed <= file: {filename} reason: {e}")
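A sketch of the Scrapy settings this pipeline reads via kwargs.get(...); the module path in ITEM_PIPELINES and all values are illustrative assumptions, not taken from the original project.

# settings.py (hypothetical; values are placeholders)
ITEM_PIPELINES = {"myproject.pipelines.HdfsPipeline": 300}  # hypothetical module path

BUCKETSIZE = 100                        # flush a table's bucket once it holds this many items
HDFS_URLS = "http://namenode:50070"     # WebHDFS endpoint passed to hdfs.Client
HDFS_FOLDER = "/user/crawler/data"      # root folder; one sub-folder per table
HDFS_DELIMITER = "\001"                 # Hive's default field delimiter (Ctrl-A)
HDFS_ENCODING = "utf-8"
HIVE_HOST = "hive-server"
HIVE_PORT = 10000
HIVE_DBNAME = "crawl"
HIVE_AUTO_CREATE = True                 # let the pipeline issue CREATE TABLE IF NOT EXISTS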
from hdfs import Client
from hdfs.util import HdfsError


class HDFSUtil:

    def __init__(self, url):
        self._client = Client(url)

    def make_dir(self, hdfs_path):
        """
        Create a directory, recursively creating parent levels as needed
        :param hdfs_path:
        :return:
        """
        self._client.makedirs(hdfs_path)

    def delete_hdfs_file(self, hdfs_path):
        """
        Delete an HDFS file; if the path is a directory, it must be empty
        :param hdfs_path:
        :return:
        """
        self._client.delete(hdfs_path)

    def delete_hdfs_dir(self, hdfs_dir):
        """
        Delete an HDFS file/directory; non-empty directories are removed recursively
        :param hdfs_dir:
        :return:
        """
        dir_list = self.hdfs_dir_list(hdfs_dir)
        if dir_list is None or len(dir_list) == 0:
            print('Delete File: {0}'.format(hdfs_dir))
            self._client.delete(hdfs_dir)
        else:
            for file_name in dir_list:
                self.delete_hdfs_dir(hdfs_dir + '/' + file_name)
            print('Delete Dir: {0}'.format(hdfs_dir))
            self._client.delete(hdfs_dir)

    def upload_to_hdfs(self, local_path, hdfs_path):
        """
        Upload a local file/directory to HDFS; missing target directories are created automatically
        :param local_path:
        :param hdfs_path:
        :return:
        """
        self._client.upload(hdfs_path, local_path, cleanup=True)

    def download_from_hdfs(self, hdfs_path, local_path):
        """
        Download an HDFS file/directory to the local filesystem
        :param hdfs_path:
        :param local_path:
        :return:
        """
        self._client.download(hdfs_path, local_path, overwrite=True)

    def write_to_hdfs(self, hdfs_path, data, overwrite=False, append=True):
        """
        Append:    overwrite=False, append=True  => default
        Overwrite: overwrite=True,  append=False
        overwrite and append are mutually exclusive
        :param hdfs_path:
        :param data:
        :param overwrite: Boolean, overwrite existing content
        :param append: Boolean, append to existing content
        :return:
        """
        if not self._client.content(hdfs_path, strict=False):
            print('File Not exist in HDFS')
        self._client.write(hdfs_path, data, overwrite=overwrite, append=append)

    def move_or_rename(self, hdfs_src_path, hdfs_dst_path):
        """
        Move/rename a file
        :param hdfs_src_path:
        :param hdfs_dst_path:
        :return:
        """
        self._client.rename(hdfs_src_path, hdfs_dst_path)

    def hdfs_dir_list(self, hdfs_path):
        """
        List the files under a directory; if hdfs_path is not a directory,
        catch the exception and return None
        :param hdfs_path:
        :return: List[filename] or None
        """
        try:
            return self._client.list(hdfs_path, status=False)
        except HdfsError:
            return None
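A short usage sketch for HDFSUtil, assuming a placeholder NameNode URL and paths; it exercises the mutually exclusive overwrite/append modes described in write_to_hdfs.

# Hypothetical example; the URL and paths are placeholders.
util = HDFSUtil("http://namenode:50070")
util.make_dir("/tmp/demo")
util.write_to_hdfs("/tmp/demo/a.txt", "first line\n", overwrite=True, append=False)
util.write_to_hdfs("/tmp/demo/a.txt", "second line\n")    # default mode: append
print(util.hdfs_dir_list("/tmp/demo"))                    # ['a.txt']
util.move_or_rename("/tmp/demo/a.txt", "/tmp/demo/b.txt")
util.download_from_hdfs("/tmp/demo/b.txt", "./b.txt")
util.delete_hdfs_dir("/tmp/demo")                         # recursive delete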
# @Describe: working with HDFS

from hdfs import Client

HDFS_CLIENT = "http://172.16.122.21:50070;http://172.16.122.24:50070"
file_dir = '/tmp/way'
file_name = '/tmp/way/test.txt'
file_name2 = '/tmp/way/test123.txt'
local_file_name = 'test.txt'

client = Client(HDFS_CLIENT)

# create the directory
client.makedirs(file_dir)

# fetch status for the target path (None if it does not exist)
info = client.status(file_name, strict=False)
print(info)

# write to the file (overwrite)
client.write(file_name, data="hello hdfs !", overwrite=True)

# write to the file (append)
client.write(file_name, data="hello way !", overwrite=False, append=True)

# read the file contents
with client.read(file_name, encoding='utf-8') as f:
    print(f.read())
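A few follow-on calls in the same style, sketched with the variables defined above; rename, list, download, and delete are standard hdfs.Client methods.

# rename/move the file to the second path defined above
client.rename(file_name, file_name2)

# list the directory contents
print(client.list(file_dir))

# download the file back to the local working directory
client.download(file_name2, local_file_name, overwrite=True)

# remove the file, then the (now empty) directory
client.delete(file_name2)
client.delete(file_dir, recursive=True)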
import json
from urllib import parse

from hdfs import Client
from requests import request  # imports inferred from usage below


class OpenPAI:
    """Client for OpenPAI."""

    def __init__(self, config: dict = None, file: str = 'openpai.json'):
        """config should contain
        - rest_server_socket
        - hdfs_web_socket
        - user
        - password
        """
        if config is None:
            with open(file) as fn:
                config = json.load(fn)
        for key in ['rest_server_socket', 'hdfs_web_socket', 'user', 'password']:
            assert key in config, '%s is not defined for OpenPAI' % (key)
        for key in ['rest_server_socket', 'hdfs_web_socket']:
            assert config[key].startswith('http://'), '%s should have http prefix' % (key)
        self.rest_server_socket = config['rest_server_socket']
        self.hdfs_client = Client(config['hdfs_web_socket'])
        self.config = config

    def __get_token(self, user: str, password: str):
        try:
            response = request(
                'POST',
                parse.urljoin(self.rest_server_socket, 'token'),
                headers={
                    'Content-Type': 'application/x-www-form-urlencoded',
                },
                data='username={}&password={}&expiration=30000'.format(user, password),
            )
            if response.status_code == 200:
                return response.json()['token']
            raise Exception(response.reason)
        except Exception as identifier:
            raise Exception(identifier)

    def upload(self, local_path, hdfs_dir, overwrite=True):
        try:
            self.hdfs_client.upload(hdfs_dir, local_path, overwrite=overwrite)
            return True
        except Exception as e:
            print(e)
            return False

    def mkdir(self, hdfs_dir):
        try:
            self.hdfs_client.makedirs(hdfs_dir)
            return True
        except Exception:
            return False

    def submit_job(self, job_config):
        self.token = self.__get_token(self.config['user'], self.config['password'])
        response = request(
            'POST',
            parse.urljoin(self.rest_server_socket, 'jobs'),
            headers={
                'Authorization': 'Bearer ' + self.token,
                'Content-Type': 'application/json'
            },
            json=job_config,
        )
        if response.status_code != 200 and response.status_code != 202:
            raise Exception(response.reason)
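A hedged usage sketch for the OpenPAI wrapper above; the sockets, credentials, and job_config contents are placeholders (a real job_config must follow the OpenPAI job schema).

# Hypothetical configuration; endpoints and credentials are placeholders.
pai = OpenPAI(config={
    'rest_server_socket': 'http://pai-master:9186/api/v1/',  # trailing slash so urljoin appends 'token'/'jobs'
    'hdfs_web_socket': 'http://pai-master:50070',
    'user': 'alice',
    'password': 'secret',
})
pai.mkdir('/user/alice/jobs/demo')
pai.upload('train.py', '/user/alice/jobs/demo')   # returns True on success, False on failure
job_config = {'jobName': 'demo'}                  # placeholder; real configs follow the OpenPAI job schema
pai.submit_job(job_config)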
from hdfs import Client
from hdfs.util import HdfsError

# WebHDFS (the Kerberos/cookie login helper) and HdfsLibraryError are assumed to be
# provided by the surrounding library.


class RF_HDFS(object):

    def __init__(self):
        self.client = None
        self.directory = None

    def connect_and_login(self, **kwargs):
        import requests

        host = None
        port = None
        user = None
        password = None
        root = None
        timeout = None
        proxy = None

        if 'host' in kwargs:
            host = kwargs['host']
        if 'port' in kwargs:
            port = kwargs['port']
        if 'kdc' in kwargs:
            kdc = kwargs['kdc']
        if 'user' in kwargs:
            user = kwargs['user']
        if 'password' in kwargs:
            password = kwargs['password']
        if 'root' in kwargs:
            root = kwargs['root']
        if 'proxy' in kwargs:
            proxy = kwargs['proxy']
        if 'timeout' in kwargs:
            timeout = kwargs['timeout']

        self.session = requests.Session()
        adapter = requests.adapters.HTTPAdapter(pool_maxsize=0)
        self.session.mount('http://', adapter)
        self.session.mount('https://', adapter)
        self.session.headers.update({'Connection': 'Keep-Alive'})

        self.connectionStatus = False
        try:
            timeout = int(timeout)
            url = "http://" + host + ":" + str(port)
            hdfsLogin = WebHDFS(url, kdc)
            cookieStr = hdfsLogin.authenticate(user, password)
            if cookieStr is not None:
                cookieList = cookieStr.split('=', 1)
                cookieDict = {cookieList[0]: cookieList[1]}
                requests.utils.add_dict_to_cookiejar(self.session.cookies, cookieDict)
            self.client = Client(url, root=root, proxy=proxy, timeout=timeout, session=self.session)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        self.connectionStatus = True
        return self.client

    def checkConnectionStatus(self):
        return self.connectionStatus

    def list_dir(self, directory):
        output = []
        try:
            if directory is not None:
                output = self.client.list(directory, status=True)
            else:
                output = self.client.list(self.client.root, status=True)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def list_names(self, directory):
        output = []
        try:
            if directory is not None:
                output = self.client.list(directory, status=False)
            else:
                output = self.client.list(self.client.root, status=False)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def upload(self, remote_path, local_path, overwrite=False, permission=None):
        output = None
        try:
            output = self.client.upload(remote_path, local_path, overwrite, permission=permission)
        except HdfsError as hdfsError:
            # For some reason this exception includes the entire stack trace after
            # the error message, so split on '\n' and only return the first line.
            error = str(hdfsError).splitlines()[0]
            raise HdfsLibraryError(error)
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def download(self, remote_path, local_path, overwrite=False):
        output = None
        try:
            output = self.client.download(remote_path, local_path, overwrite)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def mkdir(self, directory, permission):
        try:
            # no return value
            self.client.makedirs(directory, permission=permission)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def rmdir(self, directory):
        try:
            # no return value
            if not self.client.delete(directory, recursive=True):
                raise HdfsLibraryError("Directory does not exist: %r", directory)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def rename(self, src_file, dst_file):
        try:
            # no return value
            self.client.rename(src_file, dst_file)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def delete(self, file):
        try:
            # no return value
            if not self.client.delete(file):
                raise HdfsLibraryError("File does not exist: %r", file)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def set_time(self, file, mod_time):
        try:
            # no return value
            self.client.set_times(file, -1, mod_time)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def set_owner(self, file, owner, group):
        try:
            # no return value
            self.client.set_owner(file, owner=owner, group=group)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def set_permission(self, file, permission):
        try:
            # no return value
            self.client.set_permission(file, permission=permission)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def set_acl(self, file, aclspec):
        try:
            # no return value
            self.client.set_acl(file, aclspec=aclspec)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def status(self, path):
        output = ''
        try:
            output = self.client.status(path)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def checksum(self, path):
        output = ''
        try:
            output = self.client.checksum(path)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def close(self):
        self.session.close()
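A usage sketch for RF_HDFS with placeholder host, credentials, and paths; the kdc value is whatever the library's WebHDFS login helper expects.

# Hypothetical keyword arguments; all values are placeholders.
rf = RF_HDFS()
rf.connect_and_login(host='namenode', port=50070, kdc='EXAMPLE.COM',
                     user='alice', password='secret', root='/', timeout=30)
print(rf.list_names('/tmp'))
rf.mkdir('/tmp/demo', permission='755')
rf.upload('/tmp/demo/a.txt', './a.txt')
print(rf.status('/tmp/demo/a.txt'))
rf.delete('/tmp/demo/a.txt')
rf.rmdir('/tmp/demo')
rf.close()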