def download(hdfs_path, local_path,
             hdfs_url='http://fisher.lazybone.xyz:9001', root='/'):
    """Download ``hdfs_path`` from HDFS to ``local_path`` unless it is already there.

    :param hdfs_path: Remote path handed to ``Client.download``.
    :param local_path: Local destination. When this path already exists the
        function prints a notice and returns without contacting HDFS.
    :param hdfs_url: URL the ``Client`` is created with. Defaults to the
        endpoint that used to be hard-coded here.
    :param root: Root path given to the ``Client``.
    :return: None
    """
    # Check the local target first so no client is built when there is
    # nothing to download.
    if os.path.exists(local_path):
        print('文件已存在')  # user-facing notice: "file already exists"
        return

    # Set up the HDFS connection only once it is actually needed.
    client = Client(hdfs_url, root=root)
    client.download(hdfs_path=hdfs_path, local_path=local_path)
def get(self, request):
    """Serve a data source stored on HDFS as a CSV attachment.

    Reads the ``hdfs`` query parameter, looks up the matching ``DataSource``
    by ``format_filename``, downloads ``/datahoop/<hdfs>`` into
    ``<MEDIA_ROOT>/hdfs_download`` and streams the local copy back.

    :param request: Incoming request; ``request.GET['hdfs']`` names the file.
    :return: A ``FileResponse`` carrying the file content, with a
        ``Content-Disposition`` header built from the stored ``file_name``.
    """
    hdfs = request.GET.get('hdfs')
    file_name = DataSource.objects.get(format_filename=hdfs).file_name

    # Build the directory once with os.path.join so the download target and
    # the path opened below agree even if MEDIA_ROOT has no trailing slash.
    download_dir = os.path.join(settings.MEDIA_ROOT, 'hdfs_download')
    if not os.path.isdir(download_dir):
        # Without an existing directory the client would save the file *as*
        # 'hdfs_download' instead of inside it.
        os.makedirs(download_dir)

    client = Client(HDFS_HOST)
    # overwrite=True: a second request for the same file must not fail
    # because the local copy from the first request is still present.
    client.download('/datahoop/' + hdfs, download_dir, overwrite=True)

    # Return the FileResponse itself (it used to be replaced by an empty
    # HttpResponse, so clients got no content). FileResponse closes the file
    # once the response has been sent.
    response = FileResponse(
        open(os.path.join(download_dir, hdfs), 'rb'),
        content_type='application/vnd.ms-csv',
    )
    response['Content-Disposition'] = 'attachment; filename={}.csv'.format(
        file_name.split('.')[0])
    return response
# Import the HDFS client.
from hdfs import Client

client = Client("http://master:9870")
# client.makedirs("/abc/xyz")

# List the HDFS root twice: plain listing first, then with status=True.
x = client.list("/")
y = client.list("/", status=True)

# Print the first element of the second entry and the access time stored in
# its second element.
second_entry = y[1]
print(second_entry[0])
print(second_entry[1]["accessTime"])

# Round trip: push a local file to /abc, then pull it back under a new name.
client.upload("/abc", "HDFSDao.py")
client.download("/abc/HDFSDao.py", "d:/ttt.py")
print("end___")
class ChatBotModel(object):
    """Retrieval-style chatbot backed by an inverted index and a corpus on HDFS.

    The index file is downloaded from HDFS to a local path and parsed into
    ``inverted_index``; answers are read straight from corpus files on HDFS.
    """

    def __init__(self, hadoop_url, hdfs_index_file, local_index_file,
                 corpus_dir, unk_answer='', max_answer_len=1024):
        """Store the configuration; no connection is opened here.

        :param hadoop_url: URL passed to ``Client`` in ``build_connection``.
        :param hdfs_index_file: HDFS path of the inverted-index file.
        :param local_index_file: Local path the index file is downloaded to.
        :param corpus_dir: HDFS directory that holds the corpus files.
        :param unk_answer: Answer returned when nothing matches.
        :param max_answer_len: ``length`` passed to ``client.read`` when an
            answer is fetched.
        """
        self.hadoop_url = hadoop_url
        self.hdfs_index_file = hdfs_index_file
        self.local_index_file = local_index_file
        self.corpus_dir = corpus_dir
        self.max_answer_len = max_answer_len
        self.unk_answer = unk_answer
        self.client = None
        # word -> list of [file_name, query_id, score]
        self.inverted_index = {}

    def build_connection(self):
        """Create the HDFS client for ``hadoop_url``."""
        self.client = Client(self.hadoop_url)

    def fetch_index_file(self):
        """Download the index file from HDFS, replacing any local copy."""
        self.client.download(hdfs_path=self.hdfs_index_file,
                             local_path=self.local_index_file,
                             overwrite=True)

    def load_inverted_index(self):
        """Parse the local index file into ``inverted_index``.

        Each line is ``word<TAB>file:query_id:score<TAB>...``. Parsed entries
        are appended to whatever ``inverted_index`` already contains.

        :raises ValueError: If a non-empty entry does not have the
            ``file:query_id:score`` shape or its numbers cannot be parsed.
        """
        with open(self.local_index_file, 'r', encoding='utf-8') as index_file:
            for line in index_file:
                word, *postings = line.strip().split('\t')
                for posting in postings:
                    if not posting:
                        # Two consecutive tabs yield an empty field; skip it
                        # instead of failing on the unpack below.
                        continue
                    # Split from the right so only the last two fields are
                    # taken as numbers.
                    file_name, query_id, score = posting.rsplit(':', 2)
                    self.inverted_index.setdefault(word, []).append(
                        [file_name, int(query_id), float(score)])

    def prepare(self):
        """Connect, download the index file and load it."""
        self.build_connection()
        self.fetch_index_file()
        self.load_inverted_index()

    def read_corpus_answer(self, file_name, query_id):
        """Read the answer stored at offset ``query_id`` of a corpus file.

        :param file_name: Corpus file name, relative to ``corpus_dir``.
        :param query_id: Offset passed to ``client.read``.
        :return: The first line of the text read from that offset (after
            stripping), or ``None`` when the offset is not inside the file.
        """
        # HDFS paths always use '/', so join with posixpath rather than the
        # platform-dependent os.path.
        import posixpath

        file_path = posixpath.join(self.corpus_dir, file_name)
        file_status = self.client.status(file_path)
        if file_status['length'] <= query_id:
            return None
        with self.client.read(hdfs_path=file_path, offset=query_id,
                              length=self.max_answer_len,
                              encoding='utf-8') as reader:
            return reader.read().strip().split('\n')[0]

    def predict_answer(self, query):
        """Return the best-scoring answer for ``query``.

        The query is tokenised with ``jieba.lcut_for_search``; the scores of
        every indexed ``(file_name, query_id)`` hit by a token are summed and
        the highest total wins.

        :param query: User question.
        :return: The answer text, or ``unk_answer`` when no token is indexed
            or the answer cannot be read.
        """
        scores = {}
        for word in jieba.lcut_for_search(query):
            for file_name, query_id, score in self.inverted_index.get(word, ()):
                candidate = (file_name, query_id)
                scores[candidate] = scores.get(candidate, 0) + score
        if not scores:
            return self.unk_answer

        # On ties max() keeps the candidate that was inserted first.
        best_file_name, best_query_id = max(scores, key=scores.get)
        best_answer = self.read_corpus_answer(best_file_name, best_query_id)
        if best_answer is None:
            return self.unk_answer
        return best_answer
class HDFSUtil:
    """Convenience wrapper around an HDFS ``Client``."""

    def __init__(self, url):
        """Create the underlying client for ``url``."""
        self._client = Client(url)

    def make_dir(self, hdfs_path):
        """Create a directory; nested directories are created recursively.

        :param hdfs_path: Directory to create.
        :return: None
        """
        self._client.makedirs(hdfs_path)

    def delete_hdfs_file(self, hdfs_path):
        """Delete an HDFS file.

        If the path is a directory it must be empty.

        :param hdfs_path: Path to delete.
        :return: None
        """
        self._client.delete(hdfs_path)

    def delete_hdfs_dir(self, hdfs_dir):
        """Delete an HDFS file or directory, recursing into non-empty directories.

        :param hdfs_dir: Path to delete.
        :return: None
        """
        children = self.hdfs_dir_list(hdfs_dir)
        if not children:
            # None (listing failed, e.g. a plain file) or an empty directory.
            print('Delete File: {0}'.format(hdfs_dir))
        else:
            # Drop a trailing '/' so children are not addressed as '//name'.
            parent = hdfs_dir.rstrip('/')
            for file_name in children:
                self.delete_hdfs_dir(parent + '/' + file_name)
            print('Delete Dir: {0}'.format(hdfs_dir))
        self._client.delete(hdfs_dir)

    def upload_to_hdfs(self, local_path, hdfs_path):
        """Upload a local file or directory to HDFS.

        Missing target directories are created automatically.

        :param local_path: Local source path.
        :param hdfs_path: Remote destination path.
        :return: None
        """
        self._client.upload(hdfs_path, local_path, cleanup=True)

    def download_from_hdfs(self, hdfs_path, local_path):
        """Download an HDFS file or directory, overwriting the local target.

        :param hdfs_path: Remote source path.
        :param local_path: Local destination path.
        :return: None
        """
        self._client.download(hdfs_path, local_path, overwrite=True)

    def write_to_hdfs(self, hdfs_path, data, overwrite=False, append=True):
        """Write ``data`` to an HDFS file.

        Append (the default): ``overwrite=False, append=True``.
        Replace: ``overwrite=True, append=False``.
        ``overwrite`` and ``append`` must be mutually exclusive.

        :param hdfs_path: Remote file path.
        :param data: Content to write.
        :param overwrite: Whether to replace an existing file.
        :param append: Whether to append to an existing file.
        :return: None
        """
        if not self._client.content(hdfs_path, strict=False):
            print('File Not exist in HDFS')
            # Appending to a missing file cannot succeed; create it instead
            # so the default call works on a fresh path.
            append = False
        self._client.write(hdfs_path, data, overwrite=overwrite, append=append)

    def move_or_rename(self, hdfs_src_path, hdfs_dst_path):
        """Move or rename a file.

        :param hdfs_src_path: Current path.
        :param hdfs_dst_path: New path.
        :return: None
        """
        self._client.rename(hdfs_src_path, hdfs_dst_path)

    def hdfs_dir_list(self, hdfs_path):
        """List the entries directly under a directory.

        :param hdfs_path: Directory to list.
        :return: List of entry names, or ``None`` when listing raises
            ``HdfsError`` (for example because the path is not a directory).
        """
        try:
            return self._client.list(hdfs_path, status=False)
        except HdfsError:
            return None
if __name__ == '__main__':
    # Demo: download a file from HDFS into the working directory, then
    # delete the remote copy.
    hdfs_ip = "192.168.146.133"
    hdfs_version = 3
    hdfs_root = "~/test"
    filepath = r"C:\Users\daqige\PycharmProjects\newLeetCode\convert.py"

    # NameNode HTTP port: 9870 on Hadoop 3.x, 50070 on Hadoop 2.x. A port
    # must stay below 65536, so 90070 could never be reached.
    hdfs_port = 9870 if hdfs_version == 3 else 50070
    hdfs_addr = "http://" + hdfs_ip + ":" + str(hdfs_port)
    client = Client(hdfs_addr)

    # Earlier steps of the demo, kept disabled:
    #   client.makedirs(hdfs_root)            # create the directory
    #   print(client.list("/"))
    #   client.upload(hdfs_root, filepath)    # upload the file
    #   print(client.list(hdfs_root))
    #   client.rename(hdfs_root + "/convert.py",
    #                 hdfs_root + "/ubuntu.py")   # rename the file
    #   print(client.list(hdfs_root))

    print("下载文件")  # "download file"
    client.download(hdfs_root + "/ubuntu.py", ".")
    print(os.listdir("."))

    print("删除文件")  # "delete file"
    client.delete(hdfs_root + "/ubuntu.py")
    print(client.list(hdfs_root))
# Fetch the status of the target path and show it.
info = client.status(file_name, strict=False)
print(info)

# Write the file, replacing whatever is there.
client.write(file_name, data="hello hdfs !", overwrite=True)

# Write again, this time appending to the existing content.
client.write(file_name, data="hello way !", overwrite=False, append=True)

# Read the file back as UTF-8 text and print it.
with client.read(file_name, encoding='utf-8') as reader:
    content = reader.read()
    print(content)

# Download the file to the local path, overwriting a local copy.
client.download(file_name, loacl_file_name, overwrite=True)

# Upload the local copy under a new remote name.
upload_target = file_name + '111'
client.upload(upload_target, loacl_file_name, cleanup=True)

# Delete the second file, then rename the first one to that name.
client.delete(file_name2)
client.rename(file_name, file_name2)

# Print every entry found under the directory.
files = client.list(file_dir, status=False)
for file in files:
    print(file)
class RF_HDFS(object):
    """HDFS client wrapper that reports every failure as ``HdfsLibraryError``.

    ``connect_and_login`` builds the underlying ``Client``; the remaining
    methods forward to it and convert any exception it raises.
    """

    def __init__(self):
        self.client = None
        self.directory = None
        # Defined up front so checkConnectionStatus() and close() are safe to
        # call before connect_and_login().
        self.session = None
        self.connectionStatus = False

    def _call(self, method_name, *args, **kwargs):
        """Invoke ``self.client.<method_name>`` and wrap any failure."""
        try:
            return getattr(self.client, method_name)(*args, **kwargs)
        except Exception as exception:
            # HdfsError is handled here as well: it was converted the same
            # way as every other exception.
            raise HdfsLibraryError(str(exception))

    def connect_and_login(self, **kwargs):
        """Authenticate and create the HDFS client.

        Recognised keyword arguments: ``host``, ``port``, ``kdc``, ``user``,
        ``password``, ``root``, ``proxy`` and ``timeout``; each defaults to
        ``None`` when omitted.

        :return: The newly created ``Client``.
        :raises HdfsLibraryError: If authentication or client creation fails.
        """
        import requests

        host = kwargs.get('host')
        port = kwargs.get('port')
        # kdc used to stay unbound when it was not supplied.
        kdc = kwargs.get('kdc')
        user = kwargs.get('user')
        password = kwargs.get('password')
        root = kwargs.get('root')
        proxy = kwargs.get('proxy')
        timeout = kwargs.get('timeout')

        self.session = requests.Session()
        adapter = requests.adapters.HTTPAdapter(pool_maxsize=0)
        self.session.mount('http://', adapter)
        self.session.mount('https://', adapter)
        self.session.headers.update({'Connection':'Keep-Alive'})

        self.connectionStatus = False
        try:
            if timeout is not None:
                # Leave an omitted timeout as None instead of failing on
                # int(None).
                timeout = int(timeout)
            url = "http://" + host + ":" + str(port)
            # NOTE(review): WebHDFS is defined outside this block; it
            # presumably returns a "name=value" cookie string or None.
            hdfsLogin = WebHDFS(url, kdc)
            cookieStr = hdfsLogin.authenticate(user, password)
            if cookieStr is not None:
                cookieList = cookieStr.split('=', 1)
                cookieDict = {cookieList[0]: cookieList[1]}
                requests.utils.add_dict_to_cookiejar(self.session.cookies,
                                                     cookieDict)
            self.client = Client(url, root=root, proxy=proxy,
                                 timeout=timeout, session=self.session)
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        self.connectionStatus = True
        return self.client

    def checkConnectionStatus(self):
        """Return True once ``connect_and_login`` has succeeded."""
        return self.connectionStatus

    def list_dir(self, directory):
        """List ``directory`` (the client root when None) with ``status=True``."""
        if directory is not None:
            return self._call('list', directory, status=True)
        return self._call('list', self._call_root(), status=True)

    def list_names(self, directory):
        """List ``directory`` (the client root when None) with ``status=False``."""
        if directory is not None:
            return self._call('list', directory, status=False)
        return self._call('list', self._call_root(), status=False)

    def _call_root(self):
        """Return ``self.client.root``, wrapping any failure."""
        try:
            return self.client.root
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def upload(self, remote_path, local_path, overwrite=False, permission=None):
        """Upload ``local_path`` to ``remote_path``.

        :return: Whatever ``Client.upload`` returns.
        :raises HdfsLibraryError: On failure; for ``HdfsError`` only the
            first line of the message is kept.
        """
        try:
            # overwrite is passed by keyword so it cannot land on a different
            # positional parameter of Client.upload.
            return self.client.upload(remote_path, local_path,
                                      overwrite=overwrite,
                                      permission=permission)
        except HdfsError as hdfsError:
            # For some reason this exception includes the entire stack trace
            # after the error message, so only the first line is returned.
            lines = str(hdfsError).splitlines()
            raise HdfsLibraryError(lines[0] if lines else '')
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def download(self, remote_path, local_path, overwrite=False):
        """Download ``remote_path`` to ``local_path``.

        :return: Whatever ``Client.download`` returns.
        """
        return self._call('download', remote_path, local_path,
                          overwrite=overwrite)

    def mkdir(self, directory, permission):
        """Create ``directory`` with the given permission."""
        self._call('makedirs', directory, permission=permission)

    def rmdir(self, directory):
        """Recursively delete ``directory``.

        :raises HdfsLibraryError: If the client reports the path as missing.
        """
        if self._call('delete', directory, recursive=True) is False:
            raise HdfsLibraryError(
                "Directory does not exist: %r" % (directory,))

    def rename(self, src_file, dst_file):
        """Rename ``src_file`` to ``dst_file``."""
        self._call('rename', src_file, dst_file)

    def delete(self, file):
        """Delete ``file``.

        :raises HdfsLibraryError: If the client reports the path as missing.
        """
        if self._call('delete', file) is False:
            raise HdfsLibraryError("File does not exist: %r" % (file,))

    def set_time(self, file, mod_time):
        """Call ``Client.set_times`` with ``-1`` followed by ``mod_time``."""
        self._call('set_times', file, -1, mod_time)

    def set_owner(self, file, owner, group):
        """Change the owner and group of ``file``."""
        self._call('set_owner', file, owner=owner, group=group)

    def set_permission(self, file, permission):
        """Change the permission of ``file``."""
        self._call('set_permission', file, permission=permission)

    def set_acl(self, file, aclspec):
        """Apply ``aclspec`` to ``file``."""
        self._call('set_acl', file, aclspec=aclspec)

    def status(self, path):
        """Return ``Client.status(path)``."""
        return self._call('status', path)

    def checksum(self, path):
        """Return ``Client.checksum(path)``."""
        return self._call('checksum', path)

    def close(self):
        """Close the HTTP session, if one was opened."""
        if self.session is not None:
            self.session.close()