def get_hdfs_client(env="local"): master, slave = get_env(env) try: client = Client(master) client.list("/") except HdfsError: client = Client(slave) return client
def get_rtree(date,
              hdfs_rtree_path="/user/hadoop/rtree/shanghai/%s-%s/%s.%s",
              local_path="/home/hadoop/data/downloaded/rtree/%s-%s/"):
    year, month, day = date.split('-')
    # guarantee there exists a local directory for the worker nodes
    path = local_path % (year, month)
    if not os.path.exists(path):
        os.makedirs(path)
    client = Client(QuerierParallel.master_hdfs_path, root="/", timeout=100, session=False)
    client.download(hdfs_rtree_path % (year, month, day, "data"), path, overwrite=True)
    client.download(hdfs_rtree_path % (year, month, day, "index"), path, overwrite=True)
    # construct the rtree
    rtree_properties = index.Property()
    rtree_properties.dat_extension = 'data'
    rtree_properties.idx_extension = 'index'
    rtree = index.Index(path + day, properties=rtree_properties)
    return rtree
def generate_temp_files(need_certificate=NEED_CERTIFICATE):
    if need_certificate:
        with krbcontext(using_keytab=True, keytab_file=KEYTAB_PATH, principal=PRINCIPAL):
            for node in HDFS.NODES:
                try:
                    hdfs_client = KerberosClient(node)
                    hdfs_client.download(HDFS.REMOTE_PATH, HDFS.LOCAL_PATH,
                                         n_threads=HDFS.THREAD_NUM)
                except Exception as err:
                    logging.info(err)
                else:
                    return
            logging.error("Failed to download remote HDFS file.")
            raise Exception("Failed to download remote HDFS file.")
    else:
        for node in HDFS.NODES:
            try:
                hdfs_client = Client(node)
                hdfs_client.download(HDFS.REMOTE_PATH, HDFS.LOCAL_PATH,
                                     n_threads=HDFS.THREAD_NUM)
            except Exception as err:
                logging.info(err)
            else:
                return
        logging.error("Failed to download remote HDFS file.")
        raise Exception("Failed to download remote HDFS file.")
def get(self, request):
    _hdfsName = request.GET.get("hdfsName", "46eccfa2-1c56-11e8-a752-1008b1983d21")
    _hdfsPath = os.path.join("/datahoop/", _hdfsName)
    # print(_hdfsPath)
    try:
        # Connect to HDFS and read the file
        cli = Client(settings.HDFS_HOST)
        fileName = cli.list(_hdfsPath)[1]
        # print("filename:", fileName)
        _hdfsPath = os.path.join(_hdfsPath + "/", fileName)
        # print(_hdfsPath)
        try:
            with cli.read(_hdfsPath, length=2000, encoding="gbk") as f:
                datas = f.read()
        except UnicodeDecodeError:
            with cli.read(_hdfsPath, length=2000, encoding="utf8") as f:
                datas = f.read()
        # Normalize line endings and split the text into a list of rows
        datas = re.sub("\r\n", "\n", datas)
        logger.debug(datas)
        datas = datas.strip('"').split('\n')
        content = []
        for i in datas:
            content.append(i.strip('"').split(","))
    except HdfsError:
        return Response(data={"error": "File not found, or the file encoding is not supported"},
                        status=status.HTTP_400_BAD_REQUEST)
    return Response(data={"data": content}, status=status.HTTP_200_OK)
def put_to_hdfs(result_file):
    client = Client("http://192.168.53.30:50070")
    if client.status('/tmp/result.csv', strict=False):
        client.delete('/tmp/result.csv')
        client.upload('/tmp', result_file)
    else:
        client.upload('/tmp', result_file)
def get(self, request):
    _hdfsName = request.GET.get("hdfsName", "46eccfa2-1c56-11e8-a752-1008b1983d21")
    _hdfsPath = os.path.join("/datahoop/", _hdfsName)
    obj = DataSource.objects.get(format_filename=_hdfsName)
    # print(_hdfsPath)
    try:
        # Connect to HDFS and read the file
        cli = Client(settings.HDFS_HOST)
        try:
            with cli.read(_hdfsPath, encoding="gbk") as f:
                datas = f.read()
        except UnicodeDecodeError:
            with cli.read(_hdfsPath, encoding="utf8") as f:
                datas = f.read()
    except HdfsError:
        return Response(data={"error": "File not found, or the file encoding is not supported"},
                        status=status.HTTP_400_BAD_REQUEST)
    response = HttpResponse(content_type='text/csv')  # was 'csv/plain', which is not a valid MIME type
    response['Content-Disposition'] = 'attachment; filename={0}'.format(obj.file_name)
    response.write(datas)
    return response
def download_parquet_from_hdfs_dir(parquet_dir, local_dir, hdfs_ip, hdfs_port=50070):
    """
    Download all parquet files in an HDFS directory to a local directory.
    :param parquet_dir: HDFS directory that holds the parquet files, e.g. '/data/'
    :param local_dir: local target directory, e.g. '/data_gen/'
    :param hdfs_ip: HDFS namenode IP
    :param hdfs_port: WebHDFS port, default 50070
    :return:
    """
    import os
    from hdfs.client import Client

    client = Client(f'http://{hdfs_ip}:{hdfs_port}')
    parquet_list = client.list(parquet_dir)
    print(parquet_list)
    for p in parquet_list:
        if p.endswith('.parquet'):
            print(f'downloading {os.path.join(parquet_dir, p)}')
            with client.read(os.path.join(parquet_dir, p)) as reader:
                data = reader.read()
            if not os.path.exists(local_dir):
                os.makedirs(local_dir)
            with open(os.path.join(local_dir, p), 'wb') as f:
                f.write(data)
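# Example invocation of download_parquet_from_hdfs_dir; the namenode address and
# both paths are placeholder assumptions, not values from the original code.
download_parquet_from_hdfs_dir('/data/parquet/', './parquet_local', '192.168.0.1', hdfs_port=50070)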
def run_hive(conf: ConfigData, the_date: str):
    a_client = Client(conf.hdfs_ip())  # "http://10.2.201.197:50070"
    conn = connect(host=conf.hive_ip(), port=conf.hive_port(),
                   auth_mechanism=conf.hive_auth(), user=conf.hive_user())
    cur = conn.cursor()

    print("Start\n")

    the_date = StrTool.get_the_date_str(the_date)  # "20181101"
    # hdfs_dir_bl
    root_path = str(pathlib.PurePosixPath(conf.get_hdfs_path()).joinpath(the_date))
    file_name = str(pathlib.PurePosixPath(root_path).joinpath(conf.get_file_name(the_date)))
    # "/data/posflow/allinpay_utf8_zc/20181101/"
    # 20181101_loginfo_rsp_bl_new.csv
    # 20181101_rsp_agt_bl_new.del
    # 20181101_rxinfo_rsp_bl.txt

    table_name = conf.get_table_name()

    if MyHdfsFile.isfile(a_client, file_name):
        sql = 'LOAD DATA INPATH \'' + file_name + '\' INTO TABLE ' + table_name  # 'test.t1_trxrecprd_v2_zc'
        # '\' OVERWRITE INTO TABLE test.t1_trxrecprd_v2_bl2'
        print("OK" + " " + sql + "\n")
        cur.execute(sql)  # , async=True)

    cur.close()
    conn.close()
def run_hive(conf: ConfigData, the_date: str):
    client = Client(conf.hdfs_ip())  # "http://10.2.201.197:50070"
    conn = connect(host=conf.hive_ip(), port=conf.hive_port(),
                   auth_mechanism=conf.hive_auth(), user=conf.hive_user())
    cur = conn.cursor()

    the_date = StrTool.get_the_date_str(the_date)  # "20181101"
    root_path = conf.get_data("hdfs_dir_zc")  # "/data/posflow/allinpay_utf8_zc/"
    file_ext3 = conf.get_data("file_ext3")  # _loginfo_rsp.txt      -> 20181101_loginfo_rsp.txt
    file_ext4 = conf.get_data("file_ext4")  # _loginfo_rsp_agt.txt  -> 20181101_loginfo_rsp_agt.txt
    file_ext5 = conf.get_data("file_ext5")  # _rxinfo_rsp.txt       -> 20181101_rxinfo_rsp.txt
    file_ext6 = conf.get_data("file_ext6")  # _rxinfo_rsp_agt.txt   -> 20181101_rxinfo_rsp_agt.txt

    print("Start\n")

    file3 = str(pathlib.PurePosixPath(root_path).joinpath(the_date + file_ext3))
    file4 = str(pathlib.PurePosixPath(root_path).joinpath(the_date + file_ext4))
    file5 = str(pathlib.PurePosixPath(root_path).joinpath(the_date + file_ext5))
    file6 = str(pathlib.PurePosixPath(root_path).joinpath(the_date + file_ext6))

    f_list = [file3, file4, file5, file6]
    t_list = ["hive_table3", "hive_table4", "hive_table5", "hive_table6"]

    for n in range(0, 4):
        if MyHdfsFile.isfile(client, f_list[n]):
            sql = 'LOAD DATA INPATH \'' + f_list[n] + '\' INTO TABLE ' + conf.get_data(t_list[n])  # 'test.t1_trxrecprd_v2_zc'
            # '\' OVERWRITE INTO TABLE test.t1_trxrecprd_v2_bl2'
            print("OK" + " " + sql + "\n")
            cur.execute(sql)  # , async=True)

    cur.close()
    conn.close()
def get_client(host, use_kerberos):
    if use_kerberos:
        from hdfs.ext.kerberos import KerberosClient
        return KerberosClient(host)
    else:
        from hdfs.client import Client
        return Client(host)
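# Usage sketch for get_client; the WebHDFS URL below is a placeholder. With
# use_kerberos=True the KerberosClient authenticates via the ambient Kerberos
# ticket cache, while False returns a plain (insecure) WebHDFS Client.
client = get_client("http://namenode.example.com:50070", use_kerberos=False)
print(client.list("/"))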
def get_data(file_path):
    HDFSUrl = "http://192.168.0.201:50070"
    client = Client(HDFSUrl, root='/')
    with client.read(file_path, buffer_size=1024, delimiter='\n', encoding='utf-8') as reader:
        data = [line.strip().split() for line in reader]
    print("data", data[0:2])
    return data
def HDFS_cd(self, hdfs_path):
    """
    Change the current working directory; in practice this simply reconnects
    with the new directory as the client root.
    """
    m_NewDirectory = Path(os.path.join(self.__m_HDFS_WebFSDir__, hdfs_path)).as_posix()
    self.__m_HDFS_WebFSDir__ = m_NewDirectory
    self.__m_HDFS_Handler__ = Client(self.__m_HDFS_WebFSURL__, self.__m_HDFS_WebFSDir__, session=None)
def connect(self, host, port):
    conn_url = "http://{}:{}".format(host, port)
    try:
        self.client = Client(conn_url)
        return True
    except Exception as e:
        print("Connect Failed: {} - {}".format(type(e), e))
        return False
def _get_client(self, addr, port):
    if not CACHE or addr not in self._clients:
        cli = Client('http://%s:%s' % (str(addr), str(port)))
        if CACHE:
            self._clients.update({addr: cli})
    else:
        cli = self._clients.get(addr)
    return cli
def make_directory(hdfs_address, directory_path, directory_name):
    '''
    Description:
        This function helps users to create a directory in HDFS.
    Parameters:
        - hdfs_address: Hadoop master node IP address
        - directory_path: the path under which the user wants to create a directory
        - directory_name: the directory name
    Returns:
        None
    '''
    client = Client('http://' + hdfs_address)
    client.makedirs(directory_path + directory_name)
def dataframe_write_to_hdfs(hdfs_path, dataframe):
    """
    Write a pandas DataFrame to HDFS as a tab-separated text file.
    :param hdfs_path: destination path on HDFS
    :param dataframe: the DataFrame to write
    :return:
    """
    HDFSUrl = "http://192.168.0.201:50070"
    client = Client(HDFSUrl, root='/')
    client.write(hdfs_path,
                 dataframe.to_csv(header=False, index=False, sep="\t"),
                 encoding='utf-8', overwrite=True)
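# Minimal usage sketch for dataframe_write_to_hdfs; the destination path is a
# placeholder, and pandas is assumed to be installed.
import pandas as pd

sample_df = pd.DataFrame({"id": [1, 2], "value": ["a", "b"]})
dataframe_write_to_hdfs("/tmp/sample.tsv", sample_df)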
def test_hdfs_files():
    project = utils.get_test_project()
    head_ip = project.cluster.head.ip
    hdfs = Client('http://%s:50070' % head_ip)

    root_dirs = hdfs.list('/')
    assert 'spark' in root_dirs

    spark_dirs = hdfs.list('/spark')
    assert 'spark-1.4.1-bin-hadoop2.6.tgz' in spark_dirs
def get_hadoop_connection(cls, host):
    try:
        client = Client(host, root='/', timeout=10000)
        client.list('/')
    except Exception as e:
        try:
            log_handler.log.info('get query data error from hadoop 01 -----{}'.format(e))
            host = host.replace('01', '02')
            client = Client(host, root='/', timeout=10000)
            client.list('/')
        except Exception as e:
            try:
                log_handler.log.info('get query data error from hadoop 02 -----{}'.format(e))
                host = host.replace('02', '03')
                client = Client(host, root='/', timeout=10000)
                client.list('/')
            except Exception as e:
                client = None
                log_handler.log.info('get query data error from hadoop -----{}'.format(e))
    return client
def test_hdfs_dirs():
    project = utils.get_test_project()
    head_ip = project.cluster.head.ip
    hdfs = Client('http://%s:50070' % head_ip)

    users_dirs = hdfs.list('/user')
    assert 'hive' in users_dirs
    assert 'impala' in users_dirs

    users_dirs = hdfs.list('/user/hive')
    assert 'warehouse' in users_dirs
def save_page_hdfs(ipPort, file_path, contents):
    """Save page source to HDFS.
    :param ipPort: HDFS connection address
    :param file_path: file path
    :param contents: page contents
    :return: None
    """
    client = Client(ipPort)
    with client.write(file_path) as writer:
        writer.write(bytes(contents, encoding='utf8'))
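# Example call to save_page_hdfs; the namenode address, target path, and page
# content are illustrative assumptions only.
save_page_hdfs("http://namenode.example.com:50070", "/pages/example.html",
               "<html><body>hello</body></html>")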
def get_data_hdfs(file_path):
    HDFSUrl = "http://192.168.0.201:50070"
    client = Client(HDFSUrl, root='/')
    with client.read(file_path, buffer_size=1024, delimiter='\n', encoding='utf-8') as reader:
        data = [line.strip().split(',') for line in reader]
    print("data", data[0:5])
    df = pd.DataFrame(data[1:], columns=data[0])
    return df
def hdfs_file2points(path):
    client = Client(QuerierParallel.master_hdfs_path, root="/", timeout=100, session=False)
    points = []
    with client.read(path) as f:
        for line in f:
            info = line.strip('\n').split('\t')
            points.append([float(info[0]), float(info[1])])
    return points
def run_hdfs_test(conf: ConfigData):
    # the_date = conf.test_date()  # "20181101"
    client = Client(conf.hdfs_ip())  # "http://10.2.201.197:50070"
    # root_path = conf.unzip_dir(is_baoli)  # 'D:/DATA/UNZIP/'
    # dest_dir = conf.hdfs_dir_syb(is_baoli)
    # file_pre = conf.file_pre1()  # "t1_trxrecord_"
    # file_ext = conf.file_ext2()  # "_V2.csv"
    # client.upload('/shouyinbao/', "/home/testFolder/logflow/bl_shouyinbao/UTF8/20181101/9999100000/t1_trxrecord_20181101_V2.csv", cleanup=True)
    dat = client.list('/', status=False)
    print(dat)
def read_accesslog_from_hdfs(self):
    # The real-time click log stream is flushed to storage once every 5 click records
    client = Client("http://localhost:50070")
    file_names = client.list("/hadoop_file")
    ss = ""
    for file_name in file_names:
        with client.read("/hadoop_file/" + file_name, encoding="utf-8") as reader:
            for line in reader:
                # skip test data
                if line.startswith("filed1"):
                    continue
                ss += line
def do():
    global csv_path
    client = Client(hdfshost)
    file_list = client.list(csv_path)
    print(file_list)
    for file in file_list:
        if file.endswith(".csv"):
            csv_path = csv_path + file
            # Read the CSV from HDFS and write it locally under the same name
            with open("./异常临界值local.csv", 'w', encoding='GB2312') as local:
                with client.read(csv_path, encoding='GB2312') as hdfs:
                    for line in hdfs:
                        local.write(line.strip('\n'))
def test_hdfs_dirs():
    project = utils.get_test_project()
    nn_ip = project.cluster.head.ip
    hdfs = Client('http://%s:50070' % nn_ip)
    assert hdfs

    root_dirs = hdfs.list('/')
    assert 'tmp' in root_dirs
    assert 'user' in root_dirs

    users_dirs = hdfs.list('/user')
    assert project.settings['USERNAME'] in users_dirs
def generate_files(
    date,
    path="user/hadoop/trajectory/sim_trajectory_per_day/shanghai/%s-%s/%s",
):
    year, month, day = date.split('-')
    if date in QuerierParallel.files:
        return
    else:
        client = Client(QuerierParallel.master_hdfs_path, root="/", timeout=100, session=False)
        QuerierParallel.files.update({date: client.list(path % (year, month, day))})
def read(dir_path, header):
    client = Client("http://127.0.0.1:50070")
    log_data = []
    for date_dir in client.list(dir_path):
        for log_file in client.list(dir_path + '/' + date_dir):
            with client.read(dir_path + '/' + date_dir + '/' + log_file) as fs:
                for line in fs:
                    row = line.strip().split('&')
                    if row != ['']:
                        tmp = []
                        for field in row:
                            tmp.append(field.split('=')[1])
                        log_data.append(tmp)
    return pd.DataFrame(log_data, columns=header)
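# Usage sketch for read(): it expects dir_path to contain one sub-directory per
# date whose files hold '&'-separated key=value records; the path and column
# names below are assumptions for illustration only.
log_df = read('/logs/access', header=['uid', 'page', 'ts'])
print(log_df.head())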
def read_corpus():
    qList = []
    # keyword list for the questions
    qList_kw = []
    aList = []
    lines = []
    client = Client("http://localhost:50070")
    with client.read("/corpus/q_a.csv", encoding='utf-8') as reader:
        for line in reader:
            # split each CSV line so t[0]/t[1] below refer to question and answer
            lines.append(line.strip().split(','))
    for t in lines:
        qList.append(t[0])
        qList_kw.append(seg.cut(t[0]))
        aList.append(t[1])
    return qList_kw, qList, aList
def hdfs_file_compressor(file_path, local_path, model_id, dir_path):
    try:
        file_path_dict = file_path.split('/', 3)[-1]
        # Connect to HDFS from Python
        client = Client(hdfs_url, root="/", timeout=10000, session=False)
        # Fetch the remote HDFS files
        get_from_hdfs(client=client, hdfs_path=file_path_dict, local_path=local_path)
        file_list = file_name(local_path)
        for file in file_list:
            separated_file_name = file.split('/')
            if separated_file_name[-2] == 'model_file%s' % model_id:
                continue
            file_name_local = separated_file_name[-1]
            local_path_file = local_path + "/" + file_name_local
            # Compress the .h5 model file
            model = compress(load_model(file), acceptable_error=0.001)
            # Save the compressed model
            model.save(local_path_file)
            local_path_folder = local_path + "/" + separated_file_name[-2]
            shutil.rmtree(local_path_folder)
        # Upload the compressed files
        put_to_hdfs(client=client, local_path=local_path, hdfs_path=file_path_dict)
        compress_file_path = file_path + '/model_file%s' % model_id
        sql = update_sql % (compress_file_path, model_id)
        # Update the database
        compress_file_path_update = mysql.query(dir_path, sql, work_path='')
    except Exception as e:
        print('output->failed', flush=True)
        print(e)
    # Remove the local files
    shutil.rmtree(local_path)
    print('output->success', flush=True)