def __init__(self, config, expiration=30000):
    """Set up the REST/HDFS clients and obtain a REST server token.

    Args:
        config: object exposing ``rest_server_url``, ``PAI_username``,
            ``PAI_password`` and ``webhdfs_url`` attributes.
        expiration: expiration value sent along with the token request.
    """
    from urllib.parse import urlencode

    def get_token(username, password, expiration):
        """Request a token from the REST server, retrying until HTTP 200.

        Args:
            username: PAI user name.
            password: PAI password.
            expiration: expiration time of the token.

        Returns:
            The ``token`` field of the JSON response.
        """
        # urlencode escapes reserved characters (&, =, +, spaces, ...) so
        # credentials containing them cannot corrupt the form body.
        form_body = urlencode({
            'username': username,
            'password': password,
            'expiration': str(expiration),
        })
        loop_count = 0
        while True:
            # Wait 0s, 1s, 2s, ... between attempts.
            time.sleep(loop_count)
            loop_count += 1
            http_object = self.http.request(
                'POST',
                self.rest_server_url + 'token',
                headers={
                    'Content-Type': 'application/x-www-form-urlencoded',
                },
                body=form_body)
            if http_object.status == 200:
                return json.loads(
                    http_object.data.decode('utf-8'))['token']
            print(http_object.status, http_object.data)

    self.rest_server_url = config.rest_server_url  # rest server url
    self.http = urllib3.PoolManager()  # urllib3 http
    self.token = get_token(config.PAI_username, config.PAI_password,
                           expiration)  # rest server token
    self.hdfs_client = Client(config.webhdfs_url)  # hdfs web url
def get_data():
    """Read the first part file of the area dataset from HDFS.

    Prints the listing of ``/huiqu/common/area.txt`` and returns the UTF-8
    decoded content of its ``part-00000`` file under the key ``"data"``.
    """
    dataset_dir = "/huiqu/common/area.txt"
    hdfs = Client("http://t3.dev:50070", "hadoop")
    print(hdfs.list(dataset_dir))
    with hdfs.read(dataset_dir + "/part-00000") as reader:
        raw = reader.read()
        return {"data": raw.decode('utf8')}
def download(hdfs_path, local_path):
    """Download ``hdfs_path`` to ``local_path`` unless the local path exists.

    When ``local_path`` already exists a message is printed and nothing is
    downloaded.
    """
    # Initialise the HDFS connection.
    hdfs = Client('http://fisher.lazybone.xyz:9001', root='/')
    if not os.path.exists(local_path):
        hdfs.download(hdfs_path=hdfs_path, local_path=local_path)
        return
    print('文件已存在')
# generate_job_and_outputtable(schedules_list)
#
# For every (work_id, schedule_id, cron_type) in ``schedules_list`` this
# fetches the schedule JSON from the pony API (sleeping one second after each
# request) and walks its "execute_DAG" entries. Entries whose configured
# command starts with "data_pipeline" are processed:
#   * when the config has "args", "isstreaming" is set to the string form of
#     "isStreaming"/"isstreaming", and the "data_pipeline" dependency and the
#     "KafkaCheckpoint" argument are popped;
#   * a tuple describing the job is appended to ``job_list``;
#   * each output is deep-copied into ``outtable_list`` and then receives a
#     "dayu_full_name" derived from "dayu_fullname" with a "_k8spre" suffix
#     (hive / oss / kafka are special-cased), and its "dayu_id" is popped;
#   * the resulting config is written as JSON to
#     /tmp/ting.wu/k8s_press/<job_id>.json on HDFS.
# Any exception raised while handling one DAG entry is printed and that entry
# is skipped. Returns ``(job_list, outtable_list)``.
#
# NOTE(review): ``str.split`` always returns at least one element, so the
# ``if not dayu_fullnames`` guard can never raise -- confirm the intended check.
# NOTE(review): the source is collapsed onto two physical lines; restore the
# original line breaks before relying on the nesting of these statements.
def generate_job_and_outputtable(schedules_list): job_list = [] outtable_list = [] client = Client("http://emr2-header-1.ipa.aidigger.com:50070", timeout=30) for work_id, schedule_id, cron_type in schedules_list: schedule_url = "https://pony.aidigger.com/api/v1/schedules/{}".format( schedule_id) job_infos = requests.get(schedule_url).json()["data"] time.sleep(1) owner = job_infos["owner"] print("schedule_url: " + schedule_url) for job_info in job_infos["execute_DAG"]: try: if job_info.get("job_info") \ and job_info["job_info"]["configs"].get("command","") \ and job_info["job_info"]["configs"]["command"].startswith("data_pipeline") : #or job_info["job_info"]["configs"]["command"].startswith("data_connector") config = job_info["job_info"]["configs"] if "args" in config.keys(): config['args']["isstreaming"] = str( config['args']["isStreaming"] if "isStreaming" in config['args'].keys( ) else config['args']["isstreaming"]) config['args'].get("spark_conf", {}).get( "dependency", {}).pop("data_pipeline", None) config['args'].pop("KafkaCheckpoint", None) job_list.append( (work_id, config["job_id"], job_info["name"], job_info["job_info"]["configs"]["command"], "1G", "0.3", owner, cron_type)) for output in config["output"]: outtable_list.append(deepcopy(output)) dayu_fullnames = output["dayu_fullname"].split(":") if not dayu_fullnames: raise Exception("error!!") if dayu_fullnames[0].lower() == "hive": dayu_fullnames[1] = "dayu_temp" output["dayu_full_name"] = ":".join( dayu_fullnames) + "_k8spre" elif dayu_fullnames[0].lower().startswith("oss"): output["dayu_full_name"] = output[ "dayu_fullname"][:-1] + "_k8spre/" elif dayu_fullnames[0].lower().startswith("kafka"): output["dayu_full_name"] = output[ "dayu_fullname"] + "_k8spre" output["dayu_full_name"] = output[ "dayu_full_name"].replace(".", "_") else: output["dayu_full_name"] = output[ "dayu_fullname"] + "_k8spre" output.pop("dayu_id") content = json.dumps(config).encode(encoding='utf-8') 
client.write("/tmp/ting.wu/k8s_press/{}.json".format( config["job_id"]), overwrite=True, data=content) print(" hdfs: /tmp/ting.wu/k8s_press/{}.json".format( config["job_id"])) except Exception as err: print(err) return job_list, outtable_list
def upload(name, file_path, config):
    """Upload a local file into a timestamped data-service directory on HDFS.

    The target is ``<root>/<name>/<ts>/<name>_<ts>.py`` where ``<root>`` is
    ``/data_service`` or ``/<prefix>_data_service`` when ``config`` carries a
    ``"prefix"``, and ``<ts>`` is the current time in milliseconds.

    Args:
        name: logical name used for the directory and file name.
        file_path: local path of the file to upload.
        config: mapping with ``config["hdfs"]["name_node"]`` (HDFS URL) and
            an optional ``"prefix"`` entry.

    Returns:
        The HDFS path the file was uploaded to.
    """
    hdfs_client = Client(url=config["hdfs"]["name_node"])

    env_prefix = config.get("prefix", None)
    if env_prefix is None:
        hdfs_data_service_root = "/data_service"
    else:
        hdfs_data_service_root = "/{0}_data_service".format(env_prefix)
    hdfs_client.makedirs(hdfs_data_service_root)

    timestamp = str(int(round(time.time() * 1000)))
    target_dir = "{0}/{1}/{2}".format(hdfs_data_service_root, name, timestamp)
    target_file_name = "{0}/{1}_{2}.py".format(target_dir, name, timestamp)
    hdfs_client.makedirs(target_dir)

    print("hdfs file name: {0}".format(target_file_name))
    hdfs_client.upload(target_file_name, file_path)
    return target_file_name
def reflect(self, reload_flag = False):
    """Index the HDFS files of this data source into the search index.

    Args:
        reload_flag: when true, the documents already indexed for this data
            source are deleted before indexing again.

    Returns:
        None. Logs an error and returns early when ``self.ds_dict`` is empty.
    """
    if not self.ds_dict:
        si_app.logger.error('error: ds_dict must be provided.')
        return

    ds_name = self.ds_dict['ds_name']
    if reload_flag:
        si_app.delete_doc_from_index_by_datasource(ds_name=ds_name)

    ds_param = self.ds_dict['ds_param']
    tclient = Client(ds_param['hdfs_web_url'])

    path_hdfs = ds_param['root_path']
    # Drop one trailing '/' so further path segments can be appended;
    # endswith() also copes with an empty root path (indexing [-1] would raise).
    if path_hdfs.endswith('/'):
        path_hdfs = path_hdfs[:-1]

    for fd in self.getHDFSFileInfo(tclient, path_hdfs):
        si_app.add_table_content_index(
            ds_name=ds_name,
            table_id=fd['table_name'],
            table_info=json.dumps(fd),
            # str() keeps the join working when a value is not a string.
            table_content_index=' '.join(str(value) for value in fd.values()))
def __init__(self, spark, config, generator):
    """Store the collaborators and build the HDFS client from ``config``.

    The client URL is the ``config["hdfs"]["name_node"]`` entries joined with
    ``";"``; the optional ``"prefix"`` entry is kept as ``env_prefix``.
    """
    super(DataContext, self).__init__()
    self.spark, self.config, self.generator = spark, config, generator
    self.env_prefix = config.get("prefix", None)
    name_nodes = config["hdfs"]["name_node"]
    self.hdfs_client = Client(url=";".join(name_nodes), proxy="joowing")
def main():
    """Upload ``x.html`` to ``/input`` on the local HDFS and print the root listing."""
    hdfs = Client("http://127.0.0.1:50070", root="/", timeout=100,
                  session=False)
    hdfs.upload("/input", "x.html")
    root_entries = hdfs.list("/")
    print(root_entries)
def from_settings(cls, settings):
    """Build an instance whose HDFS client is configured from ``settings``.

    Args:
        settings: mapping providing ``HDFS_MASTER`` and ``HDFS_ADDRESS``; the
            client URL is ``http://<HDFS_MASTER>:<HDFS_ADDRESS>``.

    Returns:
        ``cls(client)``.

    Raises:
        Exception: whatever ``Client`` raises; the error is printed and then
            re-raised instead of surfacing later as an unbound ``client``.
    """
    hdfs_master = settings['HDFS_MASTER']
    hdfs_address = settings['HDFS_ADDRESS']
    url = 'http://' + str(hdfs_master) + ':' + str(hdfs_address)
    try:
        client = Client(url)
    except Exception as e:
        print(e)
        raise
    return cls(client)
def get(self, request):
    """Fetch a stored file from HDFS and return it as a CSV attachment.

    The ``hdfs`` query parameter is the stored (formatted) file name; it is
    looked up in ``DataSource`` to obtain the original file name used for the
    download's ``Content-Disposition`` header.
    """
    hdfs = request.GET.get('hdfs')
    file_name = DataSource.objects.get(format_filename=hdfs).file_name

    # Use one directory for both the download and the read; plain string
    # concatenation with MEDIA_ROOT disagrees with os.path.join when
    # MEDIA_ROOT has no trailing separator.
    download_dir = os.path.join(settings.MEDIA_ROOT, 'hdfs_download')
    os.makedirs(download_dir, exist_ok=True)
    local_file = os.path.join(download_dir, hdfs)

    client = Client(HDFS_HOST)
    # overwrite=True lets the same file be requested more than once.
    client.download('/datahoop/' + hdfs, local_file, overwrite=True)

    # FileResponse streams the file and closes the handle when it is done;
    # it must not be replaced by an empty HttpResponse.
    response = FileResponse(open(local_file, 'rb'),
                            content_type='application/vnd.ms-csv')
    response['Content-Disposition'] = 'attachment; filename={}.csv'.format(
        file_name.split('.')[0])
    return response
def __init__(self, **kwargs):
    """Read HDFS/Hive options from ``kwargs`` and create the HDFS folder.

    Options that are not supplied are stored as ``None`` (``HIVE_AUTO_CREATE``
    falls back to ``False``).
    """
    option = kwargs.get

    self.table_cols_map = {}  # column order per table: {table: (cols, col_default)}
    self.bizdate = bizdate  # business date: the date the crawler was started
    self.buckets_map = {}  # buckets: {table: items}
    self.bucketsize = option('BUCKETSIZE')

    self.client = Client(option('HDFS_URLS'))
    self.dir = option('HDFS_FOLDER')  # folder path
    self.delimiter = option('HDFS_DELIMITER')  # column delimiter
    self.encoding = option('HDFS_ENCODING')  # file encoding

    self.hive_host = option('HIVE_HOST')
    self.hive_port = option('HIVE_PORT')
    self.hive_dbname = option('HIVE_DBNAME')  # database name
    self.hive_auto_create = option('HIVE_AUTO_CREATE', False)  # auto-create hive tables?

    self.client.makedirs(self.dir)
def get(self, request, *args, **kwargs):
    """Export SQL Server tables to HDFS and return the last table as JSON.

    For each table named in ``request.data['name']`` the column names and all
    rows are read, written to a temporary CSV file (header row first),
    uploaded to ``/datahoop`` on HDFS and registered as a ``DataSource``.
    Connection parameters come from the session.

    Returns:
        ``HttpResponse`` with the JSON-encoded rows (header first) of the last
        exported table, or a 400 response when a table name is not a plain
        (optionally dotted) identifier.
    """
    import csv
    import re

    # NOTE(review): kept because other code may read the module-level name.
    global rels
    rels = []

    table_names = request.data.get('name') or []
    # Table names cannot be bound as query parameters, so only plain
    # (optionally schema-qualified) identifiers are accepted.
    identifier = re.compile(r'^[A-Za-z_][A-Za-z0-9_]*(\.[A-Za-z_][A-Za-z0-9_]*)*$')
    for table in table_names:
        if not identifier.match(str(table)):
            return HttpResponse(
                json.dumps({'error': 'invalid table name'}),
                content_type='application/json', status=400)

    session = request.session
    conn = pymssql.connect(database=session['dbdatabase_name'],
                           user=session['username'],
                           password=session['password'],
                           host=session['host'],
                           port=session['port'])
    client = Client(HDFS_HOST)
    obj = DataSource.objects
    try:
        cur = conn.cursor()
        try:
            for table in table_names:
                cur.execute(
                    "select name from syscolumns where id = object_id(%s);",
                    (table,))
                header = [column for row in cur.fetchall() for column in row]

                cur.execute("SELECT * FROM %s" % table)
                rels = [header] + [list(row) for row in cur.fetchall()]

                file_name = table + '.sql'
                format_name = str(uuid.uuid1())
                filepath = settings.MEDIA_ROOT + format_name
                try:
                    # Rows are lists, so they are serialised as CSV text.
                    with open(filepath, 'w', newline='',
                              encoding='utf-8') as writer:
                        csv.writer(writer).writerows(rels)
                    client.upload("/datahoop", filepath)
                    obj.create(file_name=file_name, format_name=format_name,
                               user_id=1)
                finally:
                    if os.path.exists(filepath):
                        os.remove(filepath)
        finally:
            cur.close()
    finally:
        conn.close()
        # Not every client implementation exposes close().
        close_client = getattr(client, 'close', None)
        if callable(close_client):
            close_client()
    return HttpResponse(json.dumps(rels), content_type='application/json')
def delete(self, request, *args, **kwargs):
    """Delete a data source record together with its stored payload.

    ``request.data['file_id']`` selects the ``DataSource``. Records whose
    ``where`` is ``'hdfs'`` have their file removed from ``/datahoop`` on
    HDFS; all others have their document removed from the ``datahoop.data``
    MongoDB collection.
    """
    file_id = request.data.get('file_id')
    data_source = DataSource.objects.get(id=file_id)

    if data_source.where == 'hdfs':
        client = Client(HDFS_HOST)
        client.delete('/datahoop/' + data_source.format_filename,
                      recursive=True)
        data_source.delete()
    else:
        client = pymongo.MongoClient(settings.MONGO_DB_HOST,
                                     settings.MONGO_DB_PORT)
        try:
            db = client.datahoop.data
            # The record is addressed by ``file_id``; the builtin ``id`` was
            # being passed here by mistake.
            obj_id = data_source.obj_id
            data_source.delete()
            db.remove({"_id": ObjectId(obj_id)})
        finally:
            client.close()
    return HttpResponse(content_type='application/json')
def __init__(self, config: dict = None, file: str = 'openpai.json'):
    """Validate the OpenPAI configuration and create the HDFS client.

    When ``config`` is None it is loaded as JSON from ``file``. The config
    must define ``rest_server_socket``, ``hdfs_web_socket``, ``user`` and
    ``password``; both socket values must start with ``http://``.
    """
    if config is None:
        with open(file) as fn:
            config = json.load(fn)

    socket_keys = ['rest_server_socket', 'hdfs_web_socket']
    required_keys = socket_keys + ['user', 'password']
    for key in required_keys:
        assert key in config, '%s is not defined for OpenPAI' % (key)
    for key in socket_keys:
        has_prefix = config[key].startswith('http://')
        assert has_prefix, '%s should have http prefix' % (key)

    self.rest_server_socket = config['rest_server_socket']
    self.hdfs_client = Client(config['hdfs_web_socket'])
    self.config = config
def get(self, request, *args, **kwargs):
    """Export the ``files_datasource`` MySQL table to HDFS and return it as JSON.

    The column names and all rows are read, written to a temporary CSV file
    (header row first), uploaded to ``/datahoop`` on HDFS and registered as a
    ``DataSource``. Connection parameters come from the session.

    Returns:
        ``HttpResponse`` with the JSON-encoded rows, header row first.
    """
    import csv

    table_name = 'files_datasource'
    session = request.session
    # Keyword arguments keep the call valid across PyMySQL versions, and the
    # port stored in the session is now actually used.
    con = pymysql.connect(host=session['host'],
                          user=session['username'],
                          password=session['password'],
                          database=session['database_name'],
                          port=int(session['port']))
    client = Client(HDFS_HOST)
    obj = DataSource.objects
    try:
        cur = con.cursor()
        try:
            cur.execute(
                "select DISTINCT (COLUMN_NAME) from information_schema.COLUMNS"
                " where table_name = %s", (table_name,))
            header = [row[0] for row in cur.fetchall()]

            # table_name is a constant here, so formatting it in is safe.
            cur.execute("SELECT * FROM %s" % (table_name))
            rels = [header] + [list(row) for row in cur.fetchall()]
        finally:
            cur.close()

        file_name = table_name + '.sql'
        format_name = uuid.uuid1()
        filepath = settings.MEDIA_ROOT + str(format_name)
        try:
            # Rows are lists, so they are serialised as CSV text.
            with open(filepath, 'w', newline='', encoding='utf-8') as writer:
                csv.writer(writer).writerows(rels)
            client.upload("/datahoop", filepath)
            obj.create(file_name=file_name, format_name=format_name,
                       user_id=1)
        finally:
            if os.path.exists(filepath):
                os.remove(filepath)
    finally:
        con.close()
        # Not every client implementation exposes close().
        close_client = getattr(client, 'close', None)
        if callable(close_client):
            close_client()
    return HttpResponse(json.dumps(rels), content_type='application/json')
def get(self, request):
    """Delete one of the user's data sources and its collection entries.

    ``file_id`` is read from the query string. For HDFS-backed records the
    file under ``/datahoop`` is removed only when no other ``DataSource``
    shares the same ``format_filename``; other records have their document
    removed from the ``datahoop.data`` MongoDB collection. Matching
    ``Collect`` rows are deleted in every case.

    Returns:
        ``JsonResponse({'status': True})`` on success and
        ``JsonResponse({'status': False})`` when anything fails.
    """
    file_id = request.GET.get('file_id')
    try:
        data_source = DataSource.objects.get(id=file_id)
        where = data_source.where
        print(data_source)
        print(where)

        if where == 'hdfs':
            format_filename = data_source.format_filename
            sharing_count = DataSource.objects.filter(
                format_filename=format_filename).count()
            if sharing_count == 1:
                # Last record pointing at this file: remove the file too.
                client = Client(HDFS_HOST)
                client.delete('/datahoop/' + format_filename, recursive=True)
            data_source.delete()
        else:
            client = pymongo.MongoClient(settings.MONGO_DB_URI)
            try:
                db = client.datahoop.data
                obj_id = data_source.obj_id
                data_source.delete()
                db.remove({"_id": ObjectId(obj_id)})
            finally:
                client.close()

        # Deleting an empty queryset is a no-op, so no existence check needed.
        Collect.objects.filter(file_id=file_id).delete()
        return JsonResponse({'status': True})
    except Exception as err:
        # Report the failure to the caller but keep a trace of the cause.
        print(err)
        return JsonResponse({'status': False})
def __init__(self, config, expiration=30000):
    """Set up the REST/HDFS clients and obtain a REST server token.

    ``config`` supplies ``rest_server_url``, ``PAI_username``,
    ``PAI_password`` and ``webhdfs_url``; ``expiration`` is sent with the
    token request.
    """

    def get_token(username, password, expiration):
        """Request a token from the REST server, retrying until HTTP 200.

        The token endpoint lives at the REST server URL with its last three
        ``/``-separated segments removed. Returns the ``token`` field of the
        JSON response.
        """
        url_parts = self.rest_server_url.split('/')
        rest_server_url_without_namespace = '/'.join(url_parts[:-3]) + '/'
        token_url = rest_server_url_without_namespace + 'token'
        payload = json.dumps({
            'username': username,
            'password': password,
            'expiration': str(expiration)
        })

        attempt = 0
        while True:
            # Wait 0s, 1s, 2s, ... between attempts.
            time.sleep(attempt)
            attempt += 1
            response = self.http.request(
                'POST',
                token_url,
                headers={
                    'Content-Type': 'application/json',
                },
                body=payload)
            if response.status != 200:
                print(response.status, response.data)
                continue
            return json.loads(response.data.decode('utf-8'))['token']

    self.rest_server_url = config.rest_server_url  # rest server url
    self.http = urllib3.PoolManager()  # urllib3 http
    self.token = get_token(config.PAI_username, config.PAI_password,
                           expiration)  # rest server token
    self.hdfs_client = Client(config.webhdfs_url)  # hdfs web url
def conn(self):
    """Return a new HDFS client for ``http://192.168.0.107:11070``."""
    return Client('http://192.168.0.107:11070')
def build_connection(self):
    """Create the HDFS client for ``self.hadoop_url`` and keep it on ``self.client``."""
    hdfs_client = Client(self.hadoop_url)
    self.client = hdfs_client
# NOTE(review): this snippet starts inside a method whose ``def`` line is not
# part of the visible source, and it is collapsed onto one physical line.
#
# Method tail: derives ``rowkey`` from md5 of ``str(user_id) + str(visitTime)``,
# prints it, builds four Mutations (ipaddr, visitTime, user_id, link) under
# ``self.columnFamily`` and writes them with ``self.client.mutateRow``.
#
# Script entry point: creates the 'user_log_info' table helper and calls
# ``createTable()``, lists the files under LOGPATH on HDFS, then for every line
# of every file splits the line on a single space and passes the resulting
# record to ``importData``.
# NOTE(review): ``line.split(" ")`` presumably expects ``str`` lines -- confirm
# what ``client.read`` yields when iterated without an encoding.
rowkey = md5(str(user_id) + str(visitTime)) print(rowkey) mutations = [Mutation(column=self.columnFamily + ":ipaddr", value=ipaddr), \ Mutation(column=self.columnFamily + ":visitTime", value=visitTime), \ Mutation(column=self.columnFamily + ":user_id", value=user_id), \ Mutation(column=self.columnFamily + ":link", value=link) ] self.client.mutateRow(self.tablename, rowkey, mutations) if __name__ == "__main__": # 建立hbase连接 hbasewriteer = CreateTableAndImportData('user_log_info', 'cf_log') hbasewriteer.createTable() # 连接HDFS client = Client(HDFSNN, timeout=200000) # 获取文件列表 logFiles = client.list(LOGPATH) # 读取文件 for logfile in logFiles: with client.read(LOGPATH + logfile) as fp: for line in fp: record = line.split(" ") hbasewriteer.importData(record)
def __init__(self):
    """Create the HDFS client for the ``fantome`` name node."""
    namenode_url = "http://fantome:50070"
    self.client = Client(namenode_url)
def get_client(self):
    """Return a new HDFS client for ``self.url``."""
    return Client(self.url)
def build_hdfs_client(self) -> Client:
    """Return an HDFS client for the configured name nodes.

    The ``self.config["hdfs"]["name_node"]`` entries are joined with ``";"``
    to form the client URL; requests are proxied as ``joowing``.
    """
    name_nodes = self.config["hdfs"]["name_node"]
    joined_url = ";".join(name_nodes)
    return Client(url=joined_url, proxy='joowing')
def __init__(self, config):
    """Keep ``config`` and build the HDFS client from its name nodes.

    The client URL is the ``config["hdfs"]["name_node"]`` entries joined with
    ``";"``; the optional ``"prefix"`` entry is kept as ``env_prefix``.
    """
    super(NormalContext, self).__init__()
    self.config = config
    name_nodes = config["hdfs"]["name_node"]
    self.hdfs_client = Client(url=";".join(name_nodes), proxy='joowing')
    self.env_prefix = config.get("prefix", None)
from hdfs import Client

# Connect to the name node's WebHDFS endpoint.
client = Client("http://master:9870")
# client.makedirs("/abc/xyz")

# Entries directly under the HDFS root.
x = client.list("/")
# The same listing, requested with status=True.
y = client.list("/", status=True)
def _load_hdfs_csv(client, hdfs_path, local_name):
    """Copy a CSV file from HDFS to ``local_name`` and load it with pandas."""
    with client.read(hdfs_path) as reader:
        text = str(reader.read(), 'utf-8')
    # Closing the file (end of the ``with`` block) guarantees the content is
    # written before pandas opens it.
    with open(local_name, "w", encoding="utf-8") as local_file:
        local_file.write(text)
    return pd.read_csv(local_name, header=0)


def _replace_on_hdfs(client, hdfs_file, local_file):
    """Delete ``hdfs_file`` on HDFS and upload ``local_file`` in its place."""
    client.delete(hdfs_file)
    client.upload(hdfs_file, local_file)


def _publish_report(client, hdfs_dir, file_name, fieldnames, row):
    """Write a one-row CSV report locally and publish it under ``hdfs_dir``."""
    with open(file_name, mode='w', newline='') as report:
        writer = csv.DictWriter(report, fieldnames)
        writer.writeheader()
        writer.writerow(row)
    _replace_on_hdfs(client, hdfs_dir + file_name, file_name)


def train(train_path,
          test_path,
          output_path,
          target,
          train_split_ratio=0.33,
          penalty='l2',
          dual=False,
          tol=1e-4,
          C=1.0,
          random_state=None,
          multi_class='ovr'):
    """Train a logistic-regression model on HDFS data and publish reports.

    Reads ``<train_path>/data/Data.csv`` and ``<test_path>/data/Data.csv``
    from HDFS, fits a ``LogisticRegression`` (liblinear solver) on a split of
    the training data, predicts the test set and uploads four CSV files below
    ``output_path``: ``abstract``, ``msg``, ``evaluation`` and ``result``.

    Args:
        train_path: HDFS directory of the training data set.
        test_path: HDFS directory of the data set to predict.
        output_path: HDFS directory receiving the report files.
        target: name of the label column in the training data.
        train_split_ratio: fraction of the training data held out for
            evaluation (passed as ``test_size``).
        penalty, dual, tol, C, random_state, multi_class: forwarded to
            ``LogisticRegression`` under the same names.

    Returns:
        None.
    """
    # Start time, both as display string and as timestamp.
    time_trains_start = time.strftime('%Y/%m/%d %H:%M:%S')
    start_time = time.time()

    # Input files on HDFS.
    train_FILENAME = train_path + "/data/Data.csv"
    test_FILENAME = test_path + "/data/Data.csv"
    client = Client(HDFS_HOSTS1)

    # Load the training and prediction data.
    df_train = _load_hdfs_csv(client, train_FILENAME, "trainData.csv")
    print(df_train)
    df_test = _load_hdfs_csv(client, test_FILENAME, "testData.csv")
    print(df_test)

    min_max_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
    # Row counts reported below; the test count comes from the test frame.
    test_data_num = df_test.shape[0]
    train_data_num = df_train.shape[0]

    # Scale the prediction set.
    # NOTE(review): the scaler is fitted on the prediction set here and then
    # re-fitted on the training features below -- confirm this is intended.
    df_test = np.array(min_max_scaler.fit_transform(df_test))

    # Split features and label, scale the features.
    cols = [column for column in df_train.columns if column != target]
    X = min_max_scaler.fit_transform(np.array(df_train[cols]))
    Y = np.array(df_train[target])

    # Hold out part of the training data for evaluation.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=train_split_ratio)

    # Keyword arguments are required for a correct binding: positionally,
    # ``random_state`` and ``multi_class`` would land on ``fit_intercept``
    # and ``intercept_scaling``.
    clf = LogisticRegression(penalty=penalty,
                             dual=dual,
                             tol=tol,
                             C=C,
                             random_state=random_state,
                             multi_class=multi_class,
                             solver='liblinear')
    clf.fit(X_train, Y_train)

    # Evaluation metrics on the held-out split (predict only once).
    train_acc = clf.score(X_test, Y_test)
    print('score Scikit learn: ', train_acc)
    held_out_prediction = clf.predict(X_test)
    train_precision_score = precision_score(Y_test, held_out_prediction)
    train_recall_score = recall_score(Y_test, held_out_prediction)
    train_f1_score = f1_score(Y_test, held_out_prediction)
    train_roc_auc_score1 = roc_auc_score(Y_test, held_out_prediction)

    # Predict the test set.
    result = clf.predict(df_test)

    # Total elapsed time as HH:MM:SS.
    train_seconds = time.time() - start_time
    m, s = divmod(train_seconds, 60)
    h, m = divmod(m, 60)
    time_trains_all = "%02d:%02d:%02d" % (h, m, s)

    # Summary report.
    _publish_report(
        client, output_path + '/abstract/data/', 'abstract.csv', [
            'FrameWork', 'Version', 'model', 'accuracy', 'time_trains_start',
            'time_trains_all', 'test_data_num', 'train_data_num'
        ], {
            'FrameWork': 'Scikit-learn',
            'Version': sklearn.__version__,
            'model': '%s' % LogisticRegression,
            'accuracy': str(train_acc),
            'time_trains_start': time_trains_start,
            'time_trains_all': time_trains_all,
            'test_data_num': str(test_data_num),
            'train_data_num': str(train_data_num),
        })

    # Model version information.
    _publish_report(
        client, output_path + '/msg/data/', 'msg.csv', [
            'accuracy', 'time_trains_start', 'time_trains_all',
            'test_data_num', 'train_data_num'
        ], {
            'accuracy': str(train_acc),
            'time_trains_start': time_trains_start,
            'time_trains_all': time_trains_all,
            'test_data_num': str(test_data_num),
            'train_data_num': str(train_data_num),
        })

    # Evaluation metrics report.
    _publish_report(
        client, output_path + '/evaluation/data/', 'evaluation.csv', [
            'accuracy', 'train_precision_score', 'train_recall_score',
            'train_f1_score', 'train_roc_auc_score1'
        ], {
            'accuracy': str(train_acc),
            'train_precision_score': train_precision_score,
            'train_recall_score': train_recall_score,
            'train_f1_score': train_f1_score,
            'train_roc_auc_score1': train_roc_auc_score1,
        })

    # Predictions for the test set; the column is named after the target.
    dataframe = pd.DataFrame({target: result})
    dataframe.to_csv("result.csv", index=False, sep=',')
    _replace_on_hdfs(client, output_path + '/result/data/' + 'result.csv',
                     'result.csv')
# NOTE(review): this snippet starts inside a method whose ``def`` line is not
# part of the visible source, and it is collapsed onto one physical line.
#
# Method tail: prints ``rowkey``, builds four Mutations (ipaddr, visitTime,
# user_id, link) under ``self.columnFamily``, appends them as a BatchMutation
# to ``mutations_batch`` and, whenever the batch length is a multiple of
# ``batch_size``, submits it with ``self.client.mutateRows`` and resets it.
# NOTE(review): no flush of a final partial batch is visible here -- confirm
# it happens in the part of the method that is not shown.
#
# Script entry point: creates the 'user_log_info' table helper and calls
# ``createTable()``, lists the files under LOGPATH on HDFS and hands the open
# read handle of each file to ``importData``.
print(rowkey) mutations = [Mutation(column=self.columnFamily + ":ipaddr", value=ipaddr), \ Mutation(column=self.columnFamily + ":visitTime", value=visitTime), \ Mutation(column=self.columnFamily + ":user_id", value=user_id), \ Mutation(column=self.columnFamily + ":link", value=link) ] # 一次提交多行 mutations_batch.append( BatchMutation(row=rowkey, mutations=mutations)) if len(mutations_batch) % batch_size == 0: self.client.mutateRows(self.tablename, mutations_batch) mutations_batch = [] if __name__ == "__main__": # 建立hbase连接 hbasewriteer = CreateTableAndImportData('user_log_info', 'cf_log') hbasewriteer.createTable() # 连接HDFS client = Client(HDFSNN) # 获取文件列表 logFiles = client.list(LOGPATH) # 读取文件 for logfile in logFiles: with client.read(os.path.join(LOGPATH, logfile)) as deal_file_handle: hbasewriteer.importData(deal_file_handle)
import os

import pandas as pd
from hdfs import Client

# Current approach for reading an HDFS file:
# 1. read the raw byte stream from HDFS
# 2. save it locally as a .csv file
# 3. load that csv file with pandas

HDFSHOST = "http://172.16.18.112:50070"
train_path = "/9a7e7ef5a78f4f8388deff28cc5c2115/dataSet/devdatasetdef19761"
test_path = "/9a7e7ef5a78f4f8388deff28cc5c2115/dataSet/devdatasetdef19762"
train_FILENAME = train_path + "/data/Data.csv"  # HDFS file path
test_FILENAME = test_path + "/data/Data.csv"  # HDFS file path

client = Client(HDFSHOST)

with client.read(train_FILENAME) as tr_s:
    tr_content = tr_s.read()
tr_s = str(tr_content, 'utf-8')

# Make sure the file is completely written: flush and fsync only have an
# effect *after* the data has been handed to the file object.
with open("trainData.csv", "w", encoding="utf-8") as tr_file:
    tr_file.write(tr_s)
    tr_file.flush()
    os.fsync(tr_file)

# Load the local copy.
df_train = pd.read_csv("trainData.csv", header=0)
print(df_train)

with client.read(test_FILENAME) as te_fs:
    te_content = te_fs.read()
te_s = str(te_content, 'utf-8')
def __init__(self, host, port=50070):
    """Build the ``http://<host>:<port>`` URL and create the HDFS client for it."""
    base_url = "http://%s:%d" % (host, port)
    self.url = base_url
    self.client = Client(url=base_url)
def __init__(self, url):
    """Create the HDFS client for ``url`` and keep it on ``self._client``."""
    hdfs_client = Client(url)
    self._client = hdfs_client