def delete(self, request, *args, **kwargs):
    file_id = request.data.get('file_id')
    where = DataSource.objects.get(id=file_id).where
    if where == 'hdfs':
        # Data lives on HDFS: remove the file, then the metadata record.
        file = DataSource.objects.get(id=file_id)
        hdfs_name = file.format_filename
        client = Client(HDFS_HOST)
        client.delete('/datahoop/' + hdfs_name, recursive=True)
        file.delete()
    else:
        # Data lives in MongoDB: remove the document, then the metadata record.
        client = pymongo.MongoClient(settings.MONGO_DB_HOST, settings.MONGO_DB_PORT)
        db = client.datahoop.data
        data_obj = DataSource.objects.filter(id=file_id).first()
        obj_id = data_obj.obj_id
        data_obj.delete()
        db.remove({"_id": ObjectId(obj_id)})
        client.close()
    return HttpResponse(content_type='application/json')
def get(self, request):
    # delete mydata
    file_id = request.GET.get('file_id')
    try:
        where = DataSource.objects.get(id=file_id).where
        print(DataSource.objects.get(id=file_id))
        print(where)
        format_filename = DataSource.objects.get(id=file_id).format_filename
        format_name_count = DataSource.objects.filter(
            format_filename=format_filename).count()
        if where == 'hdfs' and format_name_count == 1:
            # Last record referencing this HDFS file: delete the file as well.
            file = DataSource.objects.get(id=file_id)
            hdfs_name = file.format_filename
            client = Client(HDFS_HOST)
            client.delete('/datahoop/' + hdfs_name, recursive=True)
            file.delete()
            item = Collect.objects.filter(file_id=file_id)
            if item:
                item.delete()
        elif where == 'hdfs' and format_name_count > 1:
            # Other records still reference the HDFS file: only drop the metadata.
            file = DataSource.objects.get(id=file_id)
            file.delete()
            item = Collect.objects.filter(file_id=file_id)
            if item:
                item.delete()
        else:
            # Data stored in MongoDB: remove the document and the metadata record.
            client = pymongo.MongoClient(settings.MONGO_DB_URI)
            db = client.datahoop.data
            data_obj = DataSource.objects.filter(id=file_id).first()
            obj_id = data_obj.obj_id
            data_obj.delete()
            db.remove({"_id": ObjectId(obj_id)})
            client.close()
            item = Collect.objects.filter(file_id=file_id)
            if item:
                item.delete()
        return JsonResponse({'status': True})
    except Exception:
        return JsonResponse({'status': False})
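# A minimal request sketch for the delete view above, using Django's test client.
# The URL path '/datasource/delete/' is an illustrative assumption; substitute the
# route this view is actually wired to.
# from django.test import Client as DjangoTestClient
# c = DjangoTestClient()
# resp = c.get('/datasource/delete/', {'file_id': 1})
# print(resp.json())  # expected: {'status': True} on success, {'status': False} otherwise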
def train(train_path, test_path, output_path, target, train_split_ratio=0.33,
          penalty='l2', dual=False, tol=1e-4, C=1.0, random_state=None,
          multi_class='ovr'):
    # Record the start time
    time_trains_start = time.strftime('%Y{y}%m{m}%d{d} %H{h}%M{f}%S{s}'.format(
        y='/', m='/', d='', h=':', f=':', s=''))
    start_time = time.time()

    # Input file paths on HDFS
    train_FILENAME = train_path + "/data/Data.csv"  # HDFS path of the training data
    test_FILENAME = test_path + "/data/Data.csv"    # HDFS path of the test data
    client = Client(HDFS_HOSTS1)

    # Read the training data
    with client.read(train_FILENAME) as tr_fs:
        tr_content = tr_fs.read()
    tr_s = str(tr_content, 'utf-8')
    # Write to a local file and make sure it reaches disk before reading it back
    tr_file = open("trainData.csv", "w")
    tr_file.write(tr_s)
    tr_file.flush()
    os.fsync(tr_file.fileno())
    tr_file.close()
    df_train = pd.read_csv("trainData.csv", header=0)
    print(df_train)

    # Read the test data
    with client.read(test_FILENAME) as te_fs:
        te_content = te_fs.read()
    te_s = str(te_content, 'utf-8')
    # Write to a local file and make sure it reaches disk before reading it back
    te_file = open("testData.csv", "w")
    te_file.write(te_s)
    te_file.flush()
    os.fsync(te_file.fileno())
    te_file.close()
    df_test = pd.read_csv("testData.csv", header=0)
    print(df_test)

    min_max_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
    test_data_num = df_test.shape[0]
    train_data_num = df_train.shape[0]

    # Scale the prediction set
    df_test = min_max_scaler.fit_transform(df_test)
    df_test = np.array(df_test)

    # Clean and prepare the training data
    cols = [tmp_i for tmp_i in df_train.columns if tmp_i not in [target]]
    X = df_train[cols]
    X = np.array(X)
    X = min_max_scaler.fit_transform(X)
    Y = df_train[target]
    Y = np.array(Y)

    # Split the training data
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=train_split_ratio)

    # Train a logistic regression model with scikit-learn
    clf = LogisticRegression(penalty=penalty, dual=dual, tol=tol, C=C,
                             random_state=random_state, multi_class=multi_class,
                             solver='liblinear')
    clf.fit(X_train, Y_train)

    # Accuracy
    train_acc = clf.score(X_test, Y_test)
    print('score Scikit learn: ', train_acc)
    # Precision
    train_precision_score = precision_score(Y_test, clf.predict(X_test))
    # Recall
    train_recall_score = recall_score(Y_test, clf.predict(X_test))
    # F1 score
    train_f1_score = f1_score(Y_test, clf.predict(X_test))
    # ROC AUC
    train_roc_auc_score1 = roc_auc_score(Y_test, clf.predict(X_test))

    # Predict on the test set
    result = clf.predict(df_test)
    # print(result)

    # Record the end time and compute the total training time
    train_end = time.time()
    train_seconds = train_end - start_time
    m, s = divmod(train_seconds, 60)
    h, m = divmod(m, 60)
    time_trains_all = "%02d:%02d:%02d" % (h, m, s)

    # ---------------------------- Save the training results ----------------------------
    # Save the model summary report
    # abstract_path = HDFS_HOSTS1 + output_path + '/abstract/data/'
    abstract_path = output_path + '/abstract/data/'
    f = open('abstract.csv', mode='w', newline='')
    fileheader = [
        'FrameWork', 'Version', 'model', 'accuracy', 'time_trains_start',
        'time_trains_all', 'test_data_num', 'train_data_num'
    ]
    w = csv.DictWriter(f, fileheader)
    w.writeheader()
    csv_dict = edict()
    csv_dict.FrameWork = 'Scikit-learn'
    csv_dict.Version = sklearn.__version__
    csv_dict.model = '%s' % LogisticRegression
    csv_dict.accuracy = str(train_acc)
    csv_dict.time_trains_start = time_trains_start
    csv_dict.time_trains_all = time_trains_all
    csv_dict.test_data_num = str(test_data_num)
    csv_dict.train_data_num = str(train_data_num)
    w.writerow(csv_dict)
    f.close()
    client.delete(abstract_path + 'abstract.csv')
    client.upload(abstract_path + 'abstract.csv', 'abstract.csv')
    # if len(client.list(abstract_path)):
    #     client.delete(abstract_path + 'abstract.csv')
    #     client.upload(abstract_path + 'abstract.csv', 'abstract.csv')
    # else:
    #     client.upload(abstract_path + 'abstract.csv', 'abstract.csv')

    # Save the model version info CSV
    version_path = output_path + '/msg/data/'
    f = open('msg.csv', mode='w', newline='')
    fileheader = [
        'accuracy', 'time_trains_start', 'time_trains_all', 'test_data_num',
        'train_data_num'
    ]
    w = csv.DictWriter(f, fileheader)
    w.writeheader()
    csv_dict = edict()
    csv_dict.accuracy = str(train_acc)
    csv_dict.time_trains_start = time_trains_start
    csv_dict.time_trains_all = time_trains_all
    csv_dict.test_data_num = str(test_data_num)
    csv_dict.train_data_num = str(train_data_num)
    w.writerow(csv_dict)
    f.close()
    client.delete(version_path + 'msg.csv')
    client.upload(version_path + 'msg.csv', 'msg.csv')

    # Save the training evaluation metrics report
    file_csv_path = output_path + '/evaluation/data/'
    f = open('evaluation.csv', mode='w', newline='')
    fileheader = [
        'accuracy', 'train_precision_score', 'train_recall_score',
        'train_f1_score', 'train_roc_auc_score1'
    ]
    w = csv.DictWriter(f, fileheader)
    w.writeheader()
    csv_dict = edict()
    csv_dict.accuracy = str(train_acc)
    csv_dict.train_precision_score = train_precision_score
    csv_dict.train_recall_score = train_recall_score
    csv_dict.train_f1_score = train_f1_score
    csv_dict.train_roc_auc_score1 = train_roc_auc_score1
    w.writerow(csv_dict)
    f.close()
    client.delete(file_csv_path + 'evaluation.csv')
    client.upload(file_csv_path + 'evaluation.csv', 'evaluation.csv')

    # Save the test-set predictions
    file_csv_path = output_path + '/result/data/'
    # The dict keys become the CSV column names
    dataframe = pd.DataFrame({target: result})
    # index=False suppresses the row index column
    dataframe.to_csv("result.csv", index=False, sep=',')
    client.delete(file_csv_path + 'result.csv')
    client.upload(file_csv_path + 'result.csv', 'result.csv')
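# A minimal invocation sketch for train(); the HDFS directories and the target
# column name below are illustrative assumptions, not paths from this project.
if __name__ == '__main__':
    train(train_path='/user/demo/lr/train',
          test_path='/user/demo/lr/test',
          output_path='/user/demo/lr/output',
          target='label',
          train_split_ratio=0.33,
          penalty='l2',
          C=1.0)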
class HDFSUtil:

    def __init__(self, url):
        self._client = Client(url)

    def make_dir(self, hdfs_path):
        """
        Create a directory, recursively creating parent directories as needed.
        :param hdfs_path:
        :return:
        """
        self._client.makedirs(hdfs_path)

    def delete_hdfs_file(self, hdfs_path):
        """
        Delete an HDFS file. If it is a directory, it must be empty.
        :param hdfs_path:
        :return:
        """
        self._client.delete(hdfs_path)

    def delete_hdfs_dir(self, hdfs_dir):
        """
        Delete an HDFS file/directory. Non-empty directories are deleted recursively.
        :param hdfs_dir:
        :return:
        """
        dir_list = self.hdfs_dir_list(hdfs_dir)
        if dir_list is None or len(dir_list) == 0:
            print('Delete File: {0}'.format(hdfs_dir))
            self._client.delete(hdfs_dir)
        else:
            for file_name in dir_list:
                self.delete_hdfs_dir(hdfs_dir + '/' + file_name)
            print('Delete Dir: {0}'.format(hdfs_dir))
            self._client.delete(hdfs_dir)

    def upload_to_hdfs(self, local_path, hdfs_path):
        """
        Upload a local file/directory to HDFS. Missing target directories are created automatically.
        :param local_path:
        :param hdfs_path:
        :return:
        """
        self._client.upload(hdfs_path, local_path, cleanup=True)

    def download_from_hdfs(self, hdfs_path, local_path):
        """
        Download a file/directory from HDFS to the local filesystem.
        :param hdfs_path:
        :param local_path:
        :return:
        """
        self._client.download(hdfs_path, local_path, overwrite=True)

    def write_to_hdfs(self, hdfs_path, data, overwrite=False, append=True):
        """
        Append:    overwrite=False, append=True  => default
        Overwrite: overwrite=True,  append=False
        overwrite and append must be mutually exclusive.
        :param hdfs_path:
        :param data:
        :param overwrite: Boolean, overwrite existing content
        :param append: Boolean, append to existing content
        :return:
        """
        if not self._client.content(hdfs_path, strict=False):
            print('File Not exist in HDFS')
        self._client.write(hdfs_path, data, overwrite=overwrite, append=append)

    def move_or_rename(self, hdfs_src_path, hdfs_dst_path):
        """
        Move/rename a file.
        :param hdfs_src_path:
        :param hdfs_dst_path:
        :return:
        """
        self._client.rename(hdfs_src_path, hdfs_dst_path)

    def hdfs_dir_list(self, hdfs_path):
        """
        List the files under the given directory.
        If hdfs_path is not a directory, catch the exception and return None.
        :param hdfs_path:
        :return: List[filename] or None
        """
        try:
            return self._client.list(hdfs_path, status=False)
        except HdfsError:
            return None
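# A minimal usage sketch for HDFSUtil; the NameNode URL and the paths are
# illustrative assumptions rather than values taken from this project.
util = HDFSUtil('http://192.168.146.133:9870')
util.make_dir('/tmp/hdfs_util_demo')
util.write_to_hdfs('/tmp/hdfs_util_demo/demo.txt', 'hello hdfs',
                   overwrite=True, append=False)
print(util.hdfs_dir_list('/tmp/hdfs_util_demo'))
util.delete_hdfs_dir('/tmp/hdfs_util_demo')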
if __name__ == '__main__':
    hdfs_ip = "192.168.146.133"
    hdfs_version = 3
    hdfs_root = "~/test"
    filepath = r"C:\Users\daqige\PycharmProjects\newLeetCode\convert.py"
    # WebHDFS listens on 9870 for Hadoop 3.x and on 50070 for Hadoop 2.x
    hdfs_addr = "http://" + hdfs_ip + ":" + str(9870 if hdfs_version == 3 else 50070)
    client = Client(hdfs_addr)

    # print("Create directory")
    # client.makedirs(hdfs_root)
    # print(client.list("/"))
    #
    # print("Upload file")
    # client.upload(hdfs_root, filepath)
    # print(client.list(hdfs_root))
    #
    # print("Rename file")
    # client.rename(hdfs_root + "/convert.py", hdfs_root + "/ubuntu.py")
    # print(client.list(hdfs_root))

    print("Download file")
    client.download(hdfs_root + "/ubuntu.py", ".")
    print(os.listdir("."))

    print("Delete file")
    client.delete(hdfs_root + "/ubuntu.py")
    print(client.list(hdfs_root))
def interface(train_path, test_path, output_path, target, chaid_ratio,
              train_split_ratio=0.3, n_estimators=100, max_depth=5,
              min_samples_split=3, min_samples_leaf=2, min_split_gain=0.0,
              colsample_bytree="log2", subsample=0.8, random_state=100):
    # Record the start time
    time_trains_start = time.strftime('%Y{y}%m{m}%d{d} %H{h}%M{f}%S{s}'.format(
        y='/', m='/', d='', h=':', f=':', s=''))
    start_time = time.time()

    # Input file paths on HDFS
    train_FILENAME = train_path + "/data/Data.csv"  # HDFS path of the training data
    test_FILENAME = test_path + "/data/Data.csv"    # HDFS path of the test data
    client = Client(HDFS_HOSTS1)

    # Read the training data
    with client.read(train_FILENAME) as tr_fs:
        tr_content = tr_fs.read()
    tr_s = str(tr_content, 'utf-8')
    # Write to a local file and make sure it reaches disk before reading it back
    tr_file = open("trainData.csv", "w")
    tr_file.write(tr_s)
    tr_file.flush()
    os.fsync(tr_file.fileno())
    tr_file.close()
    df_train = pd.read_csv("trainData.csv", header=0)
    print(df_train)

    # Read the test data
    with client.read(test_FILENAME) as te_fs:
        te_content = te_fs.read()
    te_s = str(te_content, 'utf-8')
    # Write to a local file and make sure it reaches disk before reading it back
    te_file = open("testData.csv", "w")
    te_file.write(te_s)
    te_file.flush()
    os.fsync(te_file.fileno())
    te_file.close()
    df_test = pd.read_csv("testData.csv", header=0)
    print(df_test)

    test_data_num = df_test.shape[0]
    train_data_num = df_train.shape[0]

    # Chi-squared feature selection: keep the top chaid_ratio (default 80%) of
    # columns most correlated with the label column
    ch2 = SelectKBest(chi2, k=int(df_train.shape[1] * chaid_ratio))
    chi_df_train = pd.DataFrame(ch2.fit_transform(df_train, df_train[target]))
    label_df = df_train[target]

    # wine dataset and sonar dataset
    # Note: this positional argument order follows the RandomForestClassifier
    # implementation used by this project; min_split_gain, colsample_bytree and
    # subsample are not scikit-learn parameters.
    clf = RandomForestClassifier(n_estimators, max_depth, min_samples_split,
                                 min_samples_leaf, min_split_gain,
                                 colsample_bytree, subsample, random_state)

    # Split the data and train
    train_count = int(train_split_ratio * len(chi_df_train))
    clf.fit(chi_df_train.iloc[:train_count], label_df.iloc[:train_count])
    train_acc = metrics.accuracy_score(
        label_df.iloc[:train_count], clf.predict(chi_df_train.iloc[:train_count]))
    print("Model accuracy:", train_acc)
    # Precision
    train_precision_score = metrics.precision_score(
        label_df.iloc[:train_count], clf.predict(chi_df_train.iloc[:train_count]))
    # Recall
    train_recall_score = metrics.recall_score(
        label_df.iloc[:train_count], clf.predict(chi_df_train.iloc[:train_count]))
    # F1 score
    train_f1_score = metrics.f1_score(
        label_df.iloc[:train_count], clf.predict(chi_df_train.iloc[:train_count]))
    # ROC AUC
    train_roc_auc_score1 = metrics.roc_auc_score(
        label_df.iloc[:train_count], clf.predict(chi_df_train.iloc[:train_count]))

    # Keep the same columns in the test set as were selected on the training set
    ch2_list = list(ch2.get_support())
    ch2_list.pop()  # drop the last mask entry (assumes the label is the last column)
    df_test_head = list(df_test.columns)
    for x, y in zip(ch2_list, df_test_head):
        if not x:
            df_test_head.remove(y)
    df_test = df_test[df_test_head]

    # Predict
    result = clf.predict(df_test)
    # print(result)

    # Record the end time and compute the total training time
    train_end = time.time()
    train_seconds = train_end - start_time
    m, s = divmod(train_seconds, 60)
    h, m = divmod(m, 60)
    time_trains_all = "%02d:%02d:%02d" % (h, m, s)
    # print(time_trains_start, time_trains_all)

    # ---------------------------- Save the training results ----------------------------
    # Save the model summary report
    abstract_path = output_path + '/abstract/data/'
    f = open('abstract.csv', mode='w', newline='')
    fileheader = [
        'FrameWork', 'Version', 'model', 'accuracy', 'time_trains_start',
        'time_trains_all', 'test_data_num', 'train_data_num'
    ]
    w = csv.DictWriter(f, fileheader)
    w.writeheader()
    csv_dict = edict()
    csv_dict.FrameWork = 'Scikit-learn'
    csv_dict.Version = sklearn.__version__
    csv_dict.model = '%s' % RandomForestClassifier
    csv_dict.accuracy = str(train_acc)
    csv_dict.time_trains_start = time_trains_start
    csv_dict.time_trains_all = time_trains_all
    csv_dict.test_data_num = str(test_data_num)
    csv_dict.train_data_num = str(train_data_num)
    w.writerow(csv_dict)
    f.close()
    client.delete(abstract_path + 'abstract.csv')
    client.upload(abstract_path + 'abstract.csv', 'abstract.csv')

    # Save the model version info CSV
    version_path = output_path + '/msg/data/'
    f = open('msg.csv', mode='w', newline='')
    fileheader = [
        'accuracy', 'time_trains_start', 'time_trains_all', 'test_data_num',
        'train_data_num'
    ]
    w = csv.DictWriter(f, fileheader)
    w.writeheader()
    csv_dict = edict()
    csv_dict.accuracy = str(train_acc)
    csv_dict.time_trains_start = time_trains_start
    csv_dict.time_trains_all = time_trains_all
    csv_dict.test_data_num = str(test_data_num)
    csv_dict.train_data_num = str(train_data_num)
    w.writerow(csv_dict)
    f.close()
    client.delete(version_path + 'msg.csv')
    client.upload(version_path + 'msg.csv', 'msg.csv')

    # Save the training evaluation metrics report
    file_csv_path = output_path + '/evaluation/data/'
    f = open('evaluation.csv', mode='w', newline='')
    fileheader = [
        'accuracy', 'train_precision_score', 'train_recall_score',
        'train_f1_score', 'train_roc_auc_score1'
    ]
    w = csv.DictWriter(f, fileheader)
    w.writeheader()
    csv_dict = edict()
    csv_dict.accuracy = str(train_acc)
    csv_dict.train_precision_score = train_precision_score
    csv_dict.train_recall_score = train_recall_score
    csv_dict.train_f1_score = train_f1_score
    csv_dict.train_roc_auc_score1 = train_roc_auc_score1
    w.writerow(csv_dict)
    f.close()
    client.delete(file_csv_path + 'evaluation.csv')
    client.upload(file_csv_path + 'evaluation.csv', 'evaluation.csv')

    # Save the test-set predictions
    file_csv_path = output_path + '/result/data/'
    dataframe = pd.DataFrame({target: result})
    dataframe.to_csv("result.csv", index=False, sep=',')
    client.delete(file_csv_path + 'result.csv')
    client.upload(file_csv_path + 'result.csv', 'result.csv')
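# A minimal invocation sketch for interface(); the HDFS directories, target
# column and chi-squared ratio below are illustrative assumptions.
if __name__ == '__main__':
    interface(train_path='/user/demo/rf/train',
              test_path='/user/demo/rf/test',
              output_path='/user/demo/rf/output',
              target='label',
              chaid_ratio=0.8,
              n_estimators=100,
              max_depth=5)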
# Write to a file (overwrite)
client.write(file_name, data="hello hdfs !", overwrite=True)
# Write to a file (append)
client.write(file_name, data="hello way !", overwrite=False, append=True)
# Read file contents
with client.read(file_name, encoding='utf-8') as f:
    print(f.read())
# Download a file
client.download(file_name, local_file_name, overwrite=True)
# Upload a file
client.upload(file_name + '111', local_file_name, cleanup=True)
# Delete a file
client.delete(file_name2)
# Rename a file
client.rename(file_name, file_name2)
# List the files under a directory
files = client.list(file_dir, status=False)
for file in files:
    print(file)
# Delete a directory (recursive - use with caution)
# client.delete(file_dir, recursive=True)
class RF_HDFS(object):

    def __init__(self):
        self.client = None
        self.directory = None

    def connect_and_login(self, **kwargs):
        import requests

        host = None
        port = None
        kdc = None
        user = None
        password = None
        root = None
        timeout = None
        proxy = None

        if 'host' in kwargs:
            host = kwargs['host']
        if 'port' in kwargs:
            port = kwargs['port']
        if 'kdc' in kwargs:
            kdc = kwargs['kdc']
        if 'user' in kwargs:
            user = kwargs['user']
        if 'password' in kwargs:
            password = kwargs['password']
        if 'root' in kwargs:
            root = kwargs['root']
        if 'proxy' in kwargs:
            proxy = kwargs['proxy']
        if 'timeout' in kwargs:
            timeout = kwargs['timeout']

        self.session = requests.Session()
        adapter = requests.adapters.HTTPAdapter(pool_maxsize=0)
        self.session.mount('http://', adapter)
        self.session.mount('https://', adapter)
        self.session.headers.update({'Connection': 'Keep-Alive'})
        self.connectionStatus = False
        try:
            timeout = int(timeout)
            url = "http://" + host + ":" + str(port)
            hdfsLogin = WebHDFS(url, kdc)
            cookieStr = hdfsLogin.authenticate(user, password)
            if cookieStr != None:
                cookieList = cookieStr.split('=', 1)
                cookieDict = {cookieList[0]: cookieList[1]}
                requests.utils.add_dict_to_cookiejar(self.session.cookies, cookieDict)
            self.client = Client(url, root=root, proxy=proxy, timeout=timeout,
                                 session=self.session)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        self.connectionStatus = True
        return self.client

    def checkConnectionStatus(self):
        return self.connectionStatus

    def list_dir(self, directory):
        output = []
        try:
            if directory != None:
                output = self.client.list(directory, status=True)
            else:
                output = self.client.list(self.client.root, status=True)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def list_names(self, directory):
        output = []
        try:
            if directory != None:
                output = self.client.list(directory, status=False)
            else:
                output = self.client.list(self.client.root, status=False)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def upload(self, remote_path, local_path, overwrite=False, permission=None):
        output = None
        try:
            output = self.client.upload(remote_path, local_path, overwrite,
                                        permission=permission)
        except HdfsError as hdfsError:
            # For some reason this exception includes the entire stack trace after
            # the error message, so split on '\n' and only return the first line.
            error = str(hdfsError).splitlines()[0]
            raise HdfsLibraryError(error)
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def download(self, remote_path, local_path, overwrite=False):
        output = None
        try:
            output = self.client.download(remote_path, local_path, overwrite)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def mkdir(self, directory, permission):
        try:
            # no return value
            self.client.makedirs(directory, permission=permission)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def rmdir(self, directory):
        try:
            # no return value
            if self.client.delete(directory, recursive=True) == False:
                raise HdfsLibraryError("Directory does not exist: %r", directory)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def rename(self, src_file, dst_file):
        try:
            # no return value
            self.client.rename(src_file, dst_file)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def delete(self, file):
        try:
            # no return value
            if self.client.delete(file) == False:
                raise HdfsLibraryError("File does not exist: %r", file)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def set_time(self, file, mod_time):
        try:
            # no return value
            self.client.set_times(file, -1, mod_time)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def set_owner(self, file, owner, group):
        try:
            # no return value
            self.client.set_owner(file, owner=owner, group=group)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def set_permission(self, file, permission):
        try:
            # no return value
            self.client.set_permission(file, permission=permission)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def set_acl(self, file, aclspec):
        try:
            # no return value
            self.client.set_acl(file, aclspec=aclspec)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def status(self, path):
        output = ''
        try:
            output = self.client.status(path)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def checksum(self, path):
        output = ''
        try:
            output = self.client.checksum(path)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def close(self):
        self.session.close()
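# A minimal usage sketch for RF_HDFS; the host, credentials and paths are
# illustrative assumptions, and the keyword names follow connect_and_login().
rf = RF_HDFS()
rf.connect_and_login(host='namenode.example.com', port=50070,
                     kdc='kdc.example.com', user='hdfs',
                     password='secret', root='/', timeout=30)
print(rf.list_names('/tmp'))
rf.close()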