def load_usps(data_path=os.path.join(getVariableByComputerName("n2d_experiments"), 'usps', 'data')):
    createDirIfNotExist(data_path)
    file_name_tr = os.path.join(data_path, 'usps_train.jf')
    file_name_te = os.path.join(data_path, 'usps_test.jf')
    link_adr_path = 'https://raw.githubusercontent.com/cvjena/ITAL/master/data/usps_<trte>.jf'
    if not os.path.exists(file_name_tr):
        download_file(link_adr_path.replace("<trte>", "train"), save2path=data_path, savefilename='usps_train.jf')
    if not os.path.exists(file_name_te):
        download_file(link_adr_path.replace("<trte>", "test"), save2path=data_path, savefilename='usps_test.jf')

    # each .jf file starts with a header line and ends with a "-1" terminator;
    # every remaining row is "<label> <256 pixel values>"
    with open(file_name_tr) as f:
        data = f.readlines()
    data = [list(map(float, line.split())) for line in data[1:-1]]
    data = np.array(data)
    data_train, labels_train = data[:, 1:], data[:, 0]

    with open(file_name_te) as f:
        data = f.readlines()
    data = [list(map(float, line.split())) for line in data[1:-1]]
    data = np.array(data)
    data_test, labels_test = data[:, 1:], data[:, 0]

    x = np.concatenate((data_train, data_test)).astype('float64')
    y = np.concatenate((labels_train, labels_test))
    print('USPS samples', x.shape)
    return x, y
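# A minimal sketch of the download_file(...) helper the loaders in this file rely on,
# assuming a plain urllib implementation. The name _download_file_sketch is
# hypothetical; the repo's own helper (signature inferred from the calls above)
# may add retries, progress reporting, etc.
import urllib.request

def _download_file_sketch(link_adr, save2path=".", savefilename=None):
    """Download link_adr into save2path/savefilename unless it already exists."""
    savefilename = savefilename or os.path.basename(link_adr)
    target = os.path.join(save2path, savefilename)
    if not os.path.exists(target):
        urllib.request.urlretrieve(link_adr, target)
    return target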
def load_pendigits(data_path=os.path.join(getVariableByComputerName("n2d_experiments"), 'pendigits', 'data')):
    createDirIfNotExist(data_path)
    file_name_tr = os.path.join(data_path, 'pendigits.tra')
    file_name_te = os.path.join(data_path, 'pendigits.tes')
    link_adr_path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/pendigits<file_ending>'
    if not os.path.exists(file_name_tr):
        download_file(link_adr_path.replace("<file_ending>", ".tra"), save2path=data_path, savefilename='pendigits.tra')
        download_file(link_adr_path.replace("<file_ending>", ".tes"), save2path=data_path, savefilename='pendigits.tes')
        download_file(link_adr_path.replace("<file_ending>", ".names"), save2path=data_path, savefilename='pendigits.names')

    # load training data: comma-separated rows of "<16 features>,<label>"
    with open(file_name_tr) as file:
        data = file.readlines()
    data = [list(map(float, line.split(','))) for line in data]
    data = np.array(data).astype(np.float32)
    data_train, labels_train = data[:, :-1], data[:, -1]

    # load testing data
    with open(file_name_te) as file:
        data = file.readlines()
    data = [list(map(float, line.split(','))) for line in data]
    data = np.array(data).astype(np.float32)
    data_test, labels_test = data[:, :-1], data[:, -1]

    x = np.concatenate((data_train, data_test)).astype('float32')
    y = np.concatenate((labels_train, labels_test))
    x /= 100.  # pen coordinates are resampled to [0, 100]; scale into [0, 1]
    y = y.astype('int')
    return x, y
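# Example usage of the two loaders above (commented out so importing this module
# does not trigger downloads). The commonly cited totals are 9298 USPS samples
# (7291 train + 2007 test, 256 pixels each) and 10992 pendigits samples
# (7494 train + 3498 test, 16 pen-coordinate features each), labels 0..9:
# x, y = load_pendigits()
# print(x.shape, x.min(), x.max())   # (10992, 16), values in [0, 1] after the /100 scaling
# print(np.unique(y))                # [0 1 2 3 4 5 6 7 8 9]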
def get_create_folders(params_dict):
    data_path_base = params_dict["data_path_base"]
    data_ident = 'data_' + params_dict["data_ident"]
    base_dir = funcH.getVariableByComputerName('base_dir')  # xx/DataPath or xx/DataFolder
    results_dir = os.path.join(base_dir, 'sup', 'results_mi' + str(params_dict["model_id"]))
    models_dir = os.path.join(base_dir, 'sup', 'models_mi' + str(params_dict["model_id"]))
    data_params_folder = os.path.join(base_dir, 'sup', 'data_mi', data_ident)
    data_path_base = os.path.join(base_dir, data_path_base, "imgs")
    result_fold = os.path.join(base_dir, 'sup', 'preds_' + params_dict["modelName"], 'pred_' + params_dict["exp_ident"])
    path_dict = {
        "results": results_dir,  # folder="~/DataFolder/sup/results_mi1"
        "models": models_dir,
        "data_base": data_path_base,  # original path of data to load
        "data_params_folder": data_params_folder,  # data params folder
        "result_fold": result_fold,  # to save the predictions and labels
    }
    funcH.createDirIfNotExist(results_dir)
    funcH.createDirIfNotExist(models_dir)
    funcH.createDirIfNotExist(data_params_folder)
    funcH.createDirIfNotExist(result_fold)
    return path_dict
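# For illustration, a hypothetical params_dict and the paths the function above
# would derive from it (assuming base_dir resolves to ~/DataFolder):
#
# params_dict = {"data_path_base": "neuralNetHandImages_nos11_rs224",
#                "data_ident": "te2_va3_nos11",
#                "model_id": 1,
#                "modelName": "resnet18",
#                "exp_ident": "rs05"}
# path_dict = get_create_folders(params_dict)
# path_dict["results"]    -> ~/DataFolder/sup/results_mi1
# path_dict["data_base"]  -> ~/DataFolder/neuralNetHandImages_nos11_rs224/imgs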
def load_har(data_path=os.path.join(getVariableByComputerName("n2d_experiments"), 'har', 'data')):
    # alternative way to load this dataset?
    # https://pypi.org/project/kcc2020-tutorial-HAR-dataset/
    # entire_dataset = load_har_all()
    createDirIfNotExist(data_path)
    fold_train = os.path.join(data_path, 'train')
    fold_test = os.path.join(data_path, 'test')
    createDirIfNotExist(fold_train)
    createDirIfNotExist(fold_test)
    fname_train_x = os.path.join(fold_train, 'X_train.txt')
    fname_train_y = os.path.join(fold_train, 'y_train.txt')
    fname_test_x = os.path.join(fold_test, 'X_test.txt')
    fname_test_y = os.path.join(fold_test, 'y_test.txt')
    # mirror of the UCI HAR dataset:
    # https://github.com/mollybostic/cleaning-data-assignment/tree/master/UCI%20HAR%20Dataset
    link_adr_path = 'https://raw.githubusercontent.com/mollybostic/cleaning-data-assignment/master/UCI%20HAR%20Dataset/<trte>/<Xy>_<trte>.txt'
    if not os.path.isfile(fname_train_x):
        print('downloading X_train.txt (66.0MB)')
        download_file(link_adr_path.replace("<trte>", "train").replace("<Xy>", "X"), save2path=fold_train, savefilename='X_train.txt')
        print('downloading y_train.txt (14.7kB)')
        download_file(link_adr_path.replace("<trte>", "train").replace("<Xy>", "y"), save2path=fold_train, savefilename='y_train.txt')
        print('downloading X_test.txt (26.5MB)')
        download_file(link_adr_path.replace("<trte>", "test").replace("<Xy>", "X"), save2path=fold_test, savefilename='X_test.txt')
        print('downloading y_test.txt (5.9kB)')
        download_file(link_adr_path.replace("<trte>", "test").replace("<Xy>", "y"), save2path=fold_test, savefilename='y_test.txt')

    x_train = pd.read_csv(fname_train_x, sep=r'\s+', header=None)
    y_train = pd.read_csv(fname_train_y, header=None)
    x_test = pd.read_csv(fname_test_x, sep=r'\s+', header=None)
    y_test = pd.read_csv(fname_test_y, header=None)
    x = np.concatenate((x_train, x_test))
    y = np.concatenate((y_train, y_test))
    y = y - 1  # labels in the files start at 1; shift to start at 0
    y = y.reshape((y.size,))
    y_names = {0: 'Walking', 1: 'Upstairs', 2: 'Downstairs', 3: 'Sitting', 4: 'Standing', 5: 'Laying'}
    # TODO: not implemented (marker kept from the original code)
    return x, y, y_names
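# Sketch of using the HAR loader; the returned label-name map makes a quick
# per-activity sanity check easy (commented out to avoid triggering downloads):
# x, y, y_names = load_har()
# for lbl, cnt in zip(*np.unique(y, return_counts=True)):
#     print(y_names[int(lbl)], cnt)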
def get_args(argv):
    global debug_string_out
    parser = argparse.ArgumentParser(description='(Not Too) Deep',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--dataset', default='mnist')
    parser.add_argument('--ae_weights', default=None)
    parser.add_argument('--experiments_folder_base', default=funcH.getVariableByComputerName("n2d_experiments"))
    parser.add_argument("--mode", default='client')
    parser.add_argument("--port", default=52162)
    parser.add_argument('--gpu', default=0)
    parser.add_argument('--n_clusters', default=10, type=int)
    parser.add_argument('--batch_size', default=256, type=int)
    parser.add_argument('--pretrain_epochs', default=1000, type=int)
    parser.add_argument('--umap_dim', default=2, type=int)
    parser.add_argument('--umap_neighbors', default=10, type=int)
    parser.add_argument('--umap_min_dist', default="0.00", type=str)
    parser.add_argument('--umap_metric', default='euclidean', type=str)
    parser.add_argument('--cluster', default='GMM', type=str)
    parser.add_argument('--manifold_learner', default='UMAP', type=str)
    # argparse's type=bool treats any non-empty string (including "False") as True,
    # so these flags are defined as store_true switches instead
    parser.add_argument('--visualize', default=False, action='store_true')
    parser.add_argument('--rerun_last_plots', default=False, action='store_true')
    args = funcH._parse_args(parser, argv, print_args=True)
    debug_string_out = funcH.print_and_add('-' * 80)

    experiment_names_and_folders = {
        "exp_date_str": str(datetime.now().strftime("%Y%m%d_")).replace('-', ''),  # %M%S,
        "exp_base_str": "_".join([args.dataset,
                                  "c" + str(args.cluster) + str(args.n_clusters),
                                  "e" + str(args.pretrain_epochs)]),
        "folder_umap_data": os.path.join(args.experiments_folder_base, "exported_manifolds"),
        "folder_ae_weights": os.path.join(args.experiments_folder_base, "weights"),
    }
    experiment_names_and_folders["exp_extended"] = \
        experiment_names_and_folders["exp_base_str"] + "_" + \
        "_".join([args.manifold_learner + "ud" + str(args.umap_dim), "un" + str(args.umap_neighbors)])
    experiment_names_and_folders["folder_experiment"] = \
        os.path.join(args.experiments_folder_base, args.dataset,
                     experiment_names_and_folders["exp_date_str"] + experiment_names_and_folders["exp_extended"])
    experiment_names_and_folders["file_name_ae_weights_base"] = \
        "aew_" + "_".join([args.dataset, "c" + str(args.n_clusters), "e" + str(args.pretrain_epochs)])
    experiment_names_and_folders["file_name_ae_weights_full"] = \
        os.path.join(experiment_names_and_folders["folder_ae_weights"],
                     experiment_names_and_folders["file_name_ae_weights_base"] + '.npy')
    experiment_names_and_folders["file_name_umap_data_base"] = "ulp" + experiment_names_and_folders["exp_extended"]
    experiment_names_and_folders["file_name_umap_data_full"] = \
        os.path.join(experiment_names_and_folders["folder_umap_data"],
                     experiment_names_and_folders["file_name_umap_data_base"] + '.npy')
    experiment_names_and_folders["file_name_arguments_full"] = \
        os.path.join(experiment_names_and_folders["folder_experiment"],
                     'args_' + experiment_names_and_folders["exp_extended"] + "_" +
                     experiment_names_and_folders["exp_date_str"] + '.txt')
    experiment_names_and_folders["file_name_ae_params_text_full"] = \
        os.path.join(experiment_names_and_folders["folder_experiment"],
                     'args_autoencode_' + experiment_names_and_folders["exp_extended"] + "_" +
                     experiment_names_and_folders["exp_date_str"] + '.txt')
    experiment_names_and_folders["file_name_plot_fig_full"] = \
        os.path.join(experiment_names_and_folders["folder_experiment"],
                     'plot_' + experiment_names_and_folders["exp_extended"] + "_" +
                     experiment_names_and_folders["exp_date_str"] + '_<plot_id>.png')
    experiment_names_and_folders["file_name_plot_csv_full"] = \
        os.path.join(experiment_names_and_folders["folder_experiment"],
                     'csv_' + experiment_names_and_folders["exp_extended"] + "_" +
                     experiment_names_and_folders["exp_date_str"] + '.csv')
    experiment_names_and_folders["file_name_clusters_after_manifold_full"] = \
        os.path.join(experiment_names_and_folders["folder_experiment"],
                     'clusters_after_manifold-' + experiment_names_and_folders["exp_extended"] + "_" +
                     experiment_names_and_folders["exp_date_str"] + '.txt')
    experiment_names_and_folders["file_name_clusters_before_manifold_full"] = \
        os.path.join(experiment_names_and_folders["folder_experiment"],
                     'clusters_before_manifold-' + experiment_names_and_folders["exp_extended"] + "_" +
                     experiment_names_and_folders["exp_date_str"] + '.txt')
    experiment_names_and_folders["file_name_debug_string_out_full"] = \
        os.path.join(experiment_names_and_folders["folder_experiment"],
                     'debug_string_out-' + experiment_names_and_folders["exp_extended"] + "_" +
                     experiment_names_and_folders["exp_date_str"] + '.txt')
    experiment_names_and_folders["file_name_result_csv_file_full"] = \
        os.path.join(args.experiments_folder_base, 'results.csv')
    experiment_names_and_folders["file_name_data_before_manifold"] = \
        os.path.join(experiment_names_and_folders["folder_experiment"],
                     'data_' + experiment_names_and_folders["exp_extended"] + '_before.npz')
    experiment_names_and_folders["file_name_data_after_manifold"] = \
        os.path.join(experiment_names_and_folders["folder_experiment"],
                     'data_' + experiment_names_and_folders["exp_extended"] + '_after.npz')
    experiment_names_and_folders["file_name_cluster_obj"] = \
        os.path.join(experiment_names_and_folders["folder_experiment"],
                     'cluster_obj_' + experiment_names_and_folders["exp_extended"] + '_<bef_aft>.dictionary')
    experiment_names_and_folders["file_name_silhouette_results"] = \
        os.path.join(experiment_names_and_folders["folder_experiment"],
                     'silhouette_results_' + experiment_names_and_folders["exp_extended"] + '_<bef_aft>.npy')
    experiment_names_and_folders["file_name_results"] = \
        os.path.join(experiment_names_and_folders["folder_experiment"],
                     'results_' + experiment_names_and_folders["exp_extended"] + '.dictionary')
    args.experiment_names_and_folders = experiment_names_and_folders

    # create the folders folder_{experiment, umap_data, ae_weights}
    funcH.createDirIfNotExist(experiment_names_and_folders["folder_experiment"])
    funcH.createDirIfNotExist(experiment_names_and_folders["folder_umap_data"])
    funcH.createDirIfNotExist(experiment_names_and_folders["folder_ae_weights"])
    with open(experiment_names_and_folders["file_name_arguments_full"], 'w') as f:
        f.write("\n".join(argv))
    return args
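# Example invocation (hypothetical script name and values) and the experiment
# names get_args composes from them; the date prefix comes from datetime.now():
#
#   python n2d_experiment.py --dataset usps --n_clusters 10 --pretrain_epochs 500 \
#          --cluster GMM --manifold_learner UMAP --umap_dim 2 --umap_neighbors 10
#
# exp_base_str       -> "usps_cGMM10_e500"
# exp_extended       -> "usps_cGMM10_e500_UMAPud2_un10"
# folder_experiment  -> <experiments_folder_base>/usps/<YYYYMMDD_>usps_cGMM10_e500_UMAPud2_un10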
def create_data_folder(userIDTest, userIDValid, nos, to_folder, base_dir="/home/doga/DataFolder"):
    # base_dir = funcH.getVariableByComputerName('base_dir')  # xx/DataPath or xx/DataFolder
    data_path_base = "neuralNetHandImages_nos" + str(nos) + "_rs224"
    data_path = os.path.join(base_dir, data_path_base, "imgs")  # original path of data to load
    data_ident = "te{:d}_va{:d}_nos{:d}".format(userIDTest, userIDValid, nos)
    train_path = os.path.join(to_folder, "conv_data_" + data_ident, 'data_tr')
    valid_path = os.path.join(to_folder, "conv_data_" + data_ident, 'data_va')
    test_path = os.path.join(to_folder, "conv_data_" + data_ident, 'data_te')
    createDirIfNotExist(train_path)
    createDirIfNotExist(valid_path)
    createDirIfNotExist(test_path)
    cnt_table_fileName = os.path.join(to_folder, "conv_data_" + data_ident,
                                      "cnt_table" + "_te{:d}_va{:d}_nos{:d}".format(userIDTest, userIDValid, nos) + ".csv")

    targets = getFolderList(dir2Search=data_path, sortList=True).tolist()
    table_rows = targets.copy()
    table_rows.append("total")
    cnt_table = pd.DataFrame(index=table_rows, columns=["train", "validation", "test", "total"])
    for col in cnt_table.columns:
        cnt_table[col].values[:] = 0

    # start from empty split folders
    if os.path.isdir(train_path) and os.path.isdir(valid_path) and os.path.isdir(test_path):
        rmtree(train_path, ignore_errors=True)
        rmtree(valid_path, ignore_errors=True)
        rmtree(test_path, ignore_errors=True)
    create_sub_folders(targets, train_path)
    create_sub_folders(targets, valid_path)
    create_sub_folders(targets, test_path)
    for col in cnt_table.columns:
        cnt_table[col].values[:] = 0

    spaces_list = []
    for t in targets:
        print(f"Start copying target {t} -->")
        source_path = os.path.join(data_path, t)
        samples = getFileList(dir2Search=source_path, endString=".png")
        cnt_table["total"][t] = len(samples)
        cnt_table["total"]["total"] += len(samples)
        for s in samples:
            # file names encode <3 signID><1 userID><2 repID>
            sample_dict = s.split(sep="_")
            user_id_int = int(sample_dict[1][3])
            if userIDTest == user_id_int:
                copyfile(os.path.join(source_path, s), os.path.join(test_path, t, s))
                cnt_table["test"][t] += 1
            elif userIDValid == user_id_int:
                copyfile(os.path.join(source_path, s), os.path.join(valid_path, t, s))
                cnt_table["validation"][t] += 1
            else:
                copyfile(os.path.join(source_path, s), os.path.join(train_path, t, s))
                cnt_table["train"][t] += 1
        cnt_table["train"]["total"] += cnt_table["train"][t]
        cnt_table["validation"]["total"] += cnt_table["validation"][t]
        cnt_table["test"]["total"] += cnt_table["test"][t]
        print(f"Copied {t} --> train({cnt_table['train'][t]}),valid({cnt_table['validation'][t]}),test({cnt_table['test'][t]})")

    pd.DataFrame.to_csv(cnt_table, path_or_buf=cnt_table_fileName)
    print('\n'.join(map(str, spaces_list)))
    samples_list_filename = cnt_table_fileName.replace(".csv", "_sl.txt")
    with open(samples_list_filename, 'w') as f:
        for i, item in enumerate(spaces_list):
            f.write("%s - %s\n" % (str(targets[i]), str(item)))
    return data_ident
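# The split above keys on the user id embedded in each file name
# (<3 signID><1 userID><2 repID>). A tiny self-contained helper mirroring that
# parsing, checked against a made-up file name:
def _user_id_from_sample_name(sample_name):
    """Return the single-digit user id encoded in '<prefix>_<signID userID repID>_...' names."""
    parts = sample_name.split("_")
    return int(parts[1][3])

# e.g. _user_id_from_sample_name("sign_012402_frame.png") == 4  (hypothetical name)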
def create_sub_folders(targets, dir_path):
    """Creates empty folders which have the same name as given targets in dir_path"""
    for t in targets:
        createDirIfNotExist(os.path.join(dir_path, t))
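# createDirIfNotExist is used throughout this file; a one-line sketch of what it
# is assumed to do (the repo's funcH version may also log or normalize the path):
def _create_dir_if_not_exist_sketch(dir_path):
    os.makedirs(dir_path, exist_ok=True)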
    else:
        ae = modelLoader.loadModel("model_tex")
        prediction = ae.predict(train_images[0:199, :, :, :], verbose=1, batch_size=100)
        x = prediction[0].reshape(28, 28)
        plt.imshow(x)
        plt.show()
else:
    exp_name = 'cnnAE'
    results_dir = funcH.getVariableByComputerName('results_dir')
    outdir = os.path.join(results_dir, 'results', exp_name)
    csv_name = os.path.join(results_dir, 'epochs') + os.sep + exp_name + '.csv'
    model_name = os.path.join(results_dir, 'models') + os.sep + exp_name + '.h5'
    funcH.createDirIfNotExist(os.path.join(results_dir, 'epochs'))
    funcH.createDirIfNotExist(os.path.join(results_dir, 'models'))
    funcH.createDirIfNotExist(outdir)

    checkpointer = ModelCheckpoint(filepath=model_name, verbose=0, save_best_only=False, period=1)
    csv_logger = CSVLogger(csv_name, append=True, separator=';')
    # ES = EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=50, verbose=0, mode='auto')
    # callbacks = [csv_logger, ES, checkpointer]

    feat_set, labels_all, detailedLabels_all = dataLoader.loadData_nnVidImages('/home/dg/DataPath/bdData')
    non_zero_labels = labels_all[np.where(labels_all)]
    ae = modelLoader.modelLoad_KHS()
    ae_tester = modelLoader.modelLoad_KHS_Clusters()
    ae.compile(optimizer="adam", loss="mse")
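    # The branch above builds the checkpointing/CSV-logging callbacks but the
    # training call itself is not shown here. A minimal sketch of how they would
    # be passed to Keras fit(), assuming feat_set holds the autoencoder inputs
    # and reconstruction is the target (epochs/validation_split are placeholders):
    # ae.fit(feat_set, feat_set,
    #        batch_size=64,
    #        epochs=100,
    #        validation_split=0.2,
    #        callbacks=[csv_logger, checkpointer])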
def main(argv):
    np.set_printoptions(formatter={"float_kind": lambda x: "%g" % x})
    params_dict = parseArgs(argv)
    numOfSigns = params_dict["numOfSigns"]  # 11 or 41
    clusterModel = params_dict["clusterModel"]  # 'KMeans', 'GMM_diag', 'Spectral'
    params_dict["hostName"] = socket.gethostname()
    initialLabelVec, expNameEnd = decode_initial_label_param(params_dict["initialLabel"])
    clusterLabelUpdateInterval = params_dict["clusterLabelUpdateInterval"]
    print('you are running this train function on = <', params_dict["hostName"], '>')

    input_initial_resize, input_size, batch_size, num_workers = initSomeVals(params_dict)
    train_data_transform, valid_data_transform = getTransformFuncs(input_size, input_initial_resize)

    base_dir = funcH.getVariableByComputerName('base_dir')  # dataPath and dataFolder
    data_dir = funcH.getVariableByComputerName('data_dir')  # bdData
    results_dir = funcH.getVariableByComputerName('results_dir').replace("bdResults", "dcResults")
    labelsDir = funcH.getVariableByComputerName('results_dir').replace("bdResults", "dcLabels")
    modelsDir = os.path.join(base_dir, 'dcModels')
    nnVidsDir = os.path.join(base_dir, 'neuralNetHandVideos_' + str(numOfSigns))

    expName = params_dict["modelName"] + '_' + \
              params_dict["clusterModel"] + \
              '_pd' + str(params_dict["posterior_dim"]) + \
              '_clui' + str(params_dict["clusterLabelUpdateInterval"]) + \
              '_' + str(numOfSigns) + \
              expNameEnd
    labelSaveFolder = os.path.join(labelsDir, expName)
    resultMatFile = os.path.join(results_dir, 'rMF_' + expName)

    funcH.createDirIfNotExist(results_dir)
    funcH.createDirIfNotExist(labelsDir)
    funcH.createDirIfNotExist(modelsDir)
    funcH.createDirIfNotExist(labelSaveFolder)

    epochFr, epochTo = setEpochBounds(labelSaveFolder, params_dict["epochs"], params_dict["appendEpochBinary"])

    train_dataset = HandShapeDataset(root_dir=nnVidsDir, istrain=True, transform=train_data_transform, datasetname='nnv')
    val_dataset = HandShapeDataset(root_dir=nnVidsDir, istrain=False, transform=valid_data_transform, datasetname='nnv')
    num_classes = np.unique(train_dataset.labels).size
    print('trainCnt = ', len(train_dataset))
    print('valCnt = ', len(val_dataset))

    model, optimizer, updatedModelFile = getModel(params_dict, modelsDir, expName)
    num_ftrs = model.fc.in_features
    print('num_classes = ', num_classes, ', num_ftrs = ', num_ftrs, flush=True)

    epochStartTime = time.time()
    dsLoad_train_train = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    dsLoad_train_featExtract = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    model.eval()
    # evaluate the model once before training to extract:
    #   trAccInit         : initial training accuracy
    #   featTrInit        : features to cluster, also saved via saveFeatsExtracted
    #   labelsTrInit      : labels of the training samples
    #   predictionsTrInit : initial predictions
    trAccInit, _, featTrInit, labelsTrInit, predictionsTrInit = runValidDs(model, dsLoad_train_featExtract,
                                                                           return_feats=True, layerSize=num_ftrs)
    saveFeatsExtracted(data_dir, epochFr, params_dict["modelName"], expName, featTrInit, labelsTrInit, predictionsTrInit)

    labelSaveFileName = labelSaveFolder + os.sep + 'labels_{:03d}.npz'.format(epochFr)
    predClusters, resultRow = iterate_1(featTrInit, labelsTrInit, predictionsTrInit, params_dict["posterior_dim"],
                                        labelSaveFileName, epochFr - 1, epochTo, trAccInit, epochStartTime,
                                        clusterModel=clusterModel, initialLabelVec=initialLabelVec)
    train_dataset = updateTrainLabels(train_dataset, clusterLabelUpdateInterval, epochFr,
                                      predClusters=predClusters, initialLabelVec=initialLabelVec)

    resultMat = []
    resultMat = resultMat + resultRow.tolist()
    if not os.path.isfile(resultMatFile):
        np.savetxt(resultMatFile, np.array(resultRow).reshape(1, -1), fmt='%4.3f', delimiter='*',
                   newline=os.linesep, comments='',
                   header='ep * tr_acc_epoch * nmi_lab * nmi_lab_nz * acc_lab * acc_lab_nz * nmi_pred * nmi_pred_nz * acc_pred * acc_pred_nz')
    else:
        with open(resultMatFile, 'a') as f:
            np.savetxt(f, np.array(resultRow).reshape(1, -1), fmt='%4.3f', delimiter='*', newline=os.linesep)

    for ep in range(epochFr, epochTo):
        model.train()  # set model to training mode
        epochStartTime = time.time()
        _, _ = runTrainDs(model, optimizer, dsLoad_train_train)

        model.eval()
        tr_acc_epoch, _, features_avgPool, labels_avgPool, predictionsTr = \
            runValidDs(model, dsLoad_train_featExtract, return_feats=True, layerSize=num_ftrs)

        labelSaveFileName = labelSaveFolder + os.sep + 'labels_{:03d}.npz'.format(ep + 1)
        predClusters, resultRow = iterate_1(features_avgPool, labelsTrInit, predictionsTr, params_dict["posterior_dim"],
                                            labelSaveFileName, ep, epochTo, tr_acc_epoch, epochStartTime,
                                            clusterModel=clusterModel, initialLabelVec=initialLabelVec)
        resultMat = resultMat + resultRow.tolist()
        train_dataset = updateTrainLabels(train_dataset, clusterLabelUpdateInterval, ep + 1, predClusters=predClusters)
        saveFeatsExtracted(data_dir, ep, params_dict["modelName"], expName, features_avgPool, labelsTrInit, predictionsTr)
        saveToResultMatFile(resultMatFile, resultRow)
        torch.save(model, f=updatedModelFile)
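# saveToResultMatFile is called inside the epoch loop but defined elsewhere; a
# sketch consistent with the header/append logic used for the first row above
# (the real helper's behavior is an assumption):
def _save_to_result_mat_file_sketch(resultMatFile, resultRow):
    """Append one '*'-separated result row, writing the header only once."""
    row = np.array(resultRow).reshape(1, -1)
    if not os.path.isfile(resultMatFile):
        np.savetxt(resultMatFile, row, fmt='%4.3f', delimiter='*', newline=os.linesep, comments='',
                   header='ep * tr_acc_epoch * nmi_lab * nmi_lab_nz * acc_lab * acc_lab_nz * '
                          'nmi_pred * nmi_pred_nz * acc_pred * acc_pred_nz')
    else:
        with open(resultMatFile, 'a') as f:
            np.savetxt(f, row, fmt='%4.3f', delimiter='*', newline=os.linesep)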
def create_dataset(path_dict, user_id_dict, params_dict):
    data_path = path_dict["data_base"]  # original path of data to load
    data_params_folder = path_dict["data_params_folder"]  # train data to create
    cnt_table_fileName = os.path.join(os.path.abspath(os.path.join(path_dict["data_params_folder"], os.pardir)),
                                      "cnt_table" + params_dict["exp_ident"] + ".csv")
    targets, cnt_vec_all = read_data(data_path)
    table_rows = targets.copy()
    table_rows.append("total")
    cnt_table = pd.DataFrame(index=table_rows, columns=["train", "validation", "test", "total"])
    for col in cnt_table.columns:
        cnt_table[col].values[:] = 0

    # reuse a previously created count table if it can be read back; otherwise start over
    if os.path.isdir(data_params_folder) and os.path.isfile(cnt_table_fileName):
        try:
            # the table is written with to_csv defaults below (comma-separated, index column first)
            cnt_table = pd.read_csv(cnt_table_fileName, header=0, index_col=0)
            return cnt_table
        except Exception:
            rmtree(data_params_folder, ignore_errors=True)

    funcH.createDirIfNotExist(data_params_folder)
    for col in cnt_table.columns:
        cnt_table[col].values[:] = 0
    np.random.seed(seed=params_dict["randomSeed"])
    spaces_list = []
    for t in targets:
        print(f"Start extracting target {t} -->")
        source_path = os.path.join(data_path, t)
        samples = os.listdir(source_path)
        cnt_table["total"][t] = len(samples)
        cnt_table["total"]["total"] += len(samples)
        train_samples = []
        for s in samples:
            # file names encode <3 signID><1 userID><2 repID>
            sample_dict = s.split(sep="_")
            user_id_int = int(sample_dict[1][3])
            # get hog, skel and surf norm of the sample
            if user_id_dict["test"] == user_id_int:
                # copyfile(os.path.join(source_path, s), os.path.join(test_path, t, s))  # add to test group
                cnt_table["test"][t] += 1
            else:
                # copyfile(os.path.join(source_path, s), os.path.join(train_path, t, s))  # add to train group
                # train_samples.append(os.path.join(train_path, t, s))
                cnt_table["train"][t] += 1
        # deal with validation samples: pick one of five equal slices of the
        # shuffled train samples according to cross_valid_id
        num_of_train_samples = len(train_samples)
        perm_list = np.random.permutation(num_of_train_samples)
        spaces = np.array(np.floor(np.linspace(0.0, num_of_train_samples, num=6)), dtype=int)
        fr, to = spaces[user_id_dict["cross_valid_id"] - 1], spaces[user_id_dict["cross_valid_id"]]
        spaces_list.append(list(np.array([fr, to])) + list([-1]) + list(perm_list[fr:to]))
        # move samples fr:to from train to valid
        # for i in range(fr, to):
        #     sample_to_move = train_samples[perm_list[i]]
        #     sample_new_name = sample_to_move.replace(train_path, valid_path)
        #     os.rename(sample_to_move, sample_new_name)
        #     cnt_table["train"][t] -= 1
        #     cnt_table["validation"][t] += 1
        cnt_table["train"]["total"] += cnt_table["train"][t]
        cnt_table["validation"]["total"] += cnt_table["validation"][t]
        cnt_table["test"]["total"] += cnt_table["test"][t]
        print(f"Extracted {t} --> train({cnt_table['train'][t]}),valid({cnt_table['validation'][t]}),test({cnt_table['test'][t]})")

    pd.DataFrame.to_csv(cnt_table, path_or_buf=cnt_table_fileName)
    print('\n'.join(map(str, spaces_list)))
    samples_list_filename = cnt_table_fileName.replace(".csv", "_sl.txt")
    with open(samples_list_filename, 'w') as f:
        for i, item in enumerate(spaces_list):
            f.write("%s - %s\n" % (str(targets[i]), str(item)))
    return cnt_table
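# The count table written above has one row per sign class plus a "total" row,
# e.g. (illustrative numbers only):
#
#            train  validation  test  total
# sign_001     80           0    20    100
# sign_002     78           0    22    100
# total       158           0    42    200
#
# Note that with the copy/move lines commented out, train_samples stays empty,
# so the validation column remains 0 and the fr:to slice recorded in
# spaces_list is always (0, 0).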
def get_create_folders(params_dict):
    data_path_base = params_dict["data_path_base"]
    data_ident = 'data_' + params_dict["data_ident"]
    base_dir = funcH.getVariableByComputerName('base_dir')  # xx/DataPath or xx/DataFolder
    results_dir = os.path.join(base_dir, 'sup', 'results_' + params_dict["modelName"])
    models_dir = os.path.join(base_dir, 'sup', 'models_' + params_dict["modelName"])
    data_path_fill = os.path.join(base_dir, 'sup', 'data', data_ident)
    exp_ident_str = 'rs' + str(params_dict["randomSeed"]).zfill(2)
    data_path_train = os.path.join(data_path_fill, data_path_base + '_' + exp_ident_str + '_tr')
    data_path_valid = os.path.join(data_path_fill, data_path_base + '_' + exp_ident_str + '_va')
    data_path_test = os.path.join(data_path_fill, data_path_base + '_' + exp_ident_str + '_te')
    data_path_base = os.path.join(base_dir, data_path_base, "imgs")
    result_fold = os.path.join(base_dir, 'sup', 'preds_' + params_dict["modelName"], 'pred_' + params_dict["exp_ident"])
    path_dict = {
        "results": results_dir,  # folder="~/DataFolder/sup/results"
        "models": models_dir,
        "data_base": data_path_base,  # original path of data to load
        "train": data_path_train,  # train data to create
        "valid": data_path_valid,  # valid data to create
        "test": data_path_test,  # test data to create
        "result_fold": result_fold,  # to save the predictions and labels
    }
    funcH.createDirIfNotExist(results_dir)
    funcH.createDirIfNotExist(models_dir)
    funcH.createDirIfNotExist(data_path_train)
    funcH.createDirIfNotExist(data_path_valid)
    funcH.createDirIfNotExist(data_path_test)
    funcH.createDirIfNotExist(result_fold)
    return path_dict
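# With e.g. modelName="resnet18", randomSeed=5, data_ident="te2_va3_nos11" and
# data_path_base="neuralNetHandImages_nos11_rs224" (hypothetical values), this
# variant lays the split folders out as:
#   <base_dir>/sup/data/data_te2_va3_nos11/neuralNetHandImages_nos11_rs224_rs05_tr
#   <base_dir>/sup/data/data_te2_va3_nos11/neuralNetHandImages_nos11_rs224_rs05_va
#   <base_dir>/sup/data/data_te2_va3_nos11/neuralNetHandImages_nos11_rs224_rs05_te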