def validate_downloaded_data(data_path):
    """Validate the image folders under *data_path* against the count csv.

    Looks for a ``cnt_table*.csv`` next to the class folders; when present,
    every folder's file count must match the csv's expected count and the
    folder names must match the csv's target names (asserted).

    Returns
    -------
    (counts_match, targets, folder_counts) :
        counts_match - True when all counts agree (or no csv exists),
        targets - sorted list of class-folder names,
        folder_counts - int array of files found per folder.
        Returns ``(False, [], [])`` when *data_path* does not exist.
    """
    # Guard clause: nothing to validate without the root folder.
    if not os.path.isdir(data_path):
        print("data_path({:s}) doesnt exist".format(data_path))
        return False, [], []

    targets = funcH.getFolderList(dir2Search=data_path, sortList=True).tolist()
    csv_candidates = funcH.getFileList(dir2Search=data_path,
                                       startString="cnt_table",
                                       endString=".csv")
    has_csv = csv_candidates != []
    if has_csv:
        # Columns 0/1 hold <target name, expected count>; the last row is a
        # grand total, hence the [:-1] slices.
        table = pd.read_csv(filepath_or_buffer=os.path.join(
            data_path, csv_candidates[0]), delimiter=',')
        expected_names = table[table.columns[0]].values[:-1]
        expected_counts = table[table.columns[1]].values[:-1]

    folder_counts = np.zeros((len(targets), ), dtype=int)
    counts_match = True
    for idx, target in enumerate(targets):
        folder_counts[idx] = len(os.listdir(os.path.join(data_path, target)))
        if has_csv:
            counts_match = counts_match and (folder_counts[idx]
                                             == expected_counts[idx])
            assert (expected_names[idx] == target), "{:s}!={:s}".format(
                expected_names[idx], target)
    return counts_match, targets, folder_counts
def get_last_epoch_completed(out_folder):
    """Return the number of completed epochs inferred from *out_folder*.

    One ``output_te*.png`` image is present per finished epoch, so the
    count of matching files is the last completed epoch.
    """
    return len(funcH.getFileList(out_folder,
                                 startString="output_te",
                                 endString=".png",
                                 sortList=False))
def count_data_in_folder(data_path):
    """Count the ".png" images in each class sub-folder of *data_path*.

    Returns
    -------
    (targets, img_cnt) :
        targets - sorted list of sub-folder names,
        img_cnt - int array with the image count of each sub-folder.
        Returns ``([], [])`` when *data_path* does not exist.
    """
    if not os.path.isdir(data_path):
        print("data_path({:s}) doesnt exist".format(data_path))
        return [], []
    targets = getFolderList(dir2Search=data_path, sortList=True)
    # dtype=int (was an implicit float array): counts are integral, and this
    # keeps the result consistent with validate_downloaded_data's
    # folder_counts, which the caller subtracts these counts from.
    img_cnt = np.zeros(len(targets), dtype=int)
    for i, t in enumerate(targets):
        samples = getFileList(dir2Search=os.path.join(data_path, t),
                              endString=".png")
        img_cnt[i] = len(samples)
    return targets, img_cnt
def create_data_folder(userIDTest, userIDValid, nos, to_folder,
                       base_dir="/home/doga/DataFolder"):
    """Split the hand-image dataset into train/valid/test folders by user id.

    Source images live in
    ``<base_dir>/neuralNetHandImages_nos<nos>_rs224/imgs/<target>/`` and the
    filenames encode the user id (4th character of the second "_"-separated
    token).  Samples of ``userIDTest`` are copied to ``data_te``, samples of
    ``userIDValid`` to ``data_va`` and everything else to ``data_tr`` under
    ``<to_folder>/conv_data_<data_ident>/``; a per-target count table csv is
    written next to them.  Any pre-existing split folders are wiped first.

    Parameters
    ----------
    userIDTest : int, user whose samples form the test split
    userIDValid : int, user whose samples form the validation split
    nos : int, number-of-signs identifier in the dataset folder name
    to_folder : str, root folder the split is created under
    base_dir : str, root folder holding the original dataset

    Returns
    -------
    data_ident : str, the "te<t>_va<v>_nos<n>" identifier of the split
    """
    # base_dir = funcH.getVariableByComputerName('base_dir')  # xx/DataPath or xx/DataFolder
    data_path_base = "neuralNetHandImages_nos" + str(nos) + "_rs224"
    data_path = os.path.join(base_dir, data_path_base, "imgs")  # source images
    data_ident = "te{:d}_va{:d}_nos{:d}".format(userIDTest, userIDValid, nos)
    train_path = os.path.join(to_folder, "conv_data_" + data_ident, 'data_tr')
    valid_path = os.path.join(to_folder, "conv_data_" + data_ident, 'data_va')
    test_path = os.path.join(to_folder, "conv_data_" + data_ident, 'data_te')
    createDirIfNotExist(train_path)
    createDirIfNotExist(valid_path)
    createDirIfNotExist(test_path)

    cnt_table_fileName = os.path.join(
        to_folder, "conv_data_" + data_ident, "cnt_table" +
        "_te{:d}_va{:d}_nos{:d}".format(userIDTest, userIDValid, nos) + ".csv")

    targets = getFolderList(dir2Search=data_path, sortList=True).tolist()
    # Count table: one row per target plus a trailing "total" row.
    table_rows = targets.copy()
    table_rows.append("total")
    cnt_table = pd.DataFrame(index=table_rows,
                             columns=["train", "validation", "test", "total"])

    # Discard any previous split so the copy below starts from scratch.
    if os.path.isdir(train_path) and os.path.isdir(
            valid_path) and os.path.isdir(test_path):
        rmtree(train_path, ignore_errors=True)
        rmtree(valid_path, ignore_errors=True)
        rmtree(test_path, ignore_errors=True)
    create_sub_folders(targets, train_path)
    create_sub_folders(targets, valid_path)
    create_sub_folders(targets, test_path)

    # Zero the table once (the original zeroed it twice).  Whole-column
    # assignment is used instead of writing into .values, which is silently
    # ineffective under pandas copy-on-write.
    for col in cnt_table.columns:
        cnt_table[col] = 0

    # NOTE(review): nothing is ever appended to spaces_list in this function,
    # so the "_sl.txt" file written below is always empty — kept so the
    # output files stay identical to before.
    spaces_list = []
    for t in targets:
        print(f"Start copying target {t} -->")
        source_path = os.path.join(data_path, t)
        samples = getFileList(dir2Search=source_path, endString=".png")
        cnt_table.loc[t, "total"] = len(samples)
        cnt_table.loc["total", "total"] += len(samples)
        for s in samples:
            # Filename layout: <3 signID><1 userID><2 repID> in the second
            # "_"-separated token; the user id is its 4th character.
            sample_dict = s.split(sep="_")
            user_id_int = int(sample_dict[1][3])
            if userIDTest == user_id_int:
                copyfile(os.path.join(source_path, s),
                         os.path.join(test_path, t, s))
                cnt_table.loc[t, "test"] += 1
            elif userIDValid == user_id_int:
                copyfile(os.path.join(source_path, s),
                         os.path.join(valid_path, t, s))
                cnt_table.loc[t, "validation"] += 1
            else:
                copyfile(os.path.join(source_path, s),
                         os.path.join(train_path, t, s))
                cnt_table.loc[t, "train"] += 1
        # Accumulate this target's counts into the "total" row.
        cnt_table.loc["total", "train"] += cnt_table.loc[t, "train"]
        cnt_table.loc["total", "validation"] += cnt_table.loc[t, "validation"]
        cnt_table.loc["total", "test"] += cnt_table.loc[t, "test"]
        print(
            f"Copied {t} --> train({cnt_table.loc[t, 'train']}),valid({cnt_table.loc[t, 'validation']}),test({cnt_table.loc[t, 'test']})"
        )

    pd.DataFrame.to_csv(cnt_table, path_or_buf=cnt_table_fileName)
    print('\n'.join(map(str, spaces_list)))
    samples_list_filename = cnt_table_fileName.replace(".csv", "_sl.txt")
    with open(samples_list_filename, 'w') as f:
        for i, item in enumerate(spaces_list):
            f.write("%s - %s\n" % (str(targets[i]), str(item)))
    return data_ident
def create_dataset(path_dict, user_id_dict, params_dict):
    """Create (or reuse) the train/validation/test split of the image data.

    path_dict : dict with "data_base" (source images) and "train", "valid",
        "test" (destination folders).
    user_id_dict : dict with "test" and "valid" user ids plus
        "cross_valid_id".  When "cross_valid_id" is set and "valid" is None,
        the validation set is carved out of the train samples via a seeded
        random permutation instead of by user id.
    params_dict : dict with "exp_ident" (csv name suffix) and "randomSeed".

    Returns the per-target count table (pandas DataFrame), which is also
    saved as "cnt_table<exp_ident>.csv" next to the split folders.  Exits
    the process with code 21 when the downloaded data fails validation.
    """
    data_path = path_dict["data_base"]  # original path of data to load
    train_path = path_dict["train"]  # train data to create
    valid_path = path_dict["valid"]  # valid data to create
    test_path = path_dict["test"]  # test data to create
    # The count-table csv sits in the parent folder of the split folders.
    cnt_table_fileName = os.path.join(
        os.path.abspath(os.path.join(path_dict["train"], os.pardir)),
        "cnt_table" + params_dict["exp_ident"] + ".csv")
    img_cnt_ok_all, targets, cnt_vec_all = validate_downloaded_data(data_path)
    if not img_cnt_ok_all:
        print("download the data again!!")
        sys.exit(21)
    # Count table: one row per target plus a trailing "total" row.
    table_rows = targets.copy()
    table_rows.append("total")
    cnt_table = pd.DataFrame(index=table_rows,
                             columns=["train", "validation", "test", "total"])
    for col in cnt_table.columns:
        # NOTE(review): in-place write into .values — silently ineffective
        # under pandas copy-on-write; verify with the pandas version in use.
        cnt_table[col].values[:] = 0
    if os.path.isdir(train_path) and os.path.isdir(
            valid_path) and os.path.isdir(test_path):
        # A split already exists: reuse it if its per-folder counts add up
        # to the validated source counts, otherwise wipe and rebuild below.
        try:
            targets_tr, img_cnt_tr = count_data_in_folder(train_path)
            cnt_table["train"].values[:-1] = img_cnt_tr
            targets_va, img_cnt_va = count_data_in_folder(valid_path)
            cnt_table["validation"].values[:-1] = img_cnt_va
            targets_te, img_cnt_te = count_data_in_folder(test_path)
            cnt_table["test"].values[:-1] = img_cnt_te
            cnt_table[
                "total"].values[:-1] = img_cnt_tr + img_cnt_va + img_cnt_te
            # Fill the "total" row with the column sums over all targets.
            cnt_table[-1:].values[:] = np.sum(cnt_table[:-1].values[:],
                                              axis=0)
            if np.sum(cnt_vec_all - img_cnt_tr - img_cnt_va -
                      img_cnt_te) == 0:
                return cnt_table
            else:
                rmtree(train_path, ignore_errors=True)
                rmtree(valid_path, ignore_errors=True)
                rmtree(test_path, ignore_errors=True)
        # NOTE(review): bare except also swallows SystemExit and
        # KeyboardInterrupt; `except Exception:` would be safer here.
        except:
            rmtree(train_path, ignore_errors=True)
            rmtree(valid_path, ignore_errors=True)
            rmtree(test_path, ignore_errors=True)
    create_sub_folders(targets, train_path)
    create_sub_folders(targets, valid_path)
    create_sub_folders(targets, test_path)
    # Reset the counts before the copy pass below.
    for col in cnt_table.columns:
        cnt_table[col].values[:] = 0
    # Seed numpy (and torch, for downstream reproducibility) so the
    # cross-validation permutation below is deterministic.
    np.random.seed(seed=params_dict["randomSeed"])
    torch.random.manual_seed(params_dict["randomSeed"])
    spaces_list = []  # per-target [fr, to, -1, moved permutation indices...]
    for t in targets:
        print(f"Start copying target {t} -->")
        source_path = os.path.join(data_path, t)
        samples = funcH.getFileList(dir2Search=source_path,
                                    endString=".png")  # according to user_id_dict
        cnt_table["total"][t] = len(samples)
        cnt_table["total"]["total"] += len(samples)
        train_samples = []
        for s in samples:
            # Filename layout: <3 signID><1 userID><2 repID> in the second
            # "_"-separated token; the user id is its 4th character.
            sample_dict = s.split(sep="_")
            user_id_int = int(sample_dict[1][3])
            if user_id_dict["test"] == user_id_int:
                copyfile(os.path.join(source_path, s),
                         os.path.join(test_path, t, s))
                cnt_table["test"][t] += 1
            elif user_id_dict["cross_valid_id"] is not None and user_id_dict[
                    "valid"] is None:
                # Cross-validation mode: all non-test samples go to train
                # first; a fold is moved to validation after this loop.
                copyfile(os.path.join(source_path, s),
                         os.path.join(train_path, t, s))
                train_samples.append(os.path.join(train_path, t, s))
                cnt_table["train"][t] += 1
            elif user_id_dict["cross_valid_id"] is None and user_id_dict[
                    "valid"] == user_id_int:
                copyfile(os.path.join(source_path, s),
                         os.path.join(valid_path, t, s))
                cnt_table["validation"][t] += 1
            elif user_id_dict["cross_valid_id"] is None:
                copyfile(os.path.join(source_path, s),
                         os.path.join(train_path, t, s))
                cnt_table["train"][t] += 1
        # deal with validation samples (cross-validation mode only): move the
        # cross_valid_id-th fold of a random permutation of this target's
        # train samples into the validation folder.
        if user_id_dict["cross_valid_id"] is not None and user_id_dict[
                "valid"] is None:
            num_of_train_samples = len(train_samples)
            perm_list = np.random.permutation(num_of_train_samples)
            # 6 evenly spaced boundaries -> 5 folds over the permuted list.
            spaces = np.array(np.floor(
                np.linspace(0.0, num_of_train_samples, num=6)),
                              dtype=int)
            fr, to = spaces[user_id_dict["cross_valid_id"] -
                            1], spaces[user_id_dict["cross_valid_id"]]
            spaces_list.append(
                list(np.array([fr, to])) + list([-1]) +
                list(perm_list[fr:to]))
            for i in range(fr, to):
                sample_to_move = train_samples[perm_list[i]]
                sample_new_name = sample_to_move.replace(
                    train_path, valid_path)
                os.rename(sample_to_move, sample_new_name)
                cnt_table["train"][t] -= 1
                cnt_table["validation"][t] += 1
        # Accumulate this target's counts into the "total" row.
        cnt_table["train"]["total"] += cnt_table["train"][t]
        cnt_table["validation"]["total"] += cnt_table["validation"][t]
        cnt_table["test"]["total"] += cnt_table["test"][t]
        print(
            f"Copied {t} --> train({cnt_table['train'][t]}),valid,({cnt_table['validation'][t]})test({cnt_table['test'][t]})"
        )
    pd.DataFrame.to_csv(cnt_table, path_or_buf=cnt_table_fileName)
    print('\n'.join(map(str, spaces_list)))
    # Persist the per-target fold boundaries / moved-sample indices.
    samples_list_filename = cnt_table_fileName.replace(".csv", "_sl.txt")
    with open(samples_list_filename, 'w') as f:
        for i, item in enumerate(spaces_list):
            f.write("%s - %s\n" % (str(targets[i]), str(item)))
    return cnt_table