def load_usps(data_path = os.path.join(getVariableByComputerName("n2d_experiments"), 'usps', 'data')):
    createDirIfNotExist(data_path)

    file_name_tr = os.path.join(data_path, 'usps_train.jf')
    file_name_te = os.path.join(data_path, 'usps_test.jf')
    link_adr_path = 'https://raw.githubusercontent.com/cvjena/ITAL/master/data/usps_<trte>.jf'
    if not os.path.exists(file_name_tr):
        download_file(link_adr_path.replace("<trte>", "train"), save2path=data_path, savefilename='usps_train.jf')
        #os.system('wget http://www-i6.informatik.rwth-aachen.de/~keysers/usps_train.jf.gz -P %s' % data_path)
        download_file(link_adr_path.replace("<trte>", "test"), save2path=data_path, savefilename='usps_test.jf')
        #os.system('wget http://www-i6.informatik.rwth-aachen.de/~keysers/usps_test.jf.gz -P %s' % data_path)

    with open(file_name_tr) as f:
        data = f.readlines()
    data = data[1:-1]
    data = [list(map(float, line.split())) for line in data]
    data = np.array(data)
    data_train, labels_train = data[:, 1:], data[:, 0]

    with open(file_name_te) as f:
        data = f.readlines()
    data = data[1:-1]
    data = [list(map(float, line.split())) for line in data]
    data = np.array(data)
    data_test, labels_test = data[:, 1:], data[:, 0]

    x = np.concatenate((data_train, data_test)).astype('float64')
    y = np.concatenate((labels_train, labels_test))
    print('USPS samples', x.shape)
    return x, y
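The download_file helper is not shown in this listing; here is a minimal sketch of what it is assumed to do (the save2path/savefilename parameter names mirror the calls above, but the body is an assumption, not the repo's actual implementation):

import os
import urllib.request

def download_file(url, save2path=".", savefilename=None):
    """Hypothetical sketch: fetch url and store it as save2path/savefilename."""
    os.makedirs(save2path, exist_ok=True)
    if savefilename is None:
        savefilename = os.path.basename(url)
    target = os.path.join(save2path, savefilename)
    urllib.request.urlretrieve(url, target)  # raises on HTTP errors
    return target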
def load_pendigits(data_path = os.path.join(getVariableByComputerName("n2d_experiments"), 'pendigits', 'data')):
    createDirIfNotExist(data_path)
    file_name_tr = os.path.join(data_path, 'pendigits.tra')
    file_name_te = os.path.join(data_path, 'pendigits.tes')
    link_adr_path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/pendigits<file_ending>'
    if not os.path.exists(file_name_tr):
        download_file(link_adr_path.replace("<file_ending>", ".tra"), save2path=data_path, savefilename='pendigits.tra')
        #os.system('wget https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/pendigits.tra -P %s' % data_path)
        download_file(link_adr_path.replace("<file_ending>", ".tes"), save2path=data_path, savefilename='pendigits.tes')
        #os.system('wget https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/pendigits.tes -P %s' % data_path)
        download_file(link_adr_path.replace("<file_ending>", ".names"), save2path=data_path, savefilename='pendigits.names')
        #os.system('wget https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/pendigits.names -P %s' % data_path)

    # load training data
    with open(file_name_tr) as file:
        data = file.readlines()
    data = [list(map(float, line.split(','))) for line in data]
    data = np.array(data).astype(np.float32)
    data_train, labels_train = data[:, :-1], data[:, -1]

    # load testing data
    with open(file_name_te) as file:
        data = file.readlines()
    data = [list(map(float, line.split(','))) for line in data]
    data = np.array(data).astype(np.float32)
    data_test, labels_test = data[:, :-1], data[:, -1]

    x = np.concatenate((data_train, data_test)).astype('float32')
    y = np.concatenate((labels_train, labels_test))
    x /= 100.
    y = y.astype('int')
    return x, y
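A usage sketch for the two loaders (the shapes are the standard USPS and Pendigits sizes; assumes the n2d_experiments path variable resolves on this machine):

x_usps, y_usps = load_usps()     # x: (9298, 256) float64, y: digit labels
x_pen, y_pen = load_pendigits()  # x: (10992, 16) float32 scaled to [0, 1], y: int labels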
Example #3
def get_create_folders(params_dict):
    data_path_base = params_dict["data_path_base"]

    data_ident = 'data_' + params_dict["data_ident"]
    base_dir = funcH.getVariableByComputerName('base_dir')  # xx/DataPath or xx/DataFolder
    results_dir = os.path.join(base_dir, 'sup', 'results_mi' + str(params_dict["model_id"]))
    models_dir = os.path.join(base_dir, 'sup', 'models_mi' + str(params_dict["model_id"]))
    data_params_folder = os.path.join(base_dir, 'sup', 'data_mi', data_ident)

    data_path_base = os.path.join(base_dir, data_path_base, "imgs")
    result_fold = os.path.join(base_dir, 'sup', 'preds_' + params_dict["modelName"], 'pred_' + params_dict["exp_ident"])

    path_dict = {
        "results": results_dir,  # folder="~/DataFolder/sup/results_mi1"
        "models": models_dir,
        "data_base": data_path_base,  # original path of data to load
        "data_params_folder": data_params_folder,  # data params folder
        "result_fold": result_fold,  # to save the predictions and labels
    }

    funcH.createDirIfNotExist(results_dir)
    funcH.createDirIfNotExist(models_dir)
    funcH.createDirIfNotExist(data_params_folder)
    funcH.createDirIfNotExist(result_fold)

    return path_dict
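get_create_folders leans on two funcH helpers used throughout these examples. Hypothetical sketches, assuming paths are resolved per machine (the hostname and paths below are illustrative, not from the repo):

import os
import socket

_PATHS_BY_HOST = {  # illustrative per-machine registry
    "my-host": {"base_dir": "/home/doga/DataFolder",
                "n2d_experiments": "/home/doga/DataFolder/n2d_experiments"},
}

def getVariableByComputerName(name):
    # look up a named path for the current machine
    return _PATHS_BY_HOST[socket.gethostname()][name]

def createDirIfNotExist(path):
    os.makedirs(path, exist_ok=True)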
def load_har(data_path = os.path.join(getVariableByComputerName("n2d_experiments"), 'har', 'data')):
    # alternative: this dataset can also be loaded via the kcc2020 tutorial package
    # https://pypi.org/project/kcc2020-tutorial-HAR-dataset/
    # entire_dataset = load_har_all()
    createDirIfNotExist(data_path)
    fold_train = os.path.join(data_path, 'train')
    fold_test = os.path.join(data_path, 'test')
    createDirIfNotExist(fold_train)
    createDirIfNotExist(fold_test)
    fname_train_x = os.path.join(fold_train, 'X_train.txt')
    fname_train_y = os.path.join(fold_train, 'y_train.txt')
    fname_test_x = os.path.join(fold_test, 'X_test.txt')
    fname_test_y = os.path.join(fold_test, 'y_test.txt')

    # https://github.com/mollybostic/cleaning-data-assignment/tree/master/UCI%20HAR%20Dataset
    # for windows = https://sourceforge.net/projects/gnuwin32/files/wget/1.11.4-1/wget-1.11.4-1-setup.exe/download
    # https://stackoverflow.com/questions/29113456/wget-not-recognized-as-internal-or-external-command

    link_adr_path = 'https://raw.githubusercontent.com/mollybostic/cleaning-data-assignment/master/UCI%20HAR%20Dataset/<trte>/<Xy>_<trte>.txt'
    if not os.path.isfile(fname_train_x):
        print('downloading X_train.txt (66.0 MB)')
        download_file(link_adr_path.replace("<trte>", "train").replace("<Xy>", "X"), save2path=fold_train, savefilename='X_train.txt')
        #os.system("wget --no-verbose 'https://raw.githubusercontent.com/mollybostic/cleaning-data-assignment/master/UCI HAR Dataset/train/X_train.txt' -P %s" % fold_train)
        print('downloading y_train.txt (14.7 kB)')
        download_file(link_adr_path.replace("<trte>", "train").replace("<Xy>", "y"), save2path=fold_train, savefilename='y_train.txt')
        #os.system("wget --no-verbose 'https://raw.githubusercontent.com/mollybostic/cleaning-data-assignment/master/UCI HAR Dataset/train/y_train.txt' -P %s" % fold_train)
        print('downloading X_test.txt (26.5 MB)')
        download_file(link_adr_path.replace("<trte>", "test").replace("<Xy>", "X"), save2path=fold_test, savefilename='X_test.txt')
        #os.system("wget --no-verbose 'https://raw.githubusercontent.com/mollybostic/cleaning-data-assignment/master/UCI HAR Dataset/test/X_test.txt' -P %s" % fold_test)
        print('downloading y_test.txt (5.9 kB)')
        download_file(link_adr_path.replace("<trte>", "test").replace("<Xy>", "y"), save2path=fold_test, savefilename='y_test.txt')
        #os.system("wget --no-verbose 'https://raw.githubusercontent.com/mollybostic/cleaning-data-assignment/master/UCI HAR Dataset/test/y_test.txt' -P %s" % fold_test)

    x_train = pd.read_csv(fname_train_x, sep=r'\s+', header=None)
    y_train = pd.read_csv(fname_train_y, header=None)
    x_test = pd.read_csv(fname_test_x, sep=r'\s+', header=None)
    y_test = pd.read_csv(fname_test_y, header=None)
    x = np.concatenate((x_train, x_test))
    y = np.concatenate((y_train, y_test))
    # labels start at 1, shift them to the 0-based range used by y_names
    y = y - 1
    y = y.reshape((y.size,))
    y_names = {0: 'Walking', 1: 'Upstairs', 2: 'Downstairs', 3: 'Sitting', 4: 'Standing', 5: 'Laying'}
    return x, y, y_names
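Usage sketch: map the 0-based labels back to activity names (10299 samples with 561 features is the full UCI HAR set):

x, y, y_names = load_har()
print(x.shape)                                # (10299, 561)
print([y_names[label] for label in y[:5]])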
Example #5
def get_args(argv):
    global debug_string_out
    parser = argparse.ArgumentParser(
        description='(Not Too) Deep',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--dataset', default='mnist')
    parser.add_argument('--ae_weights', default=None)
    parser.add_argument('--experiments_folder_base', default=funcH.getVariableByComputerName("n2d_experiments"))
    parser.add_argument("--mode", default='client')
    parser.add_argument("--port", default=52162)
    parser.add_argument('--gpu', default=0)
    parser.add_argument('--n_clusters', default=10, type=int)
    parser.add_argument('--batch_size', default=256, type=int)
    parser.add_argument('--pretrain_epochs', default=1000, type=int)
    parser.add_argument('--umap_dim', default=2, type=int)
    parser.add_argument('--umap_neighbors', default=10, type=int)
    parser.add_argument('--umap_min_dist', default="0.00", type=str)
    parser.add_argument('--umap_metric', default='euclidean', type=str)
    parser.add_argument('--cluster', default='GMM', type=str)
    parser.add_argument('--manifold_learner', default='UMAP', type=str)
    # argparse's type=bool treats any non-empty string as True; store_true is the safe flag form
    parser.add_argument('--visualize', action='store_true')
    parser.add_argument('--rerun_last_plots', action='store_true')
    args = funcH._parse_args(parser, argv, print_args=True)
    debug_string_out = funcH.print_and_add('-' * 80)

    experiment_names_and_folders = {
        "exp_date_str": datetime.now().strftime("%Y%m%d_"),
        "exp_base_str": "_".join([args.dataset, "c" + str(args.cluster) + str(args.n_clusters), "e" + str(args.pretrain_epochs)]),
        "folder_umap_data": os.path.join(args.experiments_folder_base, "exported_manifolds"),
        "folder_ae_weights": os.path.join(args.experiments_folder_base, "weights"),
    }
    enf = experiment_names_and_folders  # short alias for the assignments below
    enf["exp_extended"] = enf["exp_base_str"] + "_" + "_".join([args.manifold_learner + "ud" + str(args.umap_dim), "un" + str(args.umap_neighbors)])
    enf["folder_experiment"] = os.path.join(args.experiments_folder_base, args.dataset, enf["exp_date_str"] + enf["exp_extended"])
    exp_dir = enf["folder_experiment"]
    exp_tag = enf["exp_extended"] + "_" + enf["exp_date_str"]
    enf["file_name_ae_weights_base"] = "aew_" + "_".join([args.dataset, "c" + str(args.n_clusters), "e" + str(args.pretrain_epochs)])
    enf["file_name_ae_weights_full"] = os.path.join(enf["folder_ae_weights"], enf["file_name_ae_weights_base"] + '.npy')
    enf["file_name_umap_data_base"] = "ulp" + enf["exp_extended"]
    enf["file_name_umap_data_full"] = os.path.join(enf["folder_umap_data"], enf["file_name_umap_data_base"] + '.npy')
    enf["file_name_arguments_full"] = os.path.join(exp_dir, 'args_' + exp_tag + '.txt')
    enf["file_name_ae_params_text_full"] = os.path.join(exp_dir, 'args_autoencode_' + exp_tag + '.txt')
    enf["file_name_plot_fig_full"] = os.path.join(exp_dir, 'plot_' + exp_tag + '_<plot_id>.png')
    enf["file_name_plot_csv_full"] = os.path.join(exp_dir, 'csv_' + exp_tag + '.csv')
    enf["file_name_clusters_after_manifold_full"] = os.path.join(exp_dir, 'clusters_after_manifold-' + exp_tag + '.txt')
    enf["file_name_clusters_before_manifold_full"] = os.path.join(exp_dir, 'clusters_before_manifold-' + exp_tag + '.txt')
    enf["file_name_debug_string_out_full"] = os.path.join(exp_dir, 'debug_string_out-' + exp_tag + '.txt')
    enf["file_name_result_csv_file_full"] = os.path.join(args.experiments_folder_base, 'results.csv')
    enf["file_name_data_before_manifold"] = os.path.join(exp_dir, 'data_' + enf["exp_extended"] + '_before.npz')
    enf["file_name_data_after_manifold"] = os.path.join(exp_dir, 'data_' + enf["exp_extended"] + '_after.npz')
    enf["file_name_cluster_obj"] = os.path.join(exp_dir, 'cluster_obj_' + enf["exp_extended"] + '_<bef_aft>.dictionary')
    enf["file_name_silhouette_results"] = os.path.join(exp_dir, 'silhouette_results_' + enf["exp_extended"] + '_<bef_aft>.npy')
    enf["file_name_results"] = os.path.join(exp_dir, 'results_' + enf["exp_extended"] + '.dictionary')

    args.experiment_names_and_folders = experiment_names_and_folders

    # create the three folders folder_{experiment, umap_data, ae_weights}
    funcH.createDirIfNotExist(experiment_names_and_folders["folder_experiment"])
    funcH.createDirIfNotExist(experiment_names_and_folders["folder_umap_data"])
    funcH.createDirIfNotExist(experiment_names_and_folders["folder_ae_weights"])

    with open(experiment_names_and_folders["file_name_arguments_full"], 'w') as f:
        f.write("\n".join(argv))
    return args
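A usage sketch: get_args only needs an argv-style list, so it can be driven programmatically (the flag values here are illustrative):

args = get_args(['--dataset', 'usps', '--n_clusters', '10', '--pretrain_epochs', '100'])
print(args.experiment_names_and_folders['folder_experiment'])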
Example #6
def create_data_folder(userIDTest,
                       userIDValid,
                       nos,
                       to_folder,
                       base_dir="/home/doga/DataFolder"):
    #  base_dir = funcH.getVariableByComputerName('base_dir')  # xx/DataPath or xx/DataFolder
    data_path_base = "neuralNetHandImages_nos" + str(nos) + "_rs224"
    data_path = os.path.join(base_dir, data_path_base,
                             "imgs")  # original path of data to load
    data_ident = "te{:d}_va{:d}_nos{:d}".format(userIDTest, userIDValid, nos)
    train_path = os.path.join(to_folder, "conv_data_" + data_ident, 'data_tr')
    valid_path = os.path.join(to_folder, "conv_data_" + data_ident, 'data_va')
    test_path = os.path.join(to_folder, "conv_data_" + data_ident, 'data_te')

    createDirIfNotExist(train_path)
    createDirIfNotExist(valid_path)
    createDirIfNotExist(test_path)

    cnt_table_fileName = os.path.join(
        to_folder, "conv_data_" + data_ident, "cnt_table" +
        "_te{:d}_va{:d}_nos{:d}".format(userIDTest, userIDValid, nos) + ".csv")
    targets = getFolderList(dir2Search=data_path, sortList=True).tolist()
    table_rows = targets.copy()
    table_rows.append("total")
    cnt_table = pd.DataFrame(index=table_rows,
                             columns=["train", "validation", "test", "total"])
    for col in cnt_table.columns:
        cnt_table[col].values[:] = 0

    # start from a clean split: remove any previous copies before re-creating them
    if os.path.isdir(train_path) and os.path.isdir(
            valid_path) and os.path.isdir(test_path):
        rmtree(train_path, ignore_errors=True)
        rmtree(valid_path, ignore_errors=True)
        rmtree(test_path, ignore_errors=True)

    create_sub_folders(targets, train_path)
    create_sub_folders(targets, valid_path)
    create_sub_folders(targets, test_path)

    for t in targets:
        print(f"Start copying target {t} -->")
        source_path = os.path.join(data_path, t)
        samples = getFileList(dir2Search=source_path, endString=".png")
        # according to user_id_dict
        cnt_table.loc[t, "total"] = len(samples)
        cnt_table.loc["total", "total"] += len(samples)
        for s in samples:
            sample_dict = s.split(sep="_")
            # <3 signID><1 userID><2 repID>
            # int_id = int(sample_dict[1])
            # user_id = ((int_id - int_id.__mod__(100))/100).__mod__(10)
            # user_id_str = sample_dict[1][3]
            user_id_int = int(sample_dict[1][3])
            # if user_id_dict["valid"] == user_id_int:
            #    copyfile(os.path.join(source_path, s), os.path.join(valid_path, t, s))
            #    cnt_table["validation"][t] += 1
            if userIDTest == user_id_int:
                copyfile(os.path.join(source_path, s),
                         os.path.join(test_path, t, s))
                cnt_table.loc[t, "test"] += 1
            elif userIDValid == user_id_int:
                copyfile(os.path.join(source_path, s),
                         os.path.join(valid_path, t, s))
                cnt_table.loc[t, "validation"] += 1
            else:
                copyfile(os.path.join(source_path, s),
                         os.path.join(train_path, t, s))
                cnt_table.loc[t, "train"] += 1

        cnt_table.loc["total", "train"] += cnt_table.loc[t, "train"]
        cnt_table.loc["total", "validation"] += cnt_table.loc[t, "validation"]
        cnt_table.loc["total", "test"] += cnt_table.loc[t, "test"]
        print(
            f"Copied {t} --> train({cnt_table.loc[t, 'train']}),valid({cnt_table.loc[t, 'validation']}),test({cnt_table.loc[t, 'test']})"
        )

    cnt_table.to_csv(cnt_table_fileName)

    return data_ident
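A sketch of the file-name convention the loop above parses: the second underscore-separated token packs <3-digit signID><1-digit userID><2-digit repID>, so the user id is that token's fourth character (the sample name below is hypothetical):

sample = "sign_012407_f001.png"              # signID=012, userID=4, repID=07 (made up)
user_id_int = int(sample.split("_")[1][3])   # -> 4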
Example #7
def create_sub_folders(targets, dir_path):
    """Create an empty subfolder of dir_path for each name in targets."""
    for t in targets:
        createDirIfNotExist(os.path.join(dir_path, t))
Example #8
    # excerpt from a larger script: the enclosing if/else (not shown) presumably
    # chooses between training a new autoencoder and loading a saved one
    else:
        ae = modelLoader.loadModel("model_tex")

    prediction = ae.predict(train_images[0:199, :, :, :], verbose=1, batch_size=100)
    x = prediction[0].reshape(28, 28)
    plt.imshow(x)
    plt.show()
else:
    exp_name = 'cnnAE'
    results_dir = funcH.getVariableByComputerName('results_dir')
    outdir = os.path.join(results_dir, 'results', exp_name)

    csv_name = os.path.join(results_dir, 'epochs', exp_name + '.csv')
    model_name = os.path.join(results_dir, 'models', exp_name + '.h5')

    funcH.createDirIfNotExist(os.path.join(results_dir, 'epochs'))
    funcH.createDirIfNotExist(os.path.join(results_dir, 'models'))
    funcH.createDirIfNotExist(outdir)

    checkpointer = ModelCheckpoint(filepath=model_name, verbose=0, save_best_only=False, period=1)
    csv_logger = CSVLogger(csv_name, append=True, separator=';')
    #ES = EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=50, verbose=0, mode='auto')
    #callbacks = [csv_logger, ES, checkpointer]

    feat_set, labels_all, detailedLabels_all = dataLoader.loadData_nnVidImages('/home/dg/DataPath/bdData')
    non_zero_labels = labels_all[np.where(labels_all)]

    ae = modelLoader.modelLoad_KHS()
    ae_tester = modelLoader.modelLoad_KHS_Clusters()
    ae.compile(optimizer="adam", loss="mse")
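The commented-out callbacks above hint at how training was wired up; a minimal sketch of passing them to fit, training the autoencoder to reconstruct its own input (the epoch and batch values are illustrative, not from the original script):

ae.fit(feat_set, feat_set,
       epochs=50, batch_size=64,
       callbacks=[csv_logger, checkpointer],
       verbose=1)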
def main(argv):
    np.set_printoptions(formatter={"float_kind": lambda x: "%g" % x})

    params_dict = parseArgs(argv)
    numOfSigns = params_dict["numOfSigns"]  # 11 or 41
    clusterModel = params_dict["clusterModel"]  # 'KMeans', 'GMM_diag', 'Spectral'
    params_dict["hostName"] = socket.gethostname()
    initialLabelVec, expNameEnd = decode_initial_label_param(params_dict["initialLabel"])
    clusterLabelUpdateInterval = params_dict["clusterLabelUpdateInterval"]

    print('running this train function on host <', params_dict["hostName"], '>')

    input_initial_resize, input_size, batch_size, num_workers = initSomeVals(params_dict)
    train_data_transform, valid_data_transform = getTransformFuncs(input_size, input_initial_resize)

    base_dir = funcH.getVariableByComputerName('base_dir')  # dataPath and dataFolder
    data_dir = funcH.getVariableByComputerName('data_dir')  # bdData
    results_dir = funcH.getVariableByComputerName('results_dir').replace("bdResults", "dcResults")
    labelsDir = funcH.getVariableByComputerName('results_dir').replace("bdResults", "dcLabels")
    modelsDir = os.path.join(base_dir, 'dcModels')
    nnVidsDir = os.path.join(base_dir, 'neuralNetHandVideos_' + str(numOfSigns))

    expName = params_dict["modelName"] + '_' + \
              params_dict["clusterModel"] + \
              '_pd' + str(params_dict["posterior_dim"]) + \
              '_clui' + str(params_dict["clusterLabelUpdateInterval"]) + \
              '_' + str(numOfSigns) + \
              expNameEnd
    labelSaveFolder = os.path.join(labelsDir, expName)
    resultMatFile = os.path.join(results_dir, 'rMF_' + expName)

    funcH.createDirIfNotExist(results_dir)
    funcH.createDirIfNotExist(labelsDir)
    funcH.createDirIfNotExist(modelsDir)
    funcH.createDirIfNotExist(labelSaveFolder)

    epochFr, epochTo = setEpochBounds(labelSaveFolder, params_dict["epochs"], params_dict["appendEpochBinary"])

    train_dataset = HandShapeDataset(root_dir=nnVidsDir, istrain=True, transform=train_data_transform, datasetname='nnv')
    val_dataset = HandShapeDataset(root_dir=nnVidsDir, istrain=False, transform=valid_data_transform, datasetname='nnv')

    num_classes = np.unique(train_dataset.labels).size

    print('trainCnt = ', len(train_dataset))
    print('valCnt = ', len(val_dataset))

    model, optimizer, updatedModelFile = getModel(params_dict, modelsDir, expName)

    num_ftrs = model.fc.in_features
    print('num_classes = ', num_classes, ', num_ftrs = ', num_ftrs, flush=True)

    epochStartTime = time.time()

    dsLoad_train_train = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    dsLoad_train_featExtract = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    model.eval()

    #  evaluate the model once before training to extract:
    #  trAccInit         : initial training accuracy
    #  featTrInit        : features to cluster, also saved as result features in -saveFeatsExtracted-
    #  labelsTrInit      : ground-truth labels of the training samples
    #  predictionsTrInit : the model's predictions before any fine-tuning
    trAccInit, _, featTrInit, labelsTrInit, predictionsTrInit = runValidDs(model, dsLoad_train_featExtract, return_feats=True, layerSize=num_ftrs)

    saveFeatsExtracted(data_dir, epochFr, params_dict["modelName"], expName, featTrInit, labelsTrInit, predictionsTrInit)

    labelSaveFileName = os.path.join(labelSaveFolder, 'labels_{:03d}.npz'.format(epochFr))
    predClusters, resultRow = iterate_1(featTrInit, labelsTrInit, predictionsTrInit, params_dict["posterior_dim"],
                                        labelSaveFileName, epochFr-1, epochTo, trAccInit,
                                        epochStartTime, clusterModel=clusterModel, initialLabelVec=initialLabelVec)

    train_dataset = updateTrainLabels(train_dataset, clusterLabelUpdateInterval, epochFr, predClusters=predClusters, initialLabelVec=initialLabelVec)

    resultMat = resultRow.tolist()
    if not os.path.isfile(resultMatFile):
        np.savetxt(resultMatFile, np.array(resultRow).reshape(1, -1), fmt='%4.3f', delimiter='*', newline=os.linesep,
                   header='ep * tr_acc_epoch * nmi_lab * nmi_lab_nz * acc_lab * acc_lab_nz * nmi_pred * nmi_pred_nz * acc_pred * acc_pred_nz',
                   footer='', comments='', encoding=None)
    else:
        with open(resultMatFile, 'a') as f:
            np.savetxt(f, np.array(resultRow).reshape(1, -1), fmt='%4.3f', delimiter='*', newline=os.linesep, header='', footer='', comments='', encoding=None)


    for ep in range(epochFr, epochTo):
        model.train()  # Set model to training mode
        epochStartTime = time.time()
        _, _ = runTrainDs(model, optimizer, dsLoad_train_train)

        model.eval()
        tr_acc_epoch, _, features_avgPool, labels_avgPool, predictionsTr = \
            runValidDs(model, dsLoad_train_featExtract, return_feats=True, layerSize=num_ftrs)

        labelSaveFileName = os.path.join(labelSaveFolder, 'labels_{:03d}.npz'.format(ep+1))
        predClusters, resultRow = iterate_1(features_avgPool, labelsTrInit, predictionsTr,
                                            params_dict["posterior_dim"], labelSaveFileName, ep, epochTo, tr_acc_epoch,
                                            epochStartTime, clusterModel=clusterModel, initialLabelVec=initialLabelVec)
        resultMat = resultMat + resultRow.tolist()

        train_dataset = updateTrainLabels(train_dataset, clusterLabelUpdateInterval, ep+1, predClusters=predClusters)

        saveFeatsExtracted(data_dir, ep, params_dict["modelName"], expName, features_avgPool, labelsTrInit, predictionsTr)
        saveToResultMatFile(resultMatFile, resultRow)
        torch.save(model, f=updatedModelFile)
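Since the result rows are written with delimiter='*' and a single header line, the result matrix can be read back with np.loadtxt; a small sketch:

resultMat_loaded = np.loadtxt(resultMatFile, delimiter='*', skiprows=1)
print(resultMat_loaded.shape)  # (rows_logged, 10), matching the 10-column header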
Example #10
def create_dataset(path_dict, user_id_dict, params_dict):
    data_path = path_dict["data_base"]  # original path of data to load
    data_params_folder = path_dict["data_params_folder"]  # train data to create
    cnt_table_fileName = os.path.join(os.path.abspath(os.path.join(path_dict["data_params_folder"], os.pardir)), "cnt_table" +
                                      params_dict["exp_ident"] + ".csv")

    targets, cnt_vec_all = read_data(data_path)

    table_rows = targets.copy()
    table_rows.append("total")
    cnt_table = pd.DataFrame(index=table_rows, columns=["train", "validation", "test", "total"])
    for col in cnt_table.columns:
        cnt_table[col].values[:] = 0

    if os.path.isdir(data_params_folder) and os.path.isfile(cnt_table_fileName):
        try:
            # the table is written below with to_csv defaults, so read it back the same way
            cnt_table = pd.read_csv(cnt_table_fileName, header=0, index_col=0)
            return cnt_table
        except Exception:
            rmtree(data_params_folder, ignore_errors=True)

    funcH.createDirIfNotExist(data_params_folder)

    np.random.seed(seed=params_dict["randomSeed"])
    spaces_list = []
    for t in targets:
        print(f"Start extracting target {t} -->")
        source_path = os.path.join(data_path, t)
        samples = os.listdir(source_path)
        # according to user_id_dict
        cnt_table.loc[t, "total"] = len(samples)
        cnt_table.loc["total", "total"] += len(samples)
        train_samples = []
        for s in samples:
            sample_dict = s.split(sep="_")
            # <3 signID><1 userID><2 repID>
            # int_id = int(sample_dict[1])
            # user_id = ((int_id - int_id.__mod__(100))/100).__mod__(10)
            # user_id_str = sample_dict[1][3]
            user_id_int = int(sample_dict[1][3])
            #if user_id_dict["valid"] == user_id_int:
            #    copyfile(os.path.join(source_path, s), os.path.join(valid_path, t, s))
            #    cnt_table["validation"][t] += 1

            #### get hog, skel and surf norm of the sample

            if user_id_dict["test"] == user_id_int:
                # copyfile(os.path.join(source_path, s), os.path.join(test_path, t, s))
                #### add to test group
                cnt_table.loc[t, "test"] += 1
            else:
                # copyfile(os.path.join(source_path, s), os.path.join(train_path, t, s))
                #### add to train group
                # train_samples.append(os.path.join(train_path, t, s))
                cnt_table.loc[t, "train"] += 1
        # deal with validation samples
        # NOTE: train_samples stays empty until the commented-out append above is
        # restored, so fr == to and no validation slice is actually selected yet
        num_of_train_samples = len(train_samples)
        perm_list = np.random.permutation(num_of_train_samples)
        spaces = np.array(np.floor(np.linspace(0.0, num_of_train_samples, num=6)), dtype=int)
        fr, to = spaces[user_id_dict["cross_valid_id"]-1], spaces[user_id_dict["cross_valid_id"]]
        spaces_list.append(list(np.array([fr, to])) + [-1] + list(perm_list[fr:to]))

        #### move samples fr:to  from train to valid

        # for i in range(fr, to):
            # sample_to_move = train_samples[perm_list[i]]
            # sample_new_name = sample_to_move.replace(train_path, valid_path)
            # os.rename(sample_to_move, sample_new_name)
            # cnt_table["train"][t] -= 1
            # cnt_table["validation"][t] += 1

        cnt_table.loc["total", "train"] += cnt_table.loc[t, "train"]
        cnt_table.loc["total", "validation"] += cnt_table.loc[t, "validation"]
        cnt_table.loc["total", "test"] += cnt_table.loc[t, "test"]
        print(f"Extracted {t} --> train({cnt_table.loc[t, 'train']}),valid({cnt_table.loc[t, 'validation']}),test({cnt_table.loc[t, 'test']})")

    cnt_table.to_csv(cnt_table_fileName)
    print('\n'.join(map(str, spaces_list)))
    samples_list_filename = cnt_table_fileName.replace(".csv", "_sl.txt")
    with open(samples_list_filename, 'w') as f:
        for i, item in enumerate(spaces_list):
            f.write("%s - %s\n" % (str(targets[i]), str(item)))

    return cnt_table
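A worked sketch of the 5-fold slicing used above: np.linspace splits the permuted sample indices into five nearly equal chunks, and cross_valid_id (1..5) picks one chunk for validation (the sample count n is illustrative):

import numpy as np
n = 23
spaces = np.array(np.floor(np.linspace(0.0, n, num=6)), dtype=int)  # [ 0  4  9 13 18 23]
cross_valid_id = 3
fr, to = spaces[cross_valid_id - 1], spaces[cross_valid_id]         # samples 9..12 -> validation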
Example #11
def get_create_folders(params_dict):
    data_path_base = params_dict["data_path_base"]

    data_ident = 'data_' + params_dict["data_ident"]
    base_dir = funcH.getVariableByComputerName(
        'base_dir')  # xx/DataPath or xx/DataFolder
    results_dir = os.path.join(base_dir, 'sup',
                               'results_' + params_dict["modelName"])
    models_dir = os.path.join(base_dir, 'sup',
                              'models_' + params_dict["modelName"])
    data_path_fill = os.path.join(base_dir, 'sup', 'data', data_ident)

    exp_ident_str = 'rs' + str(params_dict["randomSeed"]).zfill(2)
    data_path_train = os.path.join(
        data_path_fill, data_path_base + '_' + exp_ident_str + '_tr')
    data_path_valid = os.path.join(
        data_path_fill, data_path_base + '_' + exp_ident_str + '_va')
    data_path_test = os.path.join(data_path_fill,
                                  data_path_base + '_' + exp_ident_str + '_te')
    data_path_base = os.path.join(base_dir, data_path_base, "imgs")
    result_fold = os.path.join(base_dir, 'sup',
                               'preds_' + params_dict["modelName"],
                               'pred_' + params_dict["exp_ident"])

    path_dict = {
        "results": results_dir,  # folder="~/DataFolder/sup/results"
        "models": models_dir,
        "data_base": data_path_base,  # original path of data to load
        "train": data_path_train,  # train data to create
        "valid": data_path_valid,  # valid data to create
        "test": data_path_test,  # test data to create
        "result_fold": result_fold,  # to save the predictions and labels
    }

    funcH.createDirIfNotExist(results_dir)
    funcH.createDirIfNotExist(models_dir)
    funcH.createDirIfNotExist(data_path_train)
    funcH.createDirIfNotExist(data_path_valid)
    funcH.createDirIfNotExist(data_path_test)
    funcH.createDirIfNotExist(result_fold)

    return path_dict
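A usage sketch with an illustrative params_dict (the keys mirror the ones read above; the values follow the naming conventions seen in create_data_folder, and "resnet18" is a made-up model name):

params_dict = {"data_path_base": "neuralNetHandImages_nos11_rs224",
               "data_ident": "te2_va3_nos11",
               "modelName": "resnet18",
               "exp_ident": "rs01",
               "randomSeed": 1}
path_dict = get_create_folders(params_dict)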