예제 #1
0
def init():
    global debug_string_out
    debug_string_out.clear()
    plt.style.use(['seaborn-white', 'seaborn-paper'])
    sns.set_context("paper", font_scale=1.3)

    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # str(sys.argv[2])
    os.environ['PYTHONHASHSEED'] = '0'
    os.environ['TF_CUDNN_USE_AUTOTUNE'] = '0'

    rn.seed(0)
    try:
        tf.set_random_seed(seed=0)
    except:
        tf.random.set_seed(seed=0)
    np.random.seed(0)

    if len(K.tensorflow_backend._get_available_gpus()) > 0:
        debug_string_out = funcH.print_and_add("Using GPU", debug_string_out)
        session_conf = tf.ConfigProto(intra_op_parallelism_threads=1,
                                      inter_op_parallelism_threads=1,
                                      )
        sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
        K.set_session(sess)
    try:
        from MulticoreTSNE import MulticoreTSNE as TSNE
    except BaseException:
        debug_string_out = funcH.print_and_add("Missing MulticoreTSNE package.. Only important if evaluating other manifold learners.", debug_string_out)
    np.set_printoptions(threshold=sys.maxsize)
    matplotlib.use('agg')
예제 #2
0
def n_eval_result(hle, y, y_pred, label_names, cluster_func_name, clusters_count, dataset_name, definition_string, pngnameadd, experiment_names_and_folders, optional_params=None, visualize=False):
    global debug_string_out
    y_pred = np.asarray(y_pred)
    # y_pred = y_pred.reshape(len(y_pred), )
    y = np.asarray(y)
    # y = y.reshape(len(y), )
    manifold_learner = funcH.get_attribute(optional_params, "manifold_learner", default_type=str, default_val='UMAP')

    kluster_centroids = funcH.get_cluster_centroids(hle, y_pred, kluster_centers=None, verbose=0)
    acc_doga = n_get_acc(y, y_pred, centroid_info_pdf=kluster_centroids)
    acc_doga_wo_kluster_centroids = n_get_acc(y, y_pred, centroid_info_pdf=None)
    debug_string_out = funcH.print_and_add("acc_doga:" + "{:6.4f}".format(acc_doga) + ", acc_doga_wo_kluster_centroids:" + "{:6.4f}".format(acc_doga_wo_kluster_centroids), debug_string_out)
    acc = np.round(cluster_acc(y, y_pred), 5)
    nmi = np.round(metrics.normalized_mutual_info_score(y, y_pred), 5)
    ari = np.round(metrics.adjusted_rand_score(y, y_pred), 5)
    debug_string_out = funcH.print_and_add(definition_string + "-" + dataset_name + " | " + manifold_learner +
          " on autoencoded embedding with " + cluster_func_name + " - N2D", debug_string_out)
    debug_string_out = funcH.print_and_add("acc_doga(%6.3f),acc(%6.3f),nmi(%6.3f),ari(%6.3f)" % (acc_doga, acc, nmi, ari), debug_string_out)
    if visualize:
        try:
            n2d_plot(hle, y, clusters_count=clusters_count, plot_id='n2d'+pngnameadd,
                     file_name_plot_fig_full=experiment_names_and_folders["file_name_plot_fig_full"],
                     file_name_plot_csv_full=experiment_names_and_folders["file_name_plot_csv_full"],
                     label_names=label_names)
            y_pred_viz, _, _ = best_cluster_fit(y, y_pred)
            n2d_plot(hle, y_pred_viz, clusters_count=clusters_count, plot_id='n2d-predicted'+pngnameadd,
                     file_name_plot_fig_full=experiment_names_and_folders["file_name_plot_fig_full"],
                     file_name_plot_csv_full=experiment_names_and_folders["file_name_plot_csv_full"],
                     label_names=label_names)
        except:
            debug_string_out = funcH.print_and_add("could not visualize", debug_string_out)
    return y_pred, acc, nmi, ari, acc_doga
예제 #3
0
def n_run_autoencode(x, args):
    global debug_string_out
    # input_dict :
    # fit_verbose
    input_dict = argparse.ArgumentParser(description='func_autoencode', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    input_dict.add_argument('--experiments_folder_base', default=funcH.getVariableByComputerName("n2d_experiments"))
    input_dict.add_argument('--n_clusters', default=10, type=int)
    input_dict.add_argument('--dataset', default='mnist')
    input_dict.add_argument('--batch_size', default=256, type=int)
    input_dict.add_argument('--pretrain_epochs', default=100, type=int)
    input_dict.add_argument('--fit_verbose', default=True, type=bool)
    args2 = funcH._parse_args(input_dict, args, print_args=True)

    shape = [x.shape[-1], 500, 500, 2000, args2.n_clusters]
    ae = _autoencoder(shape)
    hidden = ae.get_layer(name='encoder_%d' % (len(shape) - 2)).output
    encoder = Model(inputs=ae.input, outputs=hidden)

    print("checking if ", args.experiment_names_and_folders["file_name_ae_weights_full"], " exist.")
    weights_file = args.experiment_names_and_folders["file_name_ae_weights_full"]
    load_file_skip_learning = os.path.isfile(weights_file)
    t = funcH.Timer()

    # Pretrain autoencoders before clustering
    if load_file_skip_learning:
        debug_string_out = funcH.print_and_add("Load weigths from(" + weights_file + ")", debug_string_out)
        ae.load_weights(weights_file)
    else:
        optimizer = 'adam'
        ae.compile(loss='mse', optimizer=optimizer)
        ae.fit(x, x, batch_size=args2.batch_size, epochs=args2.pretrain_epochs, verbose=1)
        t.end()
        ae.save_weights(weights_file)
        debug_string_out = funcH.print_and_add("Time to train the ae: " + t.get_elapsed_time(), debug_string_out)

    with open(args.experiment_names_and_folders["file_name_ae_params_text_full"], 'w') as f:
        f.write("\n".join([str(k)+":"+str(args2.__dict__[k]) for k in args2.__dict__]))

    hl = encoder.predict(x)
    return hl
예제 #4
0
def n_run_cluster(hle, n_clusters, cluster_func_name='GMM', experiment_names_and_folders=None, file_name_add=""):
    global debug_string_out
    debug_string_out = funcH.print_and_add("Clustering(" + cluster_func_name + ")" + str(datetime.now()), debug_string_out)
    t = funcH.Timer()
    if experiment_names_and_folders is not None:
        file_name_cluster_obj = experiment_names_and_folders["file_name_cluster_obj"].replace("<bef_aft>", file_name_add)
        if os.path.isfile(file_name_cluster_obj):
            cluster_obj = funcH.load_dict_fr_file(file_name_cluster_obj)
        else:
            cluster_obj = Clusterer(cluster_model=cluster_func_name, n_clusters=n_clusters, max_try_cnt=1).fit(hle, post_analyze_distribution=True, verbose=1)
            try:
                funcH.print_fancy("Trying to dump " + file_name_cluster_obj, style="Bold", textColor="Black", backColor='White', end='\n')
                funcH.dump_dict_to_file(file_name_cluster_obj, cluster_obj)
            except Exception as e:
                print(str(e))
                funcH.print_fancy("Couldn't dump " + file_name_cluster_obj, style="Bold", textColor="Red", end='\n')
    else:
        cluster_obj = Clusterer(cluster_model=cluster_func_name, n_clusters=n_clusters, max_try_cnt=1).fit(hle, post_analyze_distribution=True, verbose=1)

    y_pred, kluster_centroids = cluster_obj.predict(hle, post_analyze_distribution=True, verbose=1)
    debug_string_out = funcH.print_and_add("Time to cluster: " + t.get_elapsed_time(), debug_string_out)
    return y_pred, kluster_centroids
예제 #5
0
def script_hgsk():
    global debug_string_out
    pretrain_epochs = [10]
    ml = "UMAP"
    ds = "hgsk_256_41"
    for cluster in ['KM', 'GMM']:
        for ae_epoc in pretrain_epochs:
            for clust_cnt in [512, 1024]: #  umap_dim = 20, n_clusters_ae = 20, umap_neighbors = 40
                for umap_neighbors in [20, 30, 40]:
                    try:
                        debug_string_out.clear()
                        main(["--dataset", ds, "--gpu", "0",
                              "--pretrain_epochs", str(ae_epoc),
                              "--n_clusters", str(clust_cnt), "--cluster", cluster,
                              "--umap_dim", str(clust_cnt), "--umap_neighbors", str(umap_neighbors),
                              "--manifold_learner", ml, "--umap_min_dist", "0.00"])
                    except Exception as e:
                        debug_string_out = funcH.print_and_add(ds + '_' + ml + " - problem", debug_string_out)
                        debug_string_out = funcH.print_and_add(str(e), debug_string_out)
                        exp_date_str = str(datetime.now().strftime("%Y%m%d_%H%M")).replace('-', '')  # %S
                        with open(os.path.join(funcH.getVariableByComputerName("n2d_experiments"), ds + '_' + ml + '_error_' + exp_date_str + '.txt'), 'w') as f:
                            f.write("\n".join(debug_string_out))
예제 #6
0
def script():
    global debug_string_out
    pretrain_epochs = [10, 50]
    manifold_learners_all = ["UMAP"]
    dataset_names_all = ["cifar10", "mnist", "pendigits", "fashion"]  # , "usps", "har"
    cluster_func = "HDBSCAN"
    for ds in dataset_names_all:
        for ml in manifold_learners_all:
            for ae_epoc in pretrain_epochs:
                for clust_cnt in [20]: #  umap_dim = 20, n_clusters_ae = 20, umap_neighbors = 40
                    try:
                        debug_string_out = []
                        main(["--dataset", ds, "--gpu", "0",
                              "--pretrain_epochs", str(ae_epoc),
                              "--n_clusters", str(clust_cnt), '--cluster', str(cluster_func),
                              "--umap_dim", str(clust_cnt), "--umap_neighbors", str(2*clust_cnt),
                              "--manifold_learner", ml, "--umap_min_dist", "0.00"])
                    except Exception as e:
                        debug_string_out = funcH.print_and_add(ds + '_' + ml + " - problem \n" + str(e), debug_string_out)
                        exp_date_str = str(datetime.now().strftime("%Y%m%d_%H%M")).replace('-', '')  # %S
                        with open(os.path.join(funcH.getVariableByComputerName("n2d_experiments"), ds + '_' + ml + '_error_' + exp_date_str + '.txt'), 'w') as f:
                            f.write("\n".join(debug_string_out))
예제 #7
0
def get_args(argv):
    global debug_string_out
    parser = argparse.ArgumentParser(
        description='(Not Too) Deep',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--dataset', default='mnist', )
    parser.add_argument('--ae_weights', default=None)
    parser.add_argument('--experiments_folder_base', default=funcH.getVariableByComputerName("n2d_experiments"))
    parser.add_argument("--mode", default='client')
    parser.add_argument("--port", default=52162)
    parser.add_argument('--gpu', default=0, )
    parser.add_argument('--n_clusters', default=10, type=int)
    parser.add_argument('--batch_size', default=256, type=int)
    parser.add_argument('--pretrain_epochs', default=1000, type=int)
    parser.add_argument('--umap_dim', default=2, type=int)
    parser.add_argument('--umap_neighbors', default=10, type=int)
    parser.add_argument('--umap_min_dist', default="0.00", type=str)
    parser.add_argument('--umap_metric', default='euclidean', type=str)
    parser.add_argument('--cluster', default='GMM', type=str)
    parser.add_argument('--manifold_learner', default='UMAP', type=str)
    parser.add_argument('--visualize', default=False, type=bool)
    parser.add_argument('--rerun_last_plots', default=False, type=bool)
    args = funcH._parse_args(parser, argv, print_args=True)
    debug_string_out = funcH.print_and_add('-' * 80)

    experiment_names_and_folders = {
        "exp_date_str": str(datetime.now().strftime("%Y%m%d_")).replace('-', ''),  # %M%S,
        "exp_base_str": "_".join([args.dataset, "c" + str(args.cluster)+ str(args.n_clusters), "e" + str(args.pretrain_epochs)]),
        "folder_umap_data": os.path.join(args.experiments_folder_base, "exported_manifolds"),
        "folder_ae_weights": os.path.join(args.experiments_folder_base, "weights"),
    }
    experiment_names_and_folders["exp_extended"] = experiment_names_and_folders["exp_base_str"] + "_" + "_".join([args.manifold_learner + "ud" + str(args.umap_dim), "un" + str(args.umap_neighbors)])
    experiment_names_and_folders["folder_experiment"] = os.path.join(args.experiments_folder_base, args.dataset,
                                           experiment_names_and_folders["exp_date_str"] + experiment_names_and_folders["exp_extended"])
    experiment_names_and_folders["file_name_ae_weights_base"] = "aew_" + "_".join([args.dataset, "c" + str(args.n_clusters), "e" + str(args.pretrain_epochs)])
    experiment_names_and_folders["file_name_ae_weights_full"] = os.path.join(experiment_names_and_folders["folder_ae_weights"], experiment_names_and_folders["file_name_ae_weights_base"] + '.npy')
    experiment_names_and_folders["file_name_umap_data_base"] = "ulp" + experiment_names_and_folders["exp_extended"]
    experiment_names_and_folders["file_name_umap_data_full"] = os.path.join(experiment_names_and_folders["folder_umap_data"], experiment_names_and_folders["file_name_umap_data_base"] + '.npy')
    experiment_names_and_folders["file_name_arguments_full"] = os.path.join(experiment_names_and_folders["folder_experiment"], 'args_' + experiment_names_and_folders["exp_extended"] + "_" + experiment_names_and_folders["exp_date_str"] + '.txt')
    experiment_names_and_folders["file_name_ae_params_text_full"] = os.path.join(experiment_names_and_folders["folder_experiment"], 'args_autoencode_' + experiment_names_and_folders["exp_extended"] + "_" + experiment_names_and_folders["exp_date_str"] + '.txt')
    experiment_names_and_folders["file_name_plot_fig_full"] = os.path.join(experiment_names_and_folders["folder_experiment"], 'plot_' + experiment_names_and_folders["exp_extended"] + "_" + experiment_names_and_folders["exp_date_str"] + '_<plot_id>.png')
    experiment_names_and_folders["file_name_plot_csv_full"] = os.path.join(experiment_names_and_folders["folder_experiment"], 'csv_' + experiment_names_and_folders["exp_extended"] + "_" + experiment_names_and_folders["exp_date_str"] + '.csv')
    experiment_names_and_folders["file_name_clusters_after_manifold_full"] = os.path.join(experiment_names_and_folders["folder_experiment"], 'clusters_after_manifold-' + experiment_names_and_folders["exp_extended"] + "_" + experiment_names_and_folders["exp_date_str"] + '.txt')
    experiment_names_and_folders["file_name_clusters_before_manifold_full"] = os.path.join(experiment_names_and_folders["folder_experiment"], 'clusters_before_manifold-' + experiment_names_and_folders["exp_extended"] + "_" + experiment_names_and_folders["exp_date_str"] + '.txt')
    experiment_names_and_folders["file_name_debug_string_out_full"] = os.path.join(experiment_names_and_folders["folder_experiment"], 'debug_string_out-' + experiment_names_and_folders["exp_extended"] + "_" + experiment_names_and_folders["exp_date_str"] + '.txt')
    experiment_names_and_folders["file_name_result_csv_file_full"] = os.path.join(args.experiments_folder_base, 'results.csv')
    experiment_names_and_folders["file_name_data_before_manifold"] = os.path.join(experiment_names_and_folders["folder_experiment"], 'data_' + experiment_names_and_folders["exp_extended"] + '_before.npz')
    experiment_names_and_folders["file_name_data_after_manifold"] = os.path.join(experiment_names_and_folders["folder_experiment"], 'data_' + experiment_names_and_folders["exp_extended"] + '_after.npz')
    experiment_names_and_folders["file_name_cluster_obj"] = os.path.join(experiment_names_and_folders["folder_experiment"], 'cluster_obj_' + experiment_names_and_folders["exp_extended"] + '_<bef_aft>.dictionary')
    experiment_names_and_folders["file_name_silhouette_results"] = os.path.join(experiment_names_and_folders["folder_experiment"], 'silhouette_results_' + experiment_names_and_folders["exp_extended"] + '_<bef_aft>.npy')
    experiment_names_and_folders["file_name_results"] = os.path.join(experiment_names_and_folders["folder_experiment"], 'results_' + experiment_names_and_folders["exp_extended"] + '.dictionary')

    args.experiment_names_and_folders = experiment_names_and_folders

    # 4 folders folder_{experiment, umap_data, ae_weights}
    funcH.createDirIfNotExist(experiment_names_and_folders["folder_experiment"])
    funcH.createDirIfNotExist(experiment_names_and_folders["folder_umap_data"])
    funcH.createDirIfNotExist(experiment_names_and_folders["folder_ae_weights"])

    with open(experiment_names_and_folders["file_name_arguments_full"], 'w') as f:
        f.write("\n".join(argv))
    return args
예제 #8
0
def cluster_manifold_in_embedding(hl, y, cluster_func_name, clusters_count, dataset_name, experiment_names_and_folders, label_names=None, optional_params=None):
    global debug_string_out
    debug_string_out = funcH.print_and_add('=' * 80, debug_string_out)

    umap_dim = funcH.get_attribute(optional_params, "umap_dim", default_type=int, default_val=clusters_count)
    manifold_learner = funcH.get_attribute(optional_params, "manifold_learner", default_type=str, default_val="UMAP")

    funcH.add_attribute(optional_params, "umap_min_dist", funcH.get_attribute(optional_params, "umap_min_dist", default_type=float, default_val=0.0))
    funcH.add_attribute(optional_params, "umap_metric", funcH.get_attribute(optional_params, "umap_metric", default_type=str, default_val='euclidean'))
    funcH.add_attribute(optional_params, "umap_neighbors", funcH.get_attribute(optional_params, "umap_neighbors", default_type=int, default_val=20))
    y_pred_hl, kluster_centroids_before = n_run_cluster(hl, n_clusters=clusters_count, cluster_func_name=cluster_func_name, experiment_names_and_folders=experiment_names_and_folders, file_name_add="-nm")
    y_pred_hl, acc_hl, nmi_hl, ari_hl, acc_hl_dg = n_eval_result(hl, y, y_pred_hl, label_names,
                                                                 cluster_func_name, clusters_count,
                                                                 dataset_name, definition_string="if no manifold stuff",
                                                                 pngnameadd='-nm',
                                                                 experiment_names_and_folders=experiment_names_and_folders,
                                                                 optional_params=None, visualize=False)
    try:
        funcH.print_fancy("Trying to save " + experiment_names_and_folders["file_name_data_before_manifold"], style="Bold", textColor="Black", backColor='White', end='\n')
        np.savez(experiment_names_and_folders["file_name_data_before_manifold"], featVec=hl, labels=y, preds=y_pred_hl, label_names=label_names, acc=acc_hl_dg)
    except Exception as e:
        print(str(e))
        funcH.print_fancy("Couldn't save " + experiment_names_and_folders["file_name_data_before_manifold"], style="Bold", textColor="Red", end='\n')
    debug_string_out = funcH.print_and_add('-' * 40, debug_string_out)

    file_name_silhouette_results = experiment_names_and_folders["file_name_silhouette_results"].replace("<bef_aft>", "before")
    if os.path.isfile(file_name_silhouette_results):
        print("loading silhouette_values from ", file_name_silhouette_results)
        silhouette_values_hl = np.load(file_name_silhouette_results, allow_pickle=True)
    else:
        _, silhouette_values_hl = funcH.calc_silhouette_params(hl, y_pred_hl)
        print("saving silhouette_values to ", file_name_silhouette_results)
        try:
            funcH.print_fancy("Trying to save " + file_name_silhouette_results, style="Bold", textColor="Black", backColor='White', end='\n')
            np.save(file_name_silhouette_results, silhouette_values_hl, allow_pickle=True)
        except Exception as e:
            print(str(e))
            funcH.print_fancy("Couldn't save " + file_name_silhouette_results, style="Bold", textColor="Red", end='\n')

    # find manifold on autoencoded embedding
    hle = n_learn_manifold(hl, embedding_dim=umap_dim, manifold_learner=manifold_learner,
                           manifold_out_file_name=experiment_names_and_folders["file_name_umap_data_full"],
                           optional_params=optional_params)
    # clustering on new manifold of autoencoded embedding
    y_pred_hle, kluster_centroids_after = n_run_cluster(hle, n_clusters=clusters_count, cluster_func_name=cluster_func_name, experiment_names_and_folders=experiment_names_and_folders, file_name_add="-hle")
    y_pred_hle, acc, nmi, ari, acc_dg = \
        n_eval_result(hle, y, y_pred_hle, label_names, cluster_func_name, clusters_count,
                      dataset_name, definition_string="after manifold stuff",
                      pngnameadd='-hle', experiment_names_and_folders=experiment_names_and_folders,
                      optional_params=None, visualize=False)
    saveToFileName = experiment_names_and_folders["file_name_data_after_manifold"]
    try:
        funcH.print_fancy("Trying to save " + saveToFileName, style="Bold", textColor="Black",
                          backColor='White', end='\n')
        np.savez(saveToFileName, featVec=hle, labels=y, preds=y_pred_hle, label_names=label_names, acc=acc_dg)
    except Exception as e:
        print(str(e))
        funcH.print_fancy("Couldn't save " + saveToFileName, style="Bold", textColor="Red", end='\n')

    debug_string_out = funcH.print_and_add('=' * 80, debug_string_out)

    file_name_silhouette_results = experiment_names_and_folders["file_name_silhouette_results"].replace("<bef_aft>", "after")
    if os.path.isfile(file_name_silhouette_results):
        print("loading silhouette_values from ", file_name_silhouette_results)
        silhouette_values_hle = np.load(file_name_silhouette_results, allow_pickle=True)
    else:
        _, silhouette_values_hle = funcH.calc_silhouette_params(hle, y_pred_hle)
        print("saving silhouette_values to ", file_name_silhouette_results)
        try:
            funcH.print_fancy("Trying to save " + file_name_silhouette_results, style="Bold", textColor="Black",
                              backColor='White', end='\n')
            np.save(file_name_silhouette_results, silhouette_values_hle, allow_pickle=True)
        except Exception as e:
            print(str(e))
            funcH.print_fancy("Couldn't save " + file_name_silhouette_results, style="Bold", textColor="Red", end='\n')



    results_dict = {
        "acc_before_manifold": acc_hl,
        "acc_before_manifold_dg": acc_hl_dg,
        "acc_after_manifold_dg": acc_dg,
        "acc_after_manifold": acc,
        "nmi_before_manifold": nmi_hl,
        "nmi_after_manifold": nmi,
        "ari_before_manifold": ari_hl,
        "ari_after_manifold": ari,
        "pred_before_manifold": y_pred_hl,
        "pred_after_manifold": y_pred_hle,
        "silhouette_values_before": silhouette_values_hl,
        "silhouette_values_after": silhouette_values_hle,
        "kluster_centroids_before": kluster_centroids_before,
        "kluster_centroids_after": kluster_centroids_after,
    }
    return results_dict
예제 #9
0
 def print_and_remember(self, print_str):
     self.debug_string_out = funcH.print_and_add(print_str,
                                                 self.debug_string_out)