Exemplo n.º 1
0
def main():
    """Train a recurrent VaDER model on the data from ``read_data`` and print its clusters.

    The checkpoint path handed to the model is
    ``../output/try2_exactly_7_times/vader.ckpt``; the containing directory is
    created first, together with any missing parent directories.
    """
    output_path = Path('../output/try2_exactly_7_times')
    # parents=True so a missing '../output' is created as well instead of
    # raising FileNotFoundError on a fresh checkout.
    output_path.mkdir(parents=True, exist_ok=True)
    save_path = output_path / 'vader.ckpt'

    # Alternative data source kept for reference: read_premade(DAYS_ORDERED)
    w_train, x_train, _names = read_data()
    # Standardise with the global mean/std taken over every entry of x_train.
    x_train = (x_train - np.mean(x_train)) / np.std(x_train)

    vader = VADER(x_train=x_train,
                  w_train=w_train,
                  save_path=save_path,
                  n_hidden=[128, 32],
                  k=5,
                  learning_rate=1e-3,
                  output_activation=None,
                  recurrent=True,
                  batch_size=8,
                  alpha=0.1)
    # pre-train without latent loss
    vader.pre_fit(n_epoch=20, verbose=True)
    # train with latent loss
    vader.fit(n_epoch=100, verbose=True)
    # The cluster assignments and reconstructions are not used further here;
    # the calls are kept so the run still exercises both code paths.
    vader.cluster(x_train, w_train)
    vader.predict(x_train)

    print(vader.get_clusters_on_x())
Exemplo n.º 2
0
    def test_vader_recur(self):
        """Smoke-test the recurrent model: train briefly, then check cluster,
        reconstruction and loss outputs for basic sanity."""
        inputs, weights, labels = generate_x_w_y(7, 400)
        # The labels only serve to monitor performance against a ground-truth
        # clustering; they may be left out when none is available.
        model_config = dict(
            X_train=inputs,
            W_train=weights,
            y_train=labels,
            save_path=None,
            n_hidden=[12, 2],
            k=4,
            learning_rate=1e-3,
            output_activation=None,
            recurrent=True,
            batch_size=16,
        )
        vader = VADER(**model_config)

        # Phase 1: pre-training without the latent loss.
        vader.pre_fit(n_epoch=10, verbose=True)
        # Phase 2: training with the latent loss.
        vader.fit(n_epoch=10, verbose=True)

        # Cluster assignments: one per sample, not all falsy.
        assigned = vader.cluster(inputs)
        assert any(assigned)
        assert len(assigned) == len(inputs)

        # Reconstructions keep the input shape.
        reconstructed = vader.predict(inputs)
        assert prediction_shape_matches(reconstructed, inputs) if False else reconstructed.shape == inputs.shape

        # Loss of the trained network: both components present and non-negative.
        losses = vader.get_loss(inputs)
        assert losses
        loss_keys = ("reconstruction_loss", "latent_loss")
        for key in loss_keys:
            assert key in losses
        for key in loss_keys:
            assert losses[key] >= 0
Exemplo n.º 3
0
def test1():
    """Dump the prepared training arrays to pickle files, then train a
    recurrent VaDER model and run clustering and reconstruction on them.

    Writes ``x_train.pickle``, ``y_train.pickle`` and ``w_train.pickle`` into
    the current working directory.
    """
    save_path = os.path.join('test_vader', 'vader.ckpt')
    x_train, y_train, w_train = prepare_data()

    # Use context managers so every file handle is flushed and closed
    # deterministically instead of being left to the garbage collector.
    dumps = (
        ("x_train.pickle", x_train),
        ("y_train.pickle", y_train),
        ("w_train.pickle", w_train),
    )
    for file_name, payload in dumps:
        with open(file_name, "wb") as handle:
            pickle.dump(payload, handle)

    # Note: y_train is used purely for monitoring performance when a ground truth clustering is available.
    # It can be omitted if no ground truth is available.
    vader = VADER(x_train=x_train,
                  w_train=w_train,
                  y_train=y_train,
                  save_path=save_path,
                  n_hidden=[12, 2],
                  k=4,
                  learning_rate=1e-3,
                  output_activation=None,
                  recurrent=True,
                  batch_size=16)
    # pre-train without latent loss
    vader.pre_fit(n_epoch=50, verbose=True)
    # train with latent loss
    vader.fit(n_epoch=50, verbose=True)
    # Clusters and reconstructions are computed only to exercise the calls;
    # their results are not inspected here.
    vader.cluster(x_train)
    vader.predict(x_train)
Exemplo n.º 4
0
    def test_vader_save_load(self):
        """Check that a model restored via ``VADER.load_model`` produces the
        same clustering as the model that was trained with that save path.

        The ``test_vader_save_load`` directory is removed before the run and
        again afterwards, including when training or loading raises.
        """
        save_path = "test_vader_save_load"
        # Start from a clean slate so leftovers of an earlier run cannot
        # influence this one.
        if os.path.exists(save_path):
            shutil.rmtree(save_path)

        try:
            X_train, W_train, y_train = generate_x_w_y(7, 400)
            # noinspection PyTypeChecker
            vader = VADER(X_train=X_train,
                          W_train=W_train,
                          y_train=y_train,
                          save_path=save_path,
                          n_hidden=[12, 2],
                          k=4,
                          learning_rate=1e-3,
                          output_activation=None,
                          recurrent=True,
                          batch_size=16)
            vader.pre_fit(n_epoch=10, verbose=True)
            vader.fit(n_epoch=10, verbose=True)
            clustering_before_loading = vader.cluster(X_train)

            loaded_vader = VADER.load_model(save_path, X_train, W_train, y_train)
            clustering_after_loading = loaded_vader.cluster(X_train)
        finally:
            # Always remove the on-disk artefacts, even if a step above failed.
            if os.path.exists(save_path):
                shutil.rmtree(save_path)

        assert list(clustering_before_loading) == list(
            clustering_after_loading)
Exemplo n.º 5
0
    def _fit_vader(self, X_train: ndarray,
                   W_train: Optional[ndarray]) -> VADER:
        """Build a recurrent VADER model from ``self.params_dict`` and train it.

        The model is pre-fitted for 10 epochs and then fitted for
        ``self.n_epoch`` epochs using this object's early-stopping settings.
        Nothing is saved to disk (``save_path=None``).

        :param X_train: training data passed to the model as ``X_train``
        :param W_train: optional weights passed to the model as ``W_train``
        :return: the trained model
        """
        hyperparameters = self.params_dict
        # Look the values up in a fixed order and unpack them in one go.
        wanted = ("k", "n_hidden", "learning_rate", "batch_size", "alpha")
        n_clusters, layer_sizes, step_size, minibatch, alpha_value = (
            hyperparameters[name] for name in wanted)

        model = VADER(
            X_train=X_train,
            W_train=W_train,
            save_path=None,
            n_hidden=layer_sizes,
            k=n_clusters,
            seed=self.seed,
            learning_rate=step_size,
            recurrent=True,
            batch_size=minibatch,
            alpha=alpha_value,
        )

        model.pre_fit(n_epoch=10, verbose=False)
        model.fit(
            n_epoch=self.n_epoch,
            verbose=False,
            early_stopping_ratio=self.early_stopping_ratio,
            early_stopping_batch_size=self.early_stopping_batch_size,
        )
        return model
Exemplo n.º 6
0
 def test_vader_nonrecur(self):
     """Smoke-test the non-recurrent model (ordinary VAE with GM prior):
     train briefly, then check clustering, reconstruction, loss and
     sample generation outputs."""
     n_time_points = 7
     inputs, labels = generate_x_y_for_nonrecur(n_time_points, 400)
     model_config = dict(
         X_train=inputs,
         y_train=labels,
         n_hidden=[12, 2],
         k=2,
         learning_rate=1e-3,
         output_activation=None,
         recurrent=False,
         batch_size=16,
     )
     vader = VADER(**model_config)

     # Phase 1: pre-training without the latent loss.
     vader.pre_fit(n_epoch=10, verbose=True)
     # Phase 2: training with the latent loss.
     vader.fit(n_epoch=10, verbose=True)

     # Cluster assignments: one per sample, not all falsy.
     assigned = vader.cluster(inputs)
     assert any(assigned)
     assert len(assigned) == len(inputs)

     # Reconstructions keep the input shape.
     reconstructed = vader.predict(inputs)
     assert reconstructed.shape == inputs.shape

     # Loss of the trained network: both components present and non-negative.
     losses = vader.get_loss(inputs)
     assert losses
     loss_keys = ("reconstruction_loss", "latent_loss")
     for key in loss_keys:
         assert key in losses
     for key in loss_keys:
         assert losses[key] >= 0

     # Generated samples: one cluster label per sample, one row per sample.
     n_generated = 10
     generated = vader.generate(n_generated)
     assert generated
     for key in ("clusters", "samples"):
         assert key in generated
     assert len(generated["clusters"]) == n_generated
     expected_shape = (n_generated, n_time_points)
     assert generated["samples"].shape == expected_shape
Exemplo n.º 7
0
 # Consensus mode: train `args.n_consensus` separate models, append each
 # model's loss-history figure to one PDF and cluster the input with each.
 # (Fragment: `i`, `input_data`, `input_weights`, `n_hidden` and
 # `loss_history_file_path` are defined outside the visible lines.)
 if args.n_consensus and args.n_consensus > 1:
     loss_history_pdf = matplotlib.backends.backend_pdf.PdfPages(
         loss_history_file_path)
     # Per-repeat accumulators; they are not appended to within the visible
     # lines -- presumably filled further down, verify in the full file.
     y_pred_repeats = []
     effective_k_repeats = []
     train_reconstruction_loss_repeats = []
     train_latent_loss_repeats = []
     for j in range(args.n_consensus):
         # NOTE(review): `seed` is derived per repeat here, but the constructor
         # below is given `seed=args.seed` and `seed` is not read anywhere in
         # the visible lines -- confirm which value is intended.
         seed = f"{args.seed}{i}{j}" if args.seed else None
         # noinspection PyTypeChecker
         vader = VADER(X_train=input_data,
                       W_train=input_weights,
                       k=args.k,
                       n_hidden=n_hidden,
                       learning_rate=args.learning_rate,
                       batch_size=args.batch_size,
                       alpha=args.alpha,
                       seed=args.seed,
                       save_path=args.save_path,
                       output_activation=None,
                       recurrent=True)
         # Fixed 10-epoch pre-fit, then the main fit with the CLI-provided
         # epoch count and early-stopping settings.
         vader.pre_fit(n_epoch=10, verbose=False)
         vader.fit(n_epoch=args.n_epoch,
                   verbose=False,
                   early_stopping_ratio=args.early_stopping_ratio,
                   early_stopping_batch_size=args.early_stopping_batch_size)
         # One page per model in the loss-history PDF.
         fig = plot_loss_history(vader, model_name=f"Model #{j}")
         loss_history_pdf.savefig(fig)
         # noinspection PyTypeChecker
         clustering = vader.cluster(input_data, input_weights)
         # Number of distinct cluster labels that actually occur.
         effective_k = len(Counter(clustering))
Exemplo n.º 8
0
    # Import the user-supplied data reader script as a module named
    # "data_reader" and instantiate its `DataReader` class.
    data_reader_spec = importlib.util.spec_from_file_location(
        "data_reader", args.data_reader_script)
    data_reader_module = importlib.util.module_from_spec(data_reader_spec)
    data_reader_spec.loader.exec_module(data_reader_module)
    data_reader = data_reader_module.DataReader()

    # Read the input tensor and derive the weights tensor from it.
    # NOTE(review): presumably the weights mark which entries are missing --
    # confirm against generate_wtensor_from_xtensor.
    x_tensor = data_reader.read_data(args.input_data_file)
    w_tensor = generate_wtensor_from_xtensor(x_tensor)
    # np.nan_to_num replaces NaN entries with 0.0 by default.
    input_data = np.nan_to_num(x_tensor)
    input_weights = w_tensor
    # Metadata exposed by the reader; not used within the visible lines
    # except for `ids_list`, which indexes the clustering output below.
    features = data_reader.features
    time_points = data_reader.time_points
    x_label = data_reader.time_point_meaning
    ids_list = data_reader.ids_list

    # Restore a previously saved model and build output file names that
    # encode its hyperparameters plus the CLI seed.
    vader = VADER.load_model(args.load_path, input_data, input_weights)
    n_hidden = [str(layer_size) for layer_size in vader.n_hidden]
    report_suffix = f"k{str(vader.K)}" \
                    f"_n_hidden{'_'.join(n_hidden)}" \
                    f"_learning_rate{str(vader.learning_rate)}" \
                    f"_batch_size{str(vader.batch_size)}" \
                    f"_n_epoch{str(vader.n_epoch)}" \
                    f"_seed{str(args.seed)}"
    plot_file_path = os.path.join(
        args.output_path, f"z_scores_trajectories_{report_suffix}.pdf")
    clustering_file_path = os.path.join(args.output_path,
                                        f"clustering_{report_suffix}.csv")
    # Cluster the input and write one int64 label per id to the CSV file.
    clustering = vader.cluster(input_data, input_weights)
    pd.Series(list(clustering), index=ids_list, dtype=np.int64,
              name='Cluster').to_csv(clustering_file_path)
Exemplo n.º 9
0
# Randomly mark values as missing or present with equal probability
# (0: missing, 1: present), i.e. roughly 50% of the entries end up missing.
# Note: All X_train[i,j] for which W_train[i,j] == 0 are treated as missing (i.e. their specific value is ignored)
W_train = np.random.choice(2, X_train.shape)

import pickle
# Persist the arrays to the working directory. Each file is opened in a
# `with` block so its handle is flushed and closed right after the dump
# instead of being left open until garbage collection.
with open("X_train.pickle", "wb") as fh:
    pickle.dump(X_train, fh)
with open("y_train.pickle", "wb") as fh:
    pickle.dump(y_train, fh)
with open("W_train.pickle", "wb") as fh:
    pickle.dump(W_train, fh)

# Note: y_train is used purely for monitoring performance when a ground truth clustering is available.
# It can be omitted if no ground truth is available.
vader = VADER(X_train=X_train,
              W_train=W_train,
              y_train=y_train,
              save_path=save_path,
              n_hidden=[12, 2],
              k=4,
              learning_rate=1e-3,
              output_activation=None,
              recurrent=True,
              batch_size=16)

# pre-train without latent loss
vader.pre_fit(n_epoch=50, verbose=True)
# train with latent loss
vader.fit(n_epoch=50, verbose=True)
# get the clusters
c = vader.cluster(X_train)
# get the re-constructions
p = vader.predict(X_train)
# compute the loss given the network
l = vader.get_loss(X_train)
Exemplo n.º 10
0
def test2():
    """Run the non-recurrent model end to end: pre-fit, fit, cluster,
    reconstruct, compute the loss, generate samples, compute the loss again.

    Results are not checked; the function only exercises the calls.
    """
    inputs, labels = get_dete_for_seconed_test()
    model_config = dict(
        x_train=inputs,
        y_train=labels,
        n_hidden=[12, 2],
        k=2,
        learning_rate=1e-3,
        output_activation=None,
        recurrent=False,
        batch_size=16,
    )
    vader = VADER(**model_config)

    # Phase 1: pre-training without the latent loss.
    vader.pre_fit(n_epoch=50, verbose=True)
    # Phase 2: training with the latent loss.
    vader.fit(n_epoch=50, verbose=True)

    # Cluster assignments for the training inputs.
    assignments = vader.cluster(inputs)
    # Reconstructions of the training inputs.
    reconstructions = vader.predict(inputs)
    # Loss of the trained network on the training inputs.
    loss = vader.get_loss(inputs)
    # Draw ten generated samples.
    generated = vader.generate(10)
    # Loss once more, after generation.
    loss = vader.get_loss(inputs)
Exemplo n.º 11
0
    def test_vader_transfer_learning(self):
        """Train on one generated dataset, swap in a second one through
        ``set_inputs``, train again, then sanity-check the outputs on the
        second dataset."""
        first_x, first_w, first_y = generate_x_w_y(7, 400)
        model_config = dict(
            X_train=first_x,
            W_train=first_w,
            y_train=first_y,
            save_path=None,
            n_hidden=[12, 2],
            k=4,
            learning_rate=1e-3,
            output_activation=None,
            recurrent=True,
            batch_size=16,
        )
        vader = VADER(**model_config)

        # Round 1: pre-train without the latent loss, then train with it.
        vader.pre_fit(n_epoch=10, verbose=True)
        vader.fit(n_epoch=10, verbose=True)

        # Round 2: same two phases on a freshly generated dataset.
        second_x, second_w, second_y = generate_x_w_y(7, 400)
        vader.set_inputs(second_x, second_w, second_y)
        vader.pre_fit(n_epoch=10, verbose=True)
        vader.fit(n_epoch=10, verbose=True)

        # Cluster assignments: one per sample, not all falsy.
        assigned = vader.cluster(second_x)
        assert any(assigned)
        assert len(assigned) == len(second_x)

        # Reconstructions keep the input shape.
        reconstructed = vader.predict(second_x)
        assert reconstructed.shape == second_x.shape

        # Loss of the trained network: both components present and non-negative.
        losses = vader.get_loss(second_x)
        assert losses
        loss_keys = ("reconstruction_loss", "latent_loss")
        for key in loss_keys:
            assert key in losses
        for key in loss_keys:
            assert losses[key] >= 0
Exemplo n.º 12
0
    def test_vader_save_load_transfer_learning(self):
        """Train a model with a save path, load its weights into a fresh
        model built on a second dataset, continue training, and sanity-check
        the outputs on that second dataset.

        The ``test_vader_save_load_transfer_learning`` folder is removed
        before the run and again afterwards, including when a training or
        loading step raises.
        """
        save_folder = "test_vader_save_load_transfer_learning"
        # os.path.join avoids the doubled separator of a hand-built path.
        save_path = os.path.join(save_folder, "weights")

        # Start from a clean slate so leftovers of an earlier run cannot
        # influence this one.
        if os.path.exists(save_folder):
            shutil.rmtree(save_folder)

        try:
            X_train, W_train, y_train = generate_x_w_y(7, 400)
            # noinspection PyTypeChecker
            vader = VADER(X_train=X_train,
                          W_train=W_train,
                          y_train=y_train,
                          save_path=save_path,
                          n_hidden=[12, 2],
                          k=4,
                          learning_rate=1e-3,
                          output_activation=None,
                          recurrent=True,
                          batch_size=16)
            vader.pre_fit(n_epoch=10, verbose=True)
            vader.fit(n_epoch=10, verbose=True)
            # Exercise clustering on the source model; the result is not
            # compared against anything.
            vader.cluster(X_train)

            X_train_ft, W_train_ft, y_train_ft = generate_x_w_y(7, 400)
            # noinspection PyTypeChecker
            vader = VADER(X_train=X_train_ft,
                          W_train=W_train_ft,
                          y_train=y_train_ft,
                          save_path=None,
                          n_hidden=[12, 2],
                          k=4,
                          learning_rate=1e-3,
                          output_activation=None,
                          recurrent=True,
                          batch_size=16)
            vader.load_weights(save_path)
            vader.pre_fit(n_epoch=10, verbose=True)
            vader.fit(n_epoch=10, verbose=True)
            # get the clusters
            clustering = vader.cluster(X_train_ft)
        finally:
            # Always remove the on-disk artefacts, even if a step above failed.
            if os.path.exists(save_folder):
                shutil.rmtree(save_folder)

        assert any(clustering)
        assert len(clustering) == len(X_train_ft)
        # get the re-constructions
        prediction = vader.predict(X_train_ft)
        assert prediction.shape == X_train_ft.shape
        # compute the loss given the network
        loss = vader.get_loss(X_train_ft)
        assert loss
        assert "reconstruction_loss" in loss
        assert "latent_loss" in loss
        assert loss["reconstruction_loss"] >= 0
        assert loss["latent_loss"] >= 0