Example No. 1
    def set_dgm_layers(self, input_shape, num_classes, is_hebb_layers=False, is_clamp=False, extra_class=False):
        import numpy as np
        self.input_shape = input_shape
        self.num_classes = num_classes
        self.input_size = np.prod(input_shape)
        self.set_vae_layers()
        self.encoder = Encoder(input_size=self.input_size, h_dim=self.h_dims, z_dim=self.z_dim,
                               num_classes=self.num_classes, y_dim=self.num_classes)
        self.decoder = Decoder(self.z_dim, list(reversed(self.h_dims)), self.input_size, num_classes=self.num_classes)

        hs = [self.h_dims[0] for _ in range(self.n_h)]
        if self.indices_names is None:
            self.indices_names = list(range(self.input_size))
        # An extra class was already added upstream; passing extra_class=True here would add a second one
        self.classifier = MLP(self.input_size, self.input_shape, self.indices_names, hs, self.num_classes,
                              dropout=self.dropout, is_hebb_layers=is_hebb_layers, is_clamp=is_clamp, gt_input=self.gt_input,
                              extra_class=extra_class)

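        # Reinitialize every linear layer with He (Kaiming) initialization,
        # which suits the ReLU activations used throughout these models.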
        for m in self.modules():
            if isinstance(m, nn.Linear):
                init.kaiming_normal_(m.weight.data)
                if m.bias is not None:
                    m.bias.data.zero_()
Example No. 2
def __main__():
    import numpy as np
    import pandas as pd
    from numpy import genfromtxt
    from models.discriminative.artificial_neural_networks.MultiLayerPerceptron import MLP

    local_folder = "./data/kaggle_dessins/"
    train_images_fname = "train_images.npy"
    train_labels_fname = "train_labels.csv"
    home_path = "/home/simon/"
    destination_folder = "annleukemia"
    data_folder = "data"
    results_folder = "results"
    extra_class = True  # TODO: make this a count; worth testing whether more than one extra class is desirable
    meta_destination_folder = "pandas_meta_df"
    plots_folder_path = "/".join([home_path, destination_folder, results_folder, "plots/"])

    dataset_name = "dessins"
    activation = "relu"
    early_stopping = 200
    n_epochs = 1000
    gt_input = 0
    use_conv = False  # Only applicable to sequence-like data (images, videos, sentences, DNA, ...)

    lr = 1e-5
    l1 = 1e-5
    l2 = 1e-10
    dropout = 0.5
    batch_size = 16
    is_pruning = False
    # mc = 1
    # iw = 1

    # Neurons layers
    h_dims = [1024, 1024, 1024]

    from utils.utils import adapt_datasets
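    # Each row of the .npy archives presumably holds (id, flattened image);
    # np.vstack stacks the image column into a 2-D (n_samples, n_pixels) array.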
    # allow_pickle=True is required to load object arrays with NumPy >= 1.16.3
    train_arrays = np.load(local_folder + train_images_fname, encoding="latin1", allow_pickle=True)
    train_dataset = np.vstack(train_arrays[:, 1])
    train_labels = genfromtxt(local_folder + train_labels_fname, delimiter=",", dtype=str, skip_header=1)[:, 1]
    test_dataset = np.vstack(np.load(local_folder + "test_images.npy", encoding="latin1", allow_pickle=True)[:, 1])

    # Transpose so features are rows and samples are columns, matching the
    # features-by-samples layout assumed below (input_size = meta_df.shape[0])
    meta_df = pd.DataFrame(train_dataset.T, columns=train_labels)
    img_shape = [1, 100, 100]
    labels = set(list(meta_df.columns))

    mlp = MLP(input_size=meta_df.shape[0], input_shape=(meta_df.shape[0],),
              indices_names=list(range(meta_df.shape[0])), num_classes=len(labels),
              h_dims=h_dims, extra_class=extra_class, l1=l1, l2=l2, batch_norm=True)

    mlp.labels = labels
    mlp.labels_set = list(set(labels))

    mlp.set_configs(home_path=home_path, results_folder=results_folder, data_folder=data_folder,
                    destination_folder=destination_folder, dataset_name=dataset_name, lr=lr,
                    meta_destination_folder="meta_pandas_dataframes", csv_filename="csv_loggers", is_unlabelled=False)
    print("Labeled data shape (35371, 624)", meta_df.shape)
    if meta_df is not None:
        mlp.import_dataframe(meta_df, batch_size, labelled=True)

    train_total_loss_histories = [[] for x in range(10)]
    train_accuracy_histories = [[] for x in range(10)]
    valid_total_loss_histories = [[] for x in range(10)]
    valid_accuracy_histories = [[] for x in range(10)]
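    # Train over 10 random train/valid splits: set_data() redraws the split and
    # glorot_init() resets the weights before each run. The history lists above
    # are left as placeholders for accumulating per-split metrics.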
    for i in range(10):
        print("Random train/valid split", i)
        mlp.set_data(labels_per_class=-1, is_example=False, extra_class=extra_class)
        mlp.glorot_init()
        mlp.run(n_epochs, verbose=3, show_progress=10, hist_epoch=20, is_balanced_relu=False, all0=False)
Example No. 3
def __main__():
    from models.discriminative.artificial_neural_networks.MultiLayerPerceptron import MLP
    home_path = "/home/simon/"
    destination_folder = ""
    data_folder = "data"
    results_folder = "results"
    meta_destination_folder = "pandas_meta_df"
    plots_folder_path = "/".join(
        [home_path, destination_folder, results_folder, "plots/"])

    dataset_name = "mnist_dropout"
    activation = "relu"
    early_stopping = 200
    n_epochs = 1000
    gt_input = -1e6
    gt = -1e6
    use_conv = False  # Only applicable to sequence-like data (images, videos, sentences, DNA, ...)

    lr = 1e-3
    l1 = 0
    l2 = 0
    dropout = 0.5
    batch_size = 64
    is_pruning = True
    # mc = 1
    # iw = 1

    # Neurons layers
    h_dims = [128, 128]

    mlp = MLP(input_size=784,
              input_shape=(1, 28, 28),
              indices_names=list(range(784)),
              num_classes=10,
              h_dims=h_dims,
              extra_class=True,
              l1=l1,
              l2=l2,
              gt_input=gt_input,
              is_pruning=is_pruning,
              dropout=dropout,
              destination_folder=home_path + "/" + destination_folder,
              gt=gt)

    mlp.set_configs(home_path=home_path,
                    results_folder=results_folder,
                    data_folder=data_folder,
                    destination_folder=destination_folder,
                    dataset_name=dataset_name,
                    lr=lr,
                    meta_destination_folder="meta_pandas_dataframes",
                    csv_filename="csv_loggers")

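    # Load MNIST with the usual normalization constants (the mu/var arguments
    # receive the standard MNIST mean 0.1307 and std 0.3081);
    # labels_per_class=-1 presumably keeps every labelled example.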
    mlp.load_example_dataset(dataset="mnist",
                             batch_size=batch_size,
                             extra_class=True,
                             unlabelled_train_ds=False,
                             normalize=True,
                             mu=0.1307,
                             var=0.3081,
                             labels_per_class=-1,
                             unlabelled_samples=False)

    mlp.set_data(labels_per_class=-1,
                 is_example=True,
                 extra_class=True,
                 ignore_training_inputs=3)

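    # Move the model to the GPU; start_pruning=3 presumably delays pruning
    # until epoch 3 (is_pruning=True was set at construction).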
    mlp.cuda()
    # dgm.vae.generate_random(False, batch_size, z1_size, [1, 28, 28])
    mlp.run(n_epochs, start_pruning=3)
Example No. 4
def __main__():
    import os

    os.chdir("..")  # return to the project root

    from models.discriminative.artificial_neural_networks.MultiLayerPerceptron import MLP
    from data_preparation.GeoParser import GeoParser
    geo_ids = ["GSE33000"]
    unlabelled_geo_ids = ["GSE33000"]
    load_from_disk = True
    load_merge = False
    home_path = "/home/simon/"
    destination_folder = "annleukemia"
    data_folder = "data"
    results_folder = "results"
    translate = "f"
    extra_class = True  # TODO: make this a count; worth testing whether more than one extra class is desirable
    meta_destination_folder = "pandas_meta_df"
    plots_folder_path = "/".join(
        [home_path, destination_folder, results_folder, "plots/"])

    dataset_name = "gse33000"
    activation = "relu"
    early_stopping = 200
    n_epochs = 1000
    gt_input = 0
    use_conv = False  # Only applicable to sequence-like data (images, videos, sentences, DNA, ...)

    lr = 1e-3
    l1 = 0.
    l2 = 0.
    dropout = 0.5
    batch_size = 32
    is_pruning = False
    # mc = 1
    # iw = 1

    # Neurons layers
    h_dims = [128, 128]

    from utils.utils import adapt_datasets
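    # Download the GEO series (or reuse the on-disk cache) and merge them into
    # a single labelled features-by-samples DataFrame.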
    g = GeoParser(home_path=home_path, geo_ids=geo_ids)
    g.get_geo(load_from_disk=load_from_disk, automatic_attribute_list=None)
    meta_df = g.merge_datasets(load_from_disk=load_merge, labelled=True)

    labels = set(list(meta_df.columns))

    mlp = MLP(input_size=meta_df.shape[0],
              input_shape=(meta_df.shape[0],),
              indices_names=list(range(meta_df.shape[0])),
              num_classes=len(labels),
              h_dims=h_dims,
              extra_class=extra_class,
              l1=l1,
              l2=l2,
              batch_norm=True)

    mlp.labels = labels
    mlp.labels_set = list(set(labels))

    mlp.set_configs(home_path=home_path,
                    results_folder=results_folder,
                    data_folder=data_folder,
                    destination_folder=destination_folder,
                    dataset_name=dataset_name,
                    lr=lr,
                    meta_destination_folder="meta_pandas_dataframes",
                    csv_filename="csv_loggers",
                    is_unlabelled=False)
    print("Labeled data shape (35371, 624)", meta_df.shape)
    if meta_df is not None:
        mlp.import_dataframe(meta_df, batch_size, labelled=True)

    train_total_loss_histories = [[] for x in range(10)]
    train_accuracy_histories = [[] for x in range(10)]
    valid_total_loss_histories = [[] for x in range(10)]
    valid_accuracy_histories = [[] for x in range(10)]
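    # As in the previous example, train over 10 random train/valid splits,
    # reinitializing the weights each time.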
    for i in range(10):
        print("Random train/valid split", i)
        mlp.set_data(labels_per_class=-1,
                     is_example=False,
                     extra_class=extra_class)
        mlp.glorot_init()
        mlp.run(n_epochs,
                verbose=2,
                show_progress=10,
                hist_epoch=20,
                is_balanced_relu=False,
                all0=False)
Example No. 5
def main():
    from data_preparation.GeoParser import GeoParser
    from models.discriminative.artificial_neural_networks.MultiLayerPerceptron import MLP
    load_from_disk = True
    load_merge = False

    geo_ids = ["GSE33000"]
    # files_destinations
    home_path = "/home/simon/"
    destination_folder = "annleukemia"
    data_folder = "data"
    results_folder = "results"
    meta_destination_folder = "pandas_meta_df"

    plots_folder_path = "/".join(
        [home_path, destination_folder, results_folder, "plots/"])
    dataset_name = "gse33000_no_huntington"
    activation = "relu"
    # nrep = 3
    early_stopping = 200
    n_epochs = 1000
    gt_input = 0
    extra_class = False
    dataset_name = dataset_name + "_extra_class_" + str(extra_class)
    # If ladder is enabled, a ladder VAE is built. Do not combine it with the
    # auxiliary model (yet): it might be feasible and fairly easy to implement,
    # but could be overkill, though it would be interesting to try.
    translate = "n"

    use_conv = False  # Only applicable to sequence-like data (images, videos, sentences, DNA, ...)
    lr = 1e-4
    l1 = 0.
    l2 = 0.
    batch_size = 32
    # mc = 1
    # iw = 1

    # Neurons layers
    h_dims = [128, 128]
    from utils.utils import adapt_datasets
    g = GeoParser(home_path=home_path, geo_ids=geo_ids)
    g.get_geo(load_from_disk=load_from_disk, automatic_attribute_list=None)
    meta_df = g.merge_datasets(load_from_disk=load_merge, labelled=True)
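    # Optionally map probe indices to gene identifiers per GEO series
    # (skipped here since translate == "n").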
    if translate == "y":
        for geo_id in geo_ids:
            g.translate_indices_df(geo_id, labelled=True)
    labels = set(list(meta_df.columns))
    print(labels)
    mlp = MLP(input_size=meta_df.shape[0],
              input_shape=(meta_df.shape[0],),
              indices_names=list(range(meta_df.shape[0])),
              num_classes=len(labels),
              h_dims=h_dims,
              extra_class=extra_class,
              l1=l1,
              l2=l2,
              batch_norm=True)

    mlp.set_configs(home_path=home_path,
                    results_folder=results_folder,
                    data_folder=data_folder,
                    destination_folder=destination_folder,
                    dataset_name=dataset_name,
                    lr=lr,
                    meta_destination_folder="meta_pandas_dataframes",
                    csv_filename="csv_loggers",
                    is_unlabelled=False)

    print("Labeled data shape (35371, 624)", meta_df.shape)
    if meta_df is not None:
        mlp.import_dataframe(meta_df, batch_size, labelled=True)

    train_total_loss_histories = [[] for x in range(100)]
    train_accuracy_histories = [[] for x in range(100)]
    valid_total_loss_histories = [[] for x in range(100)]
    valid_accuracy_histories = [[] for x in range(100)]
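    # Train over 100 random train/valid splits, reinitializing the weights
    # before each run.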
    for i in range(100):
        print("Random train/valid split", i)
        mlp.set_data(labels_per_class=-1,
                     is_example=False,
                     extra_class=extra_class,
                     ignore_training_inputs=1)
        mlp.glorot_init()
        mlp.run(n_epochs,
                verbose=0,
                show_progress=10,
                hist_epoch=20,
                is_balanced_relu=True,
                all0=True,
                overall_mean=True)
Example No. 6
def __main__():
    import numpy as np
    import torch
    from numpy import genfromtxt
    from torchvision import datasets, transforms
    from models.discriminative.artificial_neural_networks.MultiLayerPerceptron import MLP

    train_labels_fname = "train_labels.csv"
    home_path = "/home/simon/"
    destination_folder = "annleukemia"
    data_folder = "data"
    results_folder = "results"
    extra_class = True  # TODO: make this a count; worth testing whether more than one extra class is desirable

    dataset_name = "dessins"
    n_epochs = 1000

    lr = 1e-3
    l1 = 0.
    l2 = 0.
    batch_size = 256
    resized_shape = 100
    # Neurons layers
    h_dims = [1024, 512, 256, 128]
    input_shape = [1, resized_shape, resized_shape]
    input_size = np.prod(input_shape)

    dir = "data/kaggle_dessins/"
    train_labels = genfromtxt(dir + train_labels_fname,
                              delimiter=",",
                              dtype=str,
                              skip_header=True)[:, 1]
    train_labels_set = set(train_labels)
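    # Torchvision augmentation pipeline: random resized crops, rotations and
    # flips, then grayscale conversion so each tensor matches the
    # [1, 100, 100] input shape.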
    data_transform = transforms.Compose([
        transforms.RandomResizedCrop(resized_shape),  # RandomSizedCrop is deprecated
        transforms.RandomRotation(45),
        transforms.RandomHorizontalFlip(),
        transforms.RandomVerticalFlip(),
        transforms.Grayscale(),
        transforms.ToTensor(),
    ])
    dessins_train_data = datasets.ImageFolder(
        root=data_dir + 'train/', transform=data_transform)
    dessins_valid_data = datasets.ImageFolder(
        root=data_dir + 'valid/', transform=data_transform)
    train_ds = torch.utils.data.DataLoader(dessins_train_data,
                                           batch_size=batch_size,
                                           shuffle=True,
                                           num_workers=0)
    valid_ds = torch.utils.data.DataLoader(dessins_valid_data,
                                           batch_size=batch_size,
                                           shuffle=True,
                                           num_workers=0)
    mlp = MLP(input_size=input_size,
              input_shape=input_shape,
              indices_names=list(range(input_size)),  # one name per input feature
              num_classes=len(train_labels_set),
              h_dims=h_dims,
              extra_class=extra_class,
              l1=l1,
              l2=l2,
              batch_norm=True)

    mlp.x_train = train_ds
    mlp.x_valid = valid_ds
    mlp.x_test = None

    mlp.batch_size = batch_size
    mlp.labels_train = train_labels
    mlp.labels = train_labels
    # The label set from the CSV should match the class folders under train/
    mlp.labels_set = train_labels_set
    mlp.num_classes = len(mlp.labels_set)

    mlp.set_configs(home_path=home_path,
                    results_folder=results_folder,
                    data_folder=data_folder,
                    destination_folder=destination_folder,
                    dataset_name=dataset_name,
                    lr=lr,
                    meta_destination_folder="meta_pandas_dataframes",
                    csv_filename="csv_loggers",
                    is_unlabelled=False)
    mlp.make_loaders(train_ds=train_ds,
                     valid_ds=valid_ds,
                     test_ds=None,
                     labels_per_class=-1,
                     unlabelled_train_ds=None,
                     unlabelled_samples=True)

    train_total_loss_histories = [[] for x in range(10)]
    train_accuracy_histories = [[] for x in range(10)]
    valid_total_loss_histories = [[] for x in range(10)]
    valid_accuracy_histories = [[] for x in range(10)]
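    # Unwrap the DataLoaders, presumably so set_data(is_custom_data=True) can
    # re-split the underlying datasets on each iteration below.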
    mlp.train_loader = mlp.train_loader.dataset
    mlp.valid_loader = mlp.valid_loader.dataset
    for i in range(10):
        print("Random train/valid split", i)
        mlp.set_data(labels_per_class=-1,
                     is_example=False,
                     is_split=False,
                     extra_class=extra_class,
                     is_custom_data=True)
        mlp.glorot_init()
        mlp.run(n_epochs,
                verbose=3,
                show_progress=10,
                hist_epoch=20,
                is_balanced_relu=False,
                all0=False)