def __main__():
    """Train an MLP on the labelled GSE33000 dataframe, ten runs in a row.

    Steps, in order:
      1. move the working directory up one level (project root);
      2. fetch the GEO series with ``GeoParser.get_geo`` and merge it into
         one labelled dataframe with ``GeoParser.merge_datasets``;
      3. build an ``MLP`` whose input size is the dataframe's row count and
         whose class count is the number of distinct column names;
      4. for each of the ten runs call ``set_data``, ``glorot_init`` and
         ``run`` on the model.

    Returns:
        None. The function is run for its side effects (working-directory
        change, plus whatever ``GeoParser`` and ``MLP`` read or write).
    """
    import os

    # To return at the root of the project: the package imports below are
    # resolved relative to it.
    os.chdir("..")
    from models.discriminative.artificial_neural_networks.MultiLayerPerceptron import MLP
    # NOTE(review): adapt_datasets is not used in this function; the import is
    # kept only in case utils.utils has import-time side effects -- confirm,
    # then remove.
    from utils.utils import adapt_datasets  # noqa: F401

    # --- Data location / naming -------------------------------------------
    geo_ids = ["GSE33000"]
    load_from_disk = True
    load_merge = False
    home_path = "/home/simon/"
    destination_folder = "annleukemia"
    data_folder = "data"
    results_folder = "results"
    dataset_name = "gse33000"
    # TODO change to put the number... curious to see if more than one is desirable
    extra_class = True

    # --- Training hyper-parameters ----------------------------------------
    n_epochs = 1000
    n_splits = 10
    lr = 1e-3
    l1 = 0.
    l2 = 0.
    batch_size = 32
    h_dims = [128, 128]  # neurons per hidden layer

    g = GeoParser(home_path=home_path, geo_ids=geo_ids)
    g.get_geo(load_from_disk=load_from_disk, automatic_attribute_list=None)
    meta_df = g.merge_datasets(load_from_disk=load_merge, labelled=True)

    # Column names are used as the labels; the row count is used as the
    # network input size (presumably a features x samples layout -- confirm).
    labels = set(meta_df.columns)
    n_inputs = meta_df.shape[0]

    mlp = MLP(input_size=n_inputs, input_shape=n_inputs,
              indices_names=list(range(n_inputs)), num_classes=len(labels),
              h_dims=h_dims, extra_class=extra_class, l1=l1, l2=l2,
              batch_norm=True)
    mlp.labels = labels
    mlp.labels_set = list(labels)
    mlp.set_configs(home_path=home_path, results_folder=results_folder,
                    data_folder=data_folder,
                    destination_folder=destination_folder,
                    dataset_name=dataset_name, lr=lr,
                    meta_destination_folder="meta_pandas_dataframes",
                    csv_filename="csv_loggers", is_unlabelled=False)

    print("Labeled data shape (35371, 624)", meta_df.shape)
    mlp.import_dataframe(meta_df, batch_size, labelled=True)

    for i in range(n_splits):
        print("Random train/valid split", i)
        mlp.set_data(labels_per_class=-1, is_example=False,
                     extra_class=extra_class)
        mlp.glorot_init()
        mlp.run(n_epochs, verbose=2, show_progress=10, hist_epoch=20,
                is_balanced_relu=False, all0=False)
def __main__():
    """Train an MLP on the Kaggle "dessins" images stored as ``.npy``, ten runs.

    Steps, in order:
      1. load ``train_images.npy`` and ``train_labels.csv`` from
         ``./data/kaggle_dessins/``;
      2. stack column 1 of the loaded array into one 2-D matrix and take
         column 1 of the CSV (header skipped) as the labels;
      3. wrap both in a dataframe whose column names are the labels;
      4. build an ``MLP`` sized from that dataframe and, for each of the ten
         runs, call ``set_data``, ``glorot_init`` and ``run``.

    Returns:
        None. The function is run for its side effects.
    """
    # NOTE(review): adapt_datasets is not used in this function; the import is
    # kept only in case utils.utils has import-time side effects -- confirm,
    # then remove.
    from utils.utils import adapt_datasets  # noqa: F401

    # --- Data location / naming -------------------------------------------
    local_folder = "./data/kaggle_dessins/"
    train_images_fname = "train_images.npy"
    train_labels_fname = "train_labels.csv"
    home_path = "/home/simon/"
    destination_folder = "annleukemia"
    data_folder = "data"
    results_folder = "results"
    dataset_name = "dessins"
    # TODO change to put the number... curious to see if more than one is desirable
    extra_class = True

    # --- Training hyper-parameters ----------------------------------------
    n_epochs = 1000
    n_splits = 10
    lr = 1e-5
    l1 = 1e-5
    l2 = 1e-10
    batch_size = 16
    h_dims = [1024, 1024, 1024]  # neurons per hidden layer

    # The array is indexed as [:, 1] and loaded with a pickle text encoding,
    # i.e. it is an object array; NumPy refuses those unless allow_pickle is
    # set. SECURITY: unpickling executes arbitrary code -- only load files
    # from a trusted source.
    train_arrays = np.load(local_folder + train_images_fname,
                           encoding="latin1", allow_pickle=True)
    train_dataset = np.vstack(train_arrays[:, 1])
    train_labels = genfromtxt(local_folder + train_labels_fname,
                              delimiter=",", dtype=str,
                              skip_header=True)[:, 1]

    # NOTE(review): the MLP input size below is the dataframe's ROW count and
    # the labels are its COLUMN names, while train_dataset is stacked one
    # entry per row. This only lines up when the matrix is square -- confirm
    # whether train_dataset should be transposed here.
    meta_df = pd.DataFrame(train_dataset, columns=train_labels)

    labels = set(meta_df.columns)
    n_inputs = meta_df.shape[0]

    mlp = MLP(input_size=n_inputs, input_shape=n_inputs,
              indices_names=list(range(n_inputs)), num_classes=len(labels),
              h_dims=h_dims, extra_class=extra_class, l1=l1, l2=l2,
              batch_norm=True)
    mlp.labels = labels
    mlp.labels_set = list(labels)
    mlp.set_configs(home_path=home_path, results_folder=results_folder,
                    data_folder=data_folder,
                    destination_folder=destination_folder,
                    dataset_name=dataset_name, lr=lr,
                    meta_destination_folder="meta_pandas_dataframes",
                    csv_filename="csv_loggers", is_unlabelled=False)

    print("Labeled data shape (35371, 624)", meta_df.shape)
    mlp.import_dataframe(meta_df, batch_size, labelled=True)

    for i in range(n_splits):
        print("Random train/valid split", i)
        mlp.set_data(labels_per_class=-1, is_example=False,
                     extra_class=extra_class)
        mlp.glorot_init()
        mlp.run(n_epochs, verbose=3, show_progress=10, hist_epoch=20,
                is_balanced_relu=False, all0=False)
def __main__():
    """Train an MLP on the Kaggle "dessins" image folders, ten runs in a row.

    Steps, in order:
      1. read the labels from column 1 of ``train_labels.csv`` (header
         skipped);
      2. build ``ImageFolder`` datasets for the ``train/`` and ``valid/``
         directories, both using the same transform pipeline (random resized
         crop, rotation, horizontal/vertical flips, grayscale, to-tensor),
         and wrap them in shuffling ``DataLoader`` objects;
      3. build an ``MLP`` and attach the loaders and label information to it;
      4. for each of the ten runs call ``set_data``, ``glorot_init`` and
         ``run``.

    Returns:
        None. The function is run for its side effects.
    """
    # --- Data location / naming -------------------------------------------
    data_dir = "data/kaggle_dessins/"
    train_dir = data_dir + "train/"
    valid_dir = data_dir + "valid/"
    train_labels_fname = "train_labels.csv"
    home_path = "/home/simon/"
    destination_folder = "annleukemia"
    data_folder = "data"
    results_folder = "results"
    dataset_name = "dessins"
    # TODO change to put the number... curious to see if more than one is desirable
    extra_class = True

    # --- Training hyper-parameters ----------------------------------------
    n_epochs = 1000
    n_splits = 10
    lr = 1e-3
    l1 = 0.
    l2 = 0.
    batch_size = 256
    resized_shape = 100
    h_dims = [1024, 512, 256, 128]  # neurons per hidden layer
    # Single source of truth for the image geometry: one channel (the
    # pipeline ends in Grayscale) at the crop size.
    input_shape = [1, resized_shape, resized_shape]
    input_size = np.prod(input_shape)

    train_labels = genfromtxt(data_dir + train_labels_fname, delimiter=",",
                              dtype=str, skip_header=True)[:, 1]
    train_labels_set = set(train_labels)

    # RandomResizedCrop is the current name of the deprecated
    # RandomSizedCrop transform.
    data_transform = transforms.Compose([
        transforms.RandomResizedCrop(resized_shape),
        transforms.RandomRotation(45),
        transforms.RandomHorizontalFlip(),
        transforms.RandomVerticalFlip(),
        transforms.Grayscale(),
        transforms.ToTensor(),
    ])
    # NOTE(review): the validation folder goes through the same random
    # augmentations as the training folder -- confirm this is intended.
    dessins_train_data = datasets.ImageFolder(root=train_dir,
                                              transform=data_transform)
    dessins_valid_data = datasets.ImageFolder(root=valid_dir,
                                              transform=data_transform)
    train_ds = torch.utils.data.DataLoader(dessins_train_data,
                                           batch_size=batch_size,
                                           shuffle=True, num_workers=0)
    valid_ds = torch.utils.data.DataLoader(dessins_valid_data,
                                           batch_size=batch_size,
                                           shuffle=True, num_workers=0)

    mlp = MLP(input_size=input_size, input_shape=input_shape,
              indices_names=list(range(len(train_labels))),
              num_classes=len(train_labels_set), h_dims=h_dims,
              extra_class=extra_class, l1=l1, l2=l2, batch_norm=True)
    mlp.x_train = train_ds
    mlp.x_valid = valid_ds
    mlp.x_test = None
    mlp.batch_size = batch_size
    # labels_set briefly holds the entries of the train directory so that
    # num_classes counts them; it is replaced by the CSV label set just below.
    mlp.labels_set = os.listdir(train_dir)
    mlp.num_classes = len(mlp.labels_set)
    mlp.labels_train = train_labels
    mlp.labels = train_labels
    mlp.labels_set = train_labels_set

    mlp.set_configs(home_path=home_path, results_folder=results_folder,
                    data_folder=data_folder,
                    destination_folder=destination_folder,
                    dataset_name=dataset_name, lr=lr,
                    meta_destination_folder="meta_pandas_dataframes",
                    csv_filename="csv_loggers", is_unlabelled=False)
    mlp.make_loaders(train_ds=train_ds, valid_ds=valid_ds, test_ds=None,
                     labels_per_class=-1, unlabelled_train_ds=None,
                     unlabelled_samples=True)

    # Swap each loader produced by make_loaders for its `.dataset` attribute
    # before training starts.
    mlp.train_loader = mlp.train_loader.dataset
    mlp.valid_loader = mlp.valid_loader.dataset

    for i in range(n_splits):
        print("Random train/valid split", i)
        mlp.set_data(labels_per_class=-1, is_example=False, is_split=False,
                     extra_class=extra_class, is_custom_data=True)
        mlp.glorot_init()
        mlp.run(n_epochs, verbose=3, show_progress=10, hist_epoch=20,
                is_balanced_relu=False, all0=False)