if args.max_num_item is not None:
    args.max_num_item = int(args.max_num_item)
else:
    args.max_num_person = None
    args.max_num_item = None

device = torch.device("cuda" if args.cuda else "cpu")
if args.cuda:
    torch.cuda.set_device(args.gpu_device)

train_dataset = load_dataset(
    args.dataset,
    train=True,
    num_person=args.num_person,
    num_item=args.num_item,
    ability_dim=args.ability_dim,
    max_num_person=args.max_num_person,
    max_num_item=args.max_num_item,
)

if args.artificial_missing_perc > 0:
    train_dataset = artificially_mask_dataset(
        train_dataset,
        args.artificial_missing_perc,
    )

num_person = train_dataset.num_person
num_item = train_dataset.num_item

train_loader = torch.utils.data.DataLoader(
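# `artificially_mask_dataset` is not defined in this snippet. A minimal
# sketch of what such a helper typically does, assuming the dataset
# exposes a boolean `mask` of observed (person, item) responses and that
# `missing_perc` is a percentage -- neither is confirmed by the code above:
import numpy as np

def artificially_mask_dataset_sketch(dataset, missing_perc, seed=0):
    """Randomly hide a percentage of observed responses for later scoring."""
    rng = np.random.RandomState(seed)
    observed = np.argwhere(dataset.mask)        # indices of observed cells
    n_hide = int(len(observed) * missing_perc / 100.0)
    hidden = observed[rng.choice(len(observed), n_hide, replace=False)]
    for row, col in hidden:
        dataset.mask[row, col] = False          # now treated as missing
    dataset.held_out_indices = hidden           # kept to evaluate imputation
    return dataset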
if not os.path.isdir(args.out_dir):
    os.makedirs(args.out_dir)

device = torch.device("cuda" if args.cuda else "cpu")
if args.cuda:
    torch.cuda.set_device(args.gpu_device)

if args.response_dist == 'bernoulli':
    dataset_name = args.dataset
else:
    dataset_name = f'{args.dataset}_continuous'

train_dataset = load_dataset(
    dataset_name,
    train=True,
    num_person=args.num_person,
    num_item=args.num_item,
    ability_dim=args.ability_dim,
    max_num_person=args.max_num_person,
    max_num_item=args.max_num_item,
)
test_dataset = load_dataset(
    dataset_name,
    train=False,
    num_person=args.num_person,
    num_item=args.num_item,
    ability_dim=args.ability_dim,
    max_num_person=args.max_num_person,
    max_num_item=args.max_num_item,
)

if args.artificial_missing_perc > 0:
def load(data_dir, config, splits):
    """Load a specific dataset.

    Args:
        data_dir (str): path to the dataset directory.
        config (dict): general dict with settings.
        splits (list): list of strings 'train'|'val'|'test'.

    Returns (dict): dictionary with keys 'train'|'val'|'test' and
        DataLoader objects as values.
    """
    dataset_path = '/tf/data/{}'.format(config['data.dataset'])
    if not os.path.exists(dataset_path):
        os.makedirs(dataset_path)

    data = load_dataset(config, datagen_flow=True, with_datasets=True)

    ret = {}
    for split in splits:
        # n_way (number of classes per episode)
        if split in ['val', 'test']:
            n_way = config['data.test_way']
        else:
            n_way = config['data.train_way']

        # n_support (number of support samples per class)
        if split in ['val', 'test']:
            n_support = config['data.test_support']
        else:
            n_support = config['data.train_support']

        # n_query (number of query samples per class)
        if split in ['val', 'test']:
            n_query = config['data.test_query']
        else:
            n_query = config['data.train_query']

        batch_size = config['data.batch_size']
        split_size = data[f"{split}_size"]

        # Drain the generator until exactly one pass over the split
        # has been collected.
        x, y = next(data[f"{split}_gen"])
        batches = 1
        for images, labels in data[f"{split}_gen"]:
            x = np.concatenate([x, images])
            y = np.concatenate([y, labels])
            batches += 1
            if batches >= split_size / batch_size:
                # We need to break the loop by hand because
                # the generator loops indefinitely.
                break

        # Sort samples by label so each class forms a contiguous block.
        i = np.argsort(y)
        y = y[i]
        x = x[i, :, :, :]

        # Bucket every sample into a per-class list.
        split_data = [[] for _ in range(data["nb_classes"])]
        for index in i:
            split_data[y[index]].append(x[index])

        data_loader = DataLoader(
            np.array([np.array(images) for images in split_data]),
            n_classes=data["nb_classes"],
            n_way=n_way,
            n_support=n_support,
            n_query=n_query,
            x_dim=data["image_shape"])

        ret[split] = data_loader

    return ret
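# Hypothetical usage of `load`, assuming only the config keys referenced
# in the function body; the dataset name and all values are illustrative:
config = {
    'data.dataset': 'omniglot',
    'data.batch_size': 32,
    'data.train_way': 60, 'data.test_way': 5,
    'data.train_support': 5, 'data.test_support': 5,
    'data.train_query': 5, 'data.test_query': 15,
}
loaders = load('/tf/data', config, splits=['train', 'val'])
train_loader = loaders['train']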
import torch
import jsonlines
from tqdm import tqdm
from argparse import ArgumentParser
from pathlib import Path

from src.datasets import load_dataset


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--checkpoint', type=Path)
    parser.add_argument('--dataset', type=str, default='squad')
    args = parser.parse_args()

    dataset = load_dataset(args.dataset, train=True)

    saved_model = torch.load(args.checkpoint / 'checkpoint.pth.tar')
    weights = saved_model['model_state_dict']

    models, items = [], []
    for i, model in tqdm(enumerate(dataset.ix_to_model)):
        models.append({
            'submission_id': model,
            'ability_mu': weights['ability_mu_lookup.weight'][i].tolist(),
            'ability_logvar': weights['ability_logvar_lookup.weight'][i].tolist(),
        })
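    # Hypothetical continuation: the snippet above imports `jsonlines`
    # and initializes `items` but cuts off before using either. The
    # `ix_to_item` attribute and the `item_*_lookup` weight keys below
    # are guesses modeled on the ability lookups, not confirmed code:
    for i, item in tqdm(enumerate(dataset.ix_to_item)):
        items.append({
            'item_id': item,
            'item_mu': weights['item_mu_lookup.weight'][i].tolist(),
            'item_logvar': weights['item_logvar_lookup.weight'][i].tolist(),
        })

    with jsonlines.open(str(args.checkpoint / 'models.jsonl'), mode='w') as writer:
        writer.write_all(models)
    with jsonlines.open(str(args.checkpoint / 'items.jsonl'), mode='w') as writer:
        writer.write_all(items)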
def run_experiments(
        args: argparse.Namespace,
        neptuneai_project_id: str = 'clfmsc2020/experiments') -> None:
    """Runs the experiments defined in the config file."""

    def _debug(text: str) -> None:
        """Prints statements only if the args.debug or args.verbose flag is set."""
        if args.debug or args.verbose:
            print(f'[INFO] {text}')

    if args.useneptune:
        _debug('Neptune.AI enabled.')
        neptune.init(neptuneai_project_id)

    _debug(f'Config file path: {args.config}')
    data_yaml_params, knn_yaml_params, exp_yaml_params = \
        helpers.load_config_file(args.config)

    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    _debug(f'Device: {device}')

    # -- LEVEL 0: CACHE DATASET
    for data_params in utils.iterparams(data_yaml_params):
        x, y = datasets.load_dataset(data_params.DATASET, transform,
                                     use_umap=data_params.USE_UMAP)
        (train_x, train_y), (val_x, val_y), (test_x, test_y) = utils.split_data(
            x, y, val_size=data_params.VAL_SIZE, test_size=data_params.TEST_SIZE)
        _debug(f'Dataset: {data_params.DATASET}\n')

        # TODO: output_shape = ...

        # -- LEVEL 1: CACHE KNN
        for knn_params in utils.iterparams(knn_yaml_params):
            knn = None
            if knn_params.K is not None and knn_params.K > 0:
                train_val_x = np.concatenate((train_x, val_x), axis=0)
                train_val_y = np.concatenate((train_y, val_y), axis=0)
                knn = FaissKNN(train_val_x, train_val_y,
                               precompute=True, k=knn_params.K)
                _debug(f'kNN wrapper initialized (k = {knn_params.K}).\n')

            # -- LEVEL 2: RUN EXPERIMENTS
            for exp_params in utils.iterparams(exp_yaml_params):

                # EXCEPTIONS
                if exp_params.FUNCTION_NAME == 'bce' and \
                        exp_params.OUTPUT_ACTIVATIONS == 'tanh':
                    _debug('An exception has occurred (BCE + TanH)')
                    continue

                # Criterion
                criterion = None
                hinge_target_range = False
                criterion_name = exp_params.FUNCTION_NAME
                criterion_type = helpers.LossFuncType.from_string(
                    exp_params.FUNCTION_TYPE)
                n_layers = len(exp_params.LAYERS)
                _debug(f'Criterion: {criterion_name} (type: {criterion_type})')

                if criterion_type == ftype.BASIC:
                    hinge_target_range, loss_function = helpers.get_loss_function(
                        criterion_name, criterion_type)
                    criterion = loss_function()
                elif criterion_type == ftype.ENTR_R:
                    assert knn is not None, 'kNN wrapper is not initialized!'
                    hinge_target_range, base_loss = helpers.get_loss_function(
                        criterion_name, ftype.BASIC)
                    criterion = lossfunc.EntropyRegularizedBinaryLoss(
                        base_loss(), knn)
                elif criterion_type == ftype.ENTR_W:
                    assert knn is not None, 'kNN wrapper is not initialized!'
                    hinge_target_range, base_loss = helpers.get_loss_function(
                        criterion_name, ftype.BASIC)
                    criterion = lossfunc.EntropyWeightedBinaryLoss(
                        base_loss(), knn)
                elif criterion_type == ftype.CLF:
                    assert knn is not None, 'kNN wrapper is not initialized!'
                    hinge_target_range, loss_function = helpers.get_loss_function(
                        criterion_name, ftype.CLF)
                    criterion = loss_function(
                        knn, 0.5)  # FIXME: fixed params (alpha, beta)

                assert criterion is not None, 'Criterion was not initialized!'
                # Change the target range: hinge-style losses expect
                # labels in {-1, 1} rather than {0, 1}.
                target_train_y = np.copy(train_y)
                target_val_y = np.copy(val_y)
                target_test_y = np.copy(test_y)

                if hinge_target_range:
                    _debug('Negative class: 0 -> -1')
                    target_train_y[train_y == 0] = -1
                    target_val_y[val_y == 0] = -1
                    target_test_y[test_y == 0] = -1

                # Convert the subsets into DataLoaders
                train_dataloader = datasets.convert_to_dataloader(
                    train_x, target_train_y,
                    batch_size=data_params.BATCH_SIZE)
                valid_dataloader = datasets.convert_to_dataloader(
                    val_x, target_val_y,
                    batch_size=data_params.BATCH_SIZE,
                    startidx=train_x.shape[0])
                test_data_x, test_data_y = Tensor(test_x), Tensor(test_y)

                # Prepare the experiment
                all_params = {**data_params, **knn_params, **exp_params}
                _debug(f'Params: \n {all_params}')

                unified_dataset_name = datasets.simplify_dataset_name(
                    data_params.DATASET)
                experiment_name = (f'{unified_dataset_name}_'
                                   f'{exp_params.FUNCTION_NAME}_'
                                   f'{exp_params.FUNCTION_TYPE}')
                _debug(f'Experiment name: {experiment_name}')

                # Set up the neptune.ai experiment
                experiment = None
                if args.useneptune:
                    tags = [
                        exp_params.FUNCTION_NAME, unified_dataset_name,
                        data_params.PROBLEM, exp_params.FUNCTION_TYPE,
                        exp_params.OUTPUT_ACTIVATIONS
                    ]
                    all_params['N_LAYERS'] = n_layers
                    experiment = neptune.create_experiment(
                        name=experiment_name,
                        tags=tags,
                        params=all_params,
                        upload_source_files=[
                            args.config, 'src/losses/*.py', __file__
                        ])

                # Input shape
                input_dim = x.shape[1]

                # Layers
                predefined_layers = exp_params.LAYERS.copy()
                if criterion_type == ftype.ENTR_R:
                    predefined_layers.append(2)

                # Output shape (FIXME)
                output_dim = 1

                layers = [input_dim] + predefined_layers + [output_dim]
                model = CustomNeuralNetwork(layers,
                                            exp_params.HIDDEN_ACTIVATIONS,
                                            exp_params.OUTPUT_ACTIVATIONS)
                optimizer = Adam(model.parameters(), lr=exp_params.LEARNING_RATE)

                # Run an experiment
                _debug('Starting the training ...')
                logger = neptune if args.useneptune else None
                model, training_loss_history, validation_loss_history = \
                    trainingloop.run(experiment_name, optimizer, criterion,
                                     model, train_dataloader,
                                     exp_params.EPOCHS, valid_dataloader,
                                     test_data_x=test_data_x,
                                     test_data_y=test_data_y,
                                     eval_freq=exp_params.EVAL,
                                     early_stopping=exp_params.EARLY_STOPPING,
                                     neptune_logger=logger,
                                     knn_use_indices=True,
                                     loss_type=criterion_type)

                # Evaluate the model
                metrics = trainingloop.evaluate_binary(model, test_data_x,
                                                       test_data_y)
                _debug(f'Done! Evaluation results: \n{metrics}\n')

                if args.useneptune:
                    for metric, value in metrics.items():
                        neptune.log_metric(f'final_{metric}', value)
                    experiment.stop()
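# `utils.iterparams` is not defined in this snippet. A minimal sketch of
# the grid expansion the three nested loops above appear to rely on,
# assuming every YAML section maps parameter names to lists of candidate
# values (the real object must also support `**` unpacking, which
# SimpleNamespace alone does not):
from itertools import product
from types import SimpleNamespace

def iterparams_sketch(yaml_params):
    """Yield one namespace per combination in the cartesian product."""
    keys = list(yaml_params)
    for combo in product(*(yaml_params[key] for key in keys)):
        yield SimpleNamespace(**dict(zip(keys, combo)))

# Example: iterparams_sketch({'DATASET': ['mnist'], 'USE_UMAP': [True, False]})
# yields (DATASET='mnist', USE_UMAP=True) and (DATASET='mnist', USE_UMAP=False).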
    args.ability_dim,
    args.ability_merge,
    args.num_iafs,
)
args.out_dir = os.path.join(args.out_dir, out_file)

if not os.path.isdir(args.out_dir):
    os.makedirs(args.out_dir)

device = torch.device("cuda" if args.cuda else "cpu")
if args.cuda:
    torch.cuda.set_device(args.gpu_device)

train_dataset = load_dataset(
    args.dataset,
    train=True,
    num_person=args.num_person,
    num_item=args.num_item,
    ability_dim=args.ability_dim,
)
test_dataset = load_dataset(
    args.dataset,
    train=False,
    num_person=args.num_person,
    num_item=args.num_item,
    ability_dim=args.ability_dim,
)

num_person = train_dataset.num_person
num_item = train_dataset.num_item

train_loader = torch.utils.data.DataLoader(
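# The IRT snippets above cut off at the same DataLoader construction.
# A typical completion for context only; the `batch_size` flag and the
# shuffle choices are assumptions, not taken from the original source:
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=args.batch_size,  # assumed CLI flag
    shuffle=True,                # shuffle training responses each epoch
)
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=args.batch_size,
    shuffle=False,               # keep evaluation order deterministic
)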