Exemplo n.º 1
0
 def test_save_smiles_splits(self):
     """Training should succeed when saving SMILES splits is enabled."""
     try:
         self.args.save_smiles_splits = True
         modify_train_args(self.args)
         cross_validate(self.args, self.logger)
     except Exception as e:
         # except Exception (not bare except) so Ctrl-C/SystemExit still abort.
         self.fail(f'save smiles splits: {e}')
Exemplo n.º 2
0
 def test_bias(self):
     """Training should succeed with bias terms enabled."""
     try:
         self.args.bias = True
         modify_train_args(self.args)
         cross_validate(self.args, self.logger)
     except Exception as e:
         # except Exception (not bare except) so Ctrl-C/SystemExit still abort.
         self.fail(f'bias: {e}')
Exemplo n.º 3
0
 def test_show_individual_scores(self):
     """Training should succeed when per-task score reporting is enabled."""
     try:
         self.args.show_individual_scores = True
         modify_train_args(self.args)
         cross_validate(self.args, self.logger)
     except Exception as e:
         # except Exception (not bare except) so Ctrl-C/SystemExit still abort.
         self.fail(f'show_individual_scores: {e}')
Exemplo n.º 4
0
 def test_rdkit_2d_features_unnormalized(self):
     """Training should succeed with unnormalized RDKit 2D features."""
     try:
         self.args.features_generator = ['rdkit_2d']
         modify_train_args(self.args)
         cross_validate(self.args, self.logger)
     except Exception as e:
         # except Exception (not bare except) so Ctrl-C/SystemExit still abort.
         self.fail(f'rdkit_2d_features_unnormalized: {e}')
Exemplo n.º 5
0
 def test_no_cache(self):
     """Training should succeed with molecule caching disabled."""
     try:
         self.args.no_cache = True
         modify_train_args(self.args)
         cross_validate(self.args, self.logger)
     except Exception as e:
         # except Exception (not bare except) so Ctrl-C/SystemExit still abort.
         self.fail(f'no_cache: {e}')
Exemplo n.º 6
0
 def test_undirected_messages(self):
     """Training should succeed with undirected message passing."""
     try:
         self.args.undirected = True
         modify_train_args(self.args)
         cross_validate(self.args, self.logger)
     except Exception as e:
         # except Exception (not bare except) so Ctrl-C/SystemExit still abort.
         self.fail(f'undirected_messages: {e}')
Exemplo n.º 7
0
 def test_activation_prelu(self):
     """Training should succeed with the PReLU activation function."""
     try:
         self.args.activation = 'PReLU'
         modify_train_args(self.args)
         cross_validate(self.args, self.logger)
     except Exception as e:
         # except Exception (not bare except) so Ctrl-C/SystemExit still abort.
         self.fail(f'activation_prelu: {e}')
Exemplo n.º 8
0
    def test_hyperopt(self):
        """Smoke-test hyperparameter grid search on the toy delaney dataset."""
        try:
            parser = ArgumentParser()
            add_train_args(parser)
            parser.add_argument('--num_iters', type=int, default=20,
                                help='Number of hyperparameter choices to try')
            parser.add_argument('--config_save_path', type=str,
                                help='Path to .json file where best hyperparameter settings will be written')
            parser.add_argument('--log_dir', type=str,
                                help='(Optional) Path to a directory where all results of the hyperparameter optimization will be written')
            args = parser.parse_args([])
            args.data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'delaney_toy.csv')
            args.dataset_type = 'regression'
            args.batch_size = 2
            args.hidden_size = 5
            args.epochs = 1
            args.quiet = True
            # Keep a reference to the temp file so it is not deleted
            # (and its name invalidated) before grid_search writes to it.
            temp_file = NamedTemporaryFile()
            args.config_save_path = temp_file.name
            args.num_iters = 3
            modify_train_args(args)

            grid_search(args)
            clear_cache()
        except Exception as e:
            # except Exception (not bare except) so Ctrl-C/SystemExit still abort.
            self.fail(f'hyperopt: {e}')
Exemplo n.º 9
0
 def test_atom_messages(self):
     """Training should succeed with atom-centered message passing."""
     try:
         self.args.atom_messages = True
         modify_train_args(self.args)
         cross_validate(self.args, self.logger)
     except Exception as e:
         # except Exception (not bare except) so Ctrl-C/SystemExit still abort.
         self.fail(f'atom_messages: {e}')
Exemplo n.º 10
0
 def test_config(self):
     """Training should succeed when hyperparameters come from a JSON config."""
     try:
         self.args.config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.json')
         modify_train_args(self.args)
         cross_validate(self.args, self.logger)
     except Exception as e:
         # except Exception (not bare except) so Ctrl-C/SystemExit still abort.
         self.fail(f'config: {e}')
Exemplo n.º 11
0
    def setUp(self):
        """Train a tiny model on the toy delaney data, then build the
        prediction arguments that point at the resulting checkpoints."""
        base_dir = os.path.dirname(os.path.abspath(__file__))

        # --- Training phase: one quick cross-validation run. ---
        train_parser = ArgumentParser()
        add_train_args(train_parser)
        train_args = train_parser.parse_args([])
        train_args.data_path = os.path.join(base_dir, 'delaney_toy.csv')
        train_args.dataset_type = 'regression'
        train_args.batch_size = 2
        train_args.hidden_size = 5
        train_args.epochs = 1
        train_args.quiet = True
        self.temp_dir = TemporaryDirectory()
        train_args.save_dir = self.temp_dir.name
        train_logger = create_logger(name='train',
                                     save_dir=train_args.save_dir,
                                     quiet=train_args.quiet)
        modify_train_args(train_args)
        cross_validate(train_args, train_logger)
        clear_cache()

        # --- Prediction phase: args consumed by the test methods. ---
        predict_parser = ArgumentParser()
        add_predict_args(predict_parser)
        predict_args = predict_parser.parse_args([])
        predict_args.batch_size = 2
        predict_args.checkpoint_dir = self.temp_dir.name
        predict_args.preds_path = NamedTemporaryFile().name
        predict_args.test_path = os.path.join(base_dir,
                                              'delaney_toy_smiles.csv')
        self.args = predict_args
Exemplo n.º 12
0
 def test_scaffold(self):
     """Training should succeed with scaffold-balanced data splitting."""
     try:
         self.args.split_type = 'scaffold_balanced'
         modify_train_args(self.args)
         cross_validate(self.args, self.logger)
     except Exception as e:
         # except Exception (not bare except) so Ctrl-C/SystemExit still abort.
         self.fail(f'scaffold: {e}')
Exemplo n.º 13
0
def run_comparison(experiment_args: Namespace,
                   logger: logging.Logger,
                   features_dir: str = None):
    """Cross-validate on every dataset named in ``experiment_args.datasets``.

    :param experiment_args: Template arguments; deep-copied and specialized
                            per dataset (data path, type, folds, metric).
    :param logger: Logger receiving the per-dataset summary results.
    :param features_dir: Optional directory containing precomputed
                         ``<dataset>.pckl`` feature files.
    """
    for dataset_name in experiment_args.datasets:
        dataset_type, dataset_path, num_folds, metric = DATASETS[dataset_name]
        logger.info(dataset_name)

        # Set up args
        args = deepcopy(experiment_args)
        args.data_path = dataset_path
        args.dataset_type = dataset_type
        args.save_dir = os.path.join(args.save_dir, dataset_name)
        args.num_folds = num_folds
        args.metric = metric
        if features_dir is not None:
            args.features_path = [
                os.path.join(features_dir, dataset_name + '.pckl')
            ]
        modify_train_args(args)

        # Set up logging for training
        os.makedirs(args.save_dir, exist_ok=True)
        fh = logging.FileHandler(os.path.join(args.save_dir, args.log_name))
        fh.setLevel(logging.DEBUG)

        # Cross validate. Always detach AND close the file handler afterwards
        # so iterating over many datasets doesn't leak file descriptors or
        # leave a stale handler attached if cross_validate raises.
        TRAIN_LOGGER.addHandler(fh)
        try:
            mean_score, std_score = cross_validate(args, TRAIN_LOGGER)
        finally:
            TRAIN_LOGGER.removeHandler(fh)
            fh.close()

        # Record results
        logger.info(f'{mean_score} +/- {std_score} {metric}')
        temp_model = build_model(args)
        logger.info(f'num params: {param_count(temp_model):,}')
Exemplo n.º 14
0
 def test_classification_multiclass_default(self):
     """Default classification training should succeed on the tox21 toy data."""
     try:
         self.args.data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'tox21_toy.csv')
         self.args.dataset_type = 'classification'
         modify_train_args(self.args)
         cross_validate(self.args, self.logger)
     except Exception as e:
         # except Exception (not bare except) so Ctrl-C/SystemExit still abort.
         self.fail(f'classification_default: {e}')
Exemplo n.º 15
0
 def test_rdkit_2d_features(self):
     """Training should succeed with normalized RDKit 2D features
     (additional feature scaling disabled, since they're pre-normalized)."""
     try:
         self.args.features_generator = ['rdkit_2d_normalized']
         self.args.no_features_scaling = True
         modify_train_args(self.args)
         cross_validate(self.args, self.logger)
     except Exception as e:
         # except Exception (not bare except) so Ctrl-C/SystemExit still abort.
         self.fail(f'rdkit_2d_features: {e}')
Exemplo n.º 16
0
 def test_features_path(self):
     """Training should succeed with precomputed features loaded from disk."""
     try:
         self.args.features_path = [os.path.join(os.path.dirname(os.path.abspath(__file__)), 'delaney_toy_features.npz')]
         self.args.no_features_scaling = True
         modify_train_args(self.args)
         cross_validate(self.args, self.logger)
     except Exception as e:
         # except Exception (not bare except) so Ctrl-C/SystemExit still abort.
         self.fail(f'features_path: {e}')
Exemplo n.º 17
0
 def test_features_only(self):
     """Training should succeed using only Morgan features (no MPN)."""
     try:
         self.args.features_generator = ['morgan']
         # Bug fix: the flag must be set on the args namespace, not on the
         # test case itself, or training never sees features_only=True.
         self.args.features_only = True
         modify_train_args(self.args)
         cross_validate(self.args, self.logger)
     except Exception as e:
         # except Exception (not bare except) so Ctrl-C/SystemExit still abort.
         self.fail(f'features_only: {e}')
Exemplo n.º 18
0
 def test_num_folds_ensemble(self):
     """Training should succeed with multiple folds and an ensemble."""
     try:
         self.args.num_folds = 2
         self.args.ensemble_size = 2
         modify_train_args(self.args)
         cross_validate(self.args, self.logger)
     except Exception as e:
         # except Exception (not bare except) so Ctrl-C/SystemExit still abort.
         self.fail(f'num_folds_ensemble: {e}')
Exemplo n.º 19
0
 def test_predetermined_split(self):
     """Training should succeed with a predetermined fold split from a pickle."""
     try:
         self.args.split_type = 'predetermined'
         self.args.folds_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'delaney_toy_folds.pkl')
         self.args.val_fold_index = 1
         self.args.test_fold_index = 2
         modify_train_args(self.args)
         cross_validate(self.args, self.logger)
     except Exception as e:
         # except Exception (not bare except) so Ctrl-C/SystemExit still abort.
         self.fail(f'predetermined_split: {e}')
Exemplo n.º 20
0
 def test_checkpoint(self):
     """Train once, then re-run in test mode from the saved checkpoints."""
     try:
         args_copy = deepcopy(self.args)
         temp_dir = TemporaryDirectory()
         self.args.save_dir = temp_dir.name
         modify_train_args(self.args)
         cross_validate(self.args, self.logger)
         # Second run loads the checkpoints written by the first run.
         args_copy.checkpoint_dir = temp_dir.name
         args_copy.test = True
         modify_train_args(args_copy)
         cross_validate(args_copy, self.logger)
     except Exception as e:
         # except Exception (not bare except) so Ctrl-C/SystemExit still abort.
         self.fail(f'checkpoint: {e}')
Exemplo n.º 21
0
    model = load_checkpoint(os.path.join(save_dir, 'model.pt'), cuda=args.cuda)
    test_smiles, test_targets = test_data.smiles(), test_data.targets()
    test_preds = predict(model, test_data, args.batch_size)
    test_scores = evaluate_predictions(test_preds, test_targets,
                                       args.num_tasks, metric_func,
                                       args.dataset_type)

    avg_test_score = np.nanmean(test_scores)
    print(f'Test {args.metric} = {avg_test_score:.4f}')
    return avg_test_score


if __name__ == "__main__":
    # Parse the extra source-domain options plus the standard training args.
    parser = ArgumentParser()
    parser.add_argument('--source_data_path', required=True)
    parser.add_argument('--src_batch_size', type=int, default=100)
    parser.add_argument('--lambda_e', type=float, default=0.1)

    add_train_args(parser)
    args = parser.parse_args()
    modify_train_args(args)

    # Run training once per fold, collecting each fold's test score.
    all_test_score = np.zeros((args.num_folds, ))
    for fold_idx in range(args.num_folds):
        fold_dir = os.path.join(args.save_dir, f'fold_{fold_idx}')
        makedirs(fold_dir)
        all_test_score[fold_idx] = run_training(args, fold_dir)

    # Summarize across folds.
    mean, std = np.mean(all_test_score), np.std(all_test_score)
    print(f'{args.num_folds} fold average: {mean:.4f} +/- {std:.4f}')
Exemplo n.º 22
0
def train():
    """Flask view: render the training page (GET) or run training (POST).

    Mutates module-level globals: ``training_message`` accumulates warning
    text shown to the user; ``progress`` and ``started`` track the state
    of the background progress-bar process.
    """
    global training_message
    if request.method == 'GET':
        return render_template('train.html',
                               datasets=get_datasets(),
                               started=False,
                               cuda=app.config['CUDA'],
                               gpus=app.config['GPUS'])

    # Get arguments
    data_name, epochs, checkpoint_name = \
        request.form['dataName'], int(request.form['epochs']), request.form['checkpointName']
    gpu = request.form.get('gpu', None)
    dataset_type = request.form.get('datasetType', 'regression')

    # Normalize the checkpoint filename to a .pt extension.
    if not checkpoint_name.endswith('.pt'):
        checkpoint_name += '.pt'

    # Create and modify args
    # NOTE(review): parse_args() reads the *server process's* argv —
    # presumably empty/compatible here; confirm parse_args([]) was not intended.
    parser = ArgumentParser()
    add_train_args(parser)
    args = parser.parse_args()

    args.data_path = os.path.join(app.config['DATA_FOLDER'], data_name)
    args.dataset_type = dataset_type
    args.epochs = epochs

    # Validate the uploaded labels before spending time on training.
    target_set, all_targets_have_labels, has_invalid_targets = get_target_set(
        args.data_path)
    if len(target_set) == 0:
        return render_template('train.html',
                               datasets=get_datasets(),
                               started=False,
                               cuda=app.config['CUDA'],
                               gpus=app.config['GPUS'],
                               error="No training labels provided")
    if has_invalid_targets:
        return render_template('train.html',
                               datasets=get_datasets(),
                               started=False,
                               cuda=app.config['CUDA'],
                               gpus=app.config['GPUS'],
                               error="Training data contains invalid labels")
    # Classification requires every label to be 0 or 1 — hard error otherwise.
    classification_on_regression_dataset = ((not target_set <= set([0, 1]))
                                            and args.dataset_type
                                            == 'classification')
    if classification_on_regression_dataset:
        return render_template(
            'train.html',
            datasets=get_datasets(),
            started=False,
            cuda=app.config['CUDA'],
            gpus=app.config['GPUS'],
            error=
            'Selected classification dataset, but not all labels are 0 or 1')
    # Regression on all-binary labels is only a warning, not an error.
    regression_on_classification_dataset = (target_set <= set([0, 1]) and
                                            args.dataset_type == 'regression')
    if not all_targets_have_labels:
        training_message += 'One or more targets have no labels. \n'  # TODO could have separate warning messages for each?
    if regression_on_classification_dataset:
        training_message += 'All labels are 0 or 1; did you mean to train classification instead of regression?\n'

    # 'None' (string) means the user explicitly selected CPU.
    if gpu is not None:
        if gpu == 'None':
            args.no_cuda = True
        else:
            args.gpu = int(gpu)

    with TemporaryDirectory() as temp_dir:
        args.save_dir = temp_dir
        modify_train_args(args)
        if os.path.isdir(args.save_dir):
            training_message += 'Overwriting preexisting checkpoint with the same name.'
        logger = logging.getLogger('train')
        logger.setLevel(logging.DEBUG)
        logger.propagate = False
        set_logger(logger, args.save_dir, args.quiet)

        # Progress bar runs in a separate process while training blocks here.
        global progress
        process = mp.Process(target=progress_bar, args=(args, progress))
        process.start()
        global started
        started = 1
        # Run training
        run_training(args, logger)
        process.join()

        # reset globals
        started = 0
        progress = mp.Value('d', 0.0)

        # Move checkpoint
        shutil.move(
            os.path.join(args.save_dir, 'model_0', 'model.pt'),
            os.path.join(app.config['CHECKPOINT_FOLDER'], checkpoint_name))

    # Surface accumulated warnings once, then clear them for the next request.
    warning = training_message if len(training_message) > 0 else None
    training_message = ""
    return render_template('train.html',
                           datasets=get_datasets(),
                           cuda=app.config['CUDA'],
                           gpus=app.config['GPUS'],
                           trained=True,
                           warning=warning)
Exemplo n.º 23
0
def train():
    """Flask view: render the training page (GET) or run training (POST).

    Mutates module-level globals ``progress`` and ``training``, which track
    the background progress-bar process state.
    """
    global progress, training

    warnings, errors = [], []

    if request.method == 'GET':
        return render_train()

    # Get arguments
    data_name, epochs, checkpoint_name = \
        request.form['dataName'], int(request.form['epochs']), request.form['checkpointName']
    gpu = request.form.get('gpu')
    data_path = os.path.join(app.config['DATA_FOLDER'], data_name)
    dataset_type = request.form.get('datasetType', 'regression')

    # Normalize the checkpoint filename to a .pt extension.
    if not checkpoint_name.endswith('.pt'):
        checkpoint_name += '.pt'

    # Create and modify args
    # NOTE(review): parse_args() reads the *server process's* argv —
    # presumably empty/compatible here; confirm parse_args([]) was not intended.
    parser = ArgumentParser()
    add_train_args(parser)
    args = parser.parse_args()

    args.data_path = data_path
    args.dataset_type = dataset_type
    args.epochs = epochs

    # Check if regression/classification selection matches data
    data = get_data(path=data_path)
    targets = data.targets()
    unique_targets = set(np.unique(targets))

    # Classification requires every label to be 0 or 1 — hard error otherwise.
    if dataset_type == 'classification' and len(unique_targets - {0, 1}) > 0:
        errors.append(
            'Selected classification dataset but not all labels are 0 or 1. Select regression instead.'
        )

        return render_train(warnings=warnings, errors=errors)

    # Regression on all-binary labels is treated as a user mistake here too.
    if dataset_type == 'regression' and unique_targets <= {0, 1}:
        errors.append(
            'Selected regression dataset but all labels are 0 or 1. Select classification instead.'
        )

        return render_train(warnings=warnings, errors=errors)

    # 'None' (string) means the user explicitly selected CPU.
    if gpu is not None:
        if gpu == 'None':
            args.no_cuda = True
        else:
            args.gpu = int(gpu)

    with TemporaryDirectory() as temp_dir:
        args.save_dir = temp_dir
        modify_train_args(args)

        logger = logging.getLogger('train')
        logger.setLevel(logging.DEBUG)
        logger.propagate = False
        set_logger(logger, args.save_dir, args.quiet)

        # Progress bar runs in a separate process while training blocks here.
        process = mp.Process(target=progress_bar, args=(args, progress))
        process.start()
        training = 1

        # Run training
        task_scores = run_training(args, logger)
        process.join()

        # Reset globals
        training = 0
        progress = mp.Value('d', 0.0)

        # Check if name overlap; warn instead of overwriting an existing file.
        original_save_path = os.path.join(app.config['CHECKPOINT_FOLDER'],
                                          checkpoint_name)
        save_path = find_unique_path(original_save_path)
        if save_path != original_save_path:
            warnings.append(
                name_already_exists_message('Checkpoint', original_save_path,
                                            save_path))

        # Move checkpoint
        shutil.move(os.path.join(args.save_dir, 'model_0', 'model.pt'),
                    save_path)

    return render_train(trained=True,
                        metric=args.metric,
                        num_tasks=len(args.task_names),
                        task_names=args.task_names,
                        task_scores=format_float_list(task_scores),
                        mean_score=format_float(np.mean(task_scores)),
                        warnings=warnings,
                        errors=errors)
Exemplo n.º 24
0
def grid_search(args: Namespace):
    """Run TPE-based hyperparameter search on each dataset in ``args.datasets``.

    For every dataset, builds specialized training args, optimizes the
    hyperopt ``SPACE`` via ``fmin`` for ``args.num_runs_per_dataset``
    evaluations, and logs the best configuration found.
    """
    for dataset_name in args.datasets:
        # Get dataset
        dataset_type, dataset_path, _, metric = DATASETS[dataset_name]

        # Create logger for dataset
        logger = create_logger(name=dataset_name,
                               save_dir=args.save_dir,
                               save_name='{}_{}.log'.format(
                                   dataset_name, args.split_type))

        # Set up args for dataset
        dataset_args = deepcopy(args)
        dataset_args.data_path = dataset_path
        dataset_args.dataset_type = dataset_type
        dataset_args.save_dir = None
        dataset_args.metric = metric
        modify_train_args(dataset_args)

        # Run grid search; the objective closure appends into this list.
        results = []

        # Define hyperparameter optimization
        def objective(hyperparams: Dict[str, Union[int, float]]) -> float:
            # Convert hyperparms from float to int when necessary
            # (hyperopt samples all numeric dimensions as floats).
            for key in INT_KEYS:
                hyperparams[key] = int(hyperparams[key])

            # Copy args so each evaluation starts from a clean namespace.
            gs_args = deepcopy(dataset_args)

            for key, value in hyperparams.items():
                setattr(gs_args, key, value)

            # Record hyperparameters
            logger.info(hyperparams)

            # Cross validate
            mean_score, std_score = cross_validate(gs_args, TRAIN_LOGGER)

            # Record results
            temp_model = build_model(gs_args)
            num_params = param_count(temp_model)
            logger.info('num params: {:,}'.format(num_params))
            logger.info('{} +/- {} {}'.format(mean_score, std_score, metric))

            results.append({
                'mean_score': mean_score,
                'std_score': std_score,
                'hyperparams': hyperparams,
                'num_params': num_params
            })

            # Deal with nan: a nan classification score is treated as the
            # worst possible value (0); anything else is unrecoverable.
            if np.isnan(mean_score):
                if gs_args.dataset_type == 'classification':
                    mean_score = 0
                else:
                    raise ValueError(
                        'Can\'t handle nan score for non-classification dataset.'
                    )

            # fmin minimizes, so negate when a higher score is better.
            return (1 if gs_args.minimize_score else -1) * mean_score

        fmin(objective,
             SPACE,
             algo=tpe.suggest,
             max_evals=args.num_runs_per_dataset)

        # Report best result (nan runs excluded from the ranking).
        results = [
            result for result in results if not np.isnan(result['mean_score'])
        ]
        best_result = min(
            results,
            key=lambda result:
            (1 if dataset_args.minimize_score else -1) * result['mean_score'])
        logger.info('best')
        logger.info(best_result['hyperparams'])
        logger.info('num params: {:,}'.format(best_result['num_params']))
        logger.info('{} +/- {} {}'.format(best_result['mean_score'],
                                          best_result['std_score'], metric))
Exemplo n.º 25
0
    # Report best result
    results = [result for result in results if not np.isnan(result['mean_score'])]
    best_result = min(results, key=lambda result: (1 if args.minimize_score else -1) * result['mean_score'])
    logger.info('best')
    logger.info(best_result['hyperparams'])
    logger.info(f'num params: {best_result["num_params"]:,}')
    logger.info(f'{best_result["mean_score"]} +/- {best_result["std_score"]} {args.metric}')

    # Save best hyperparameter settings as JSON config file
    makedirs(args.config_save_path, isfile=True)

    with open(args.config_save_path, 'w') as f:
        json.dump(best_result['hyperparams'], f, indent=4, sort_keys=True)


if __name__ == '__main__':
    parser = ArgumentParser()
    add_train_args(parser)
    parser.add_argument('--num_iters', type=int, default=20,
                        help='Number of hyperparameter choices to try')
    parser.add_argument('--config_save_path', type=str, required=True,
                        help='Path to .json file where best hyperparameter settings will be written')
    parser.add_argument('--log_dir', type=str,
                        help='(Optional) Path to a directory where all results of the hyperparameter optimization will be written')
    # NOTE(review): real command-line arguments are ignored — this hard-coded
    # string is parsed instead (likely a debugging leftover; confirm intent).
    temp_input = '--data_path data/bbbp.csv --dataset_type classification --save_dir log/bbbp/model --gpu 0 --num_folds 10 --features_generator rdkit_2d_normalized --no_features_scaling --config_save_path log/bbbp/config --log_dir log/bbbp/temp'
    # Previously used argument string, kept for reference:
    #'--data_path data/tox21.csv --dataset_type classification --save_dir log/tox21_checkpoints --gpu 0 --num_iters 20 --config_save_path log/best_json --log_dir log/temp'
    args = parser.parse_args(temp_input.split())
    modify_train_args(args)  # adjust/derive the parsed arguments

    grid_search(args)
Exemplo n.º 26
0
def train():
    """Renders the train page and performs training if request method is POST.

    Mutates module-level globals ``PROGRESS`` and ``TRAINING`` (background
    progress-bar process state) and records the checkpoint in the database.
    """
    global PROGRESS, TRAINING

    warnings, errors = [], []

    if request.method == 'GET':
        return render_train()

    # Get arguments
    data_name, epochs, ensemble_size, checkpoint_name = \
        request.form['dataName'], int(request.form['epochs']), \
        int(request.form['ensembleSize']), request.form['checkpointName']
    gpu = request.form.get('gpu')
    data_path = os.path.join(app.config['DATA_FOLDER'], f'{data_name}.csv')
    dataset_type = request.form.get('datasetType', 'regression')

    # Create and modify args (parse_args([]) so server argv is ignored)
    parser = ArgumentParser()
    add_train_args(parser)
    args = parser.parse_args([])

    args.data_path = data_path
    args.dataset_type = dataset_type
    args.epochs = epochs
    args.ensemble_size = ensemble_size

    # Check if regression/classification selection matches data
    data = get_data(path=data_path)
    targets = data.targets()
    # Flatten the per-row target lists, skipping missing values.
    unique_targets = {target for row in targets for target in row if target is not None}

    # Classification requires every label to be 0 or 1 — hard error otherwise.
    if dataset_type == 'classification' and len(unique_targets - {0, 1}) > 0:
        errors.append('Selected classification dataset but not all labels are 0 or 1. Select regression instead.')

        return render_train(warnings=warnings, errors=errors)

    # Regression on all-binary labels is treated as a user mistake here too.
    if dataset_type == 'regression' and unique_targets <= {0, 1}:
        errors.append('Selected regression dataset but all labels are 0 or 1. Select classification instead.')

        return render_train(warnings=warnings, errors=errors)

    # 'None' (string) means the user explicitly selected CPU.
    if gpu is not None:
        if gpu == 'None':
            args.no_cuda = True
        else:
            args.gpu = int(gpu)

    current_user = request.cookies.get('currentUser')

    if not current_user:
        # Use DEFAULT as current user if the client's cookie is not set.
        current_user = app.config['DEFAULT_USER_ID']

    # Register the checkpoint in the DB first; the returned name may differ
    # from the requested one if it already existed.
    ckpt_id, ckpt_name = db.insert_ckpt(checkpoint_name,
                                        current_user,
                                        args.dataset_type,
                                        args.epochs,
                                        args.ensemble_size,
                                        len(targets))

    with TemporaryDirectory() as temp_dir:
        args.save_dir = temp_dir
        modify_train_args(args)

        # Progress bar runs in a separate process while training blocks here.
        process = mp.Process(target=progress_bar, args=(args, PROGRESS))
        process.start()
        TRAINING = 1

        # Run training
        logger = create_logger(name='train', save_dir=args.save_dir, quiet=args.quiet)
        task_scores = run_training(args, logger)
        process.join()

        # Reset globals
        TRAINING = 0
        PROGRESS = mp.Value('d', 0.0)

        # Check if name overlap
        if checkpoint_name != ckpt_name:
            warnings.append(name_already_exists_message('Checkpoint', checkpoint_name, ckpt_name))

        # Move models: every .pt under save_dir (one per ensemble member)
        # becomes a DB-registered model file in the checkpoint folder.
        for root, _, files in os.walk(args.save_dir):
            for fname in files:
                if fname.endswith('.pt'):
                    model_id = db.insert_model(ckpt_id)
                    save_path = os.path.join(app.config['CHECKPOINT_FOLDER'], f'{model_id}.pt')
                    shutil.move(os.path.join(args.save_dir, root, fname), save_path)

    return render_train(trained=True,
                        metric=args.metric,
                        num_tasks=len(args.task_names),
                        task_names=args.task_names,
                        task_scores=format_float_list(task_scores),
                        mean_score=format_float(np.mean(task_scores)),
                        warnings=warnings,
                        errors=errors)
Exemplo n.º 27
0
 def test_regression_default(self):
     """Default regression training should succeed without modification."""
     try:
         modify_train_args(self.args)
         cross_validate(self.args, self.logger)
     except Exception as e:
         # except Exception (not bare except) so Ctrl-C/SystemExit still abort.
         self.fail(f'regression_default: {e}')