예제 #1
0
def bace_rf_model(mode="classification", verbosity="high", split="20-80"):
    """Train random forests on the BACE dataset and score every subset.

    A small hyperparameter grid is handed to
    ``HyperparamOpt.hyperparam_search`` together with the train and
    validation datasets.  The model it returns is then evaluated on each
    non-empty dataset produced by ``load_bace``; for every evaluated subset
    the scores are printed and ``rf_<mode>_<split>_<subset>.csv`` and
    ``rf_<mode>_<split>_<subset>_stats.txt`` are passed to
    ``compute_model_performance`` as output paths.

    Args:
        mode: ``"regression"`` or ``"classification"``.  Selects the
            scikit-learn estimator class and the metric set.
        verbosity: Forwarded to every ``Metric`` and ``Evaluator``.
        split: Forwarded to ``load_bace`` and embedded in output file names.

    Returns:
        None.

    Raises:
        ValueError: If ``mode`` is neither of the two supported values.
    """
    (_, train_dataset, valid_dataset, test_dataset, crystal_dataset,
     transformers) = load_bace(mode=mode, transform=False, split=split)

    if mode == "regression":
        r2_metric = Metric(metrics.r2_score, verbosity=verbosity)
        rms_metric = Metric(metrics.rms_score, verbosity=verbosity)
        mae_metric = Metric(metrics.mae_score, verbosity=verbosity)
        all_metrics = [r2_metric, rms_metric, mae_metric]
        # Metric handed to the hyperparameter search.
        metric = r2_metric
        model_class = RandomForestRegressor
    elif mode == "classification":
        roc_auc_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
        accuracy_metric = Metric(metrics.accuracy_score, verbosity=verbosity)
        mcc_metric = Metric(metrics.matthews_corrcoef, verbosity=verbosity)
        # Note sensitivity = recall
        recall_metric = Metric(metrics.recall_score, verbosity=verbosity)
        all_metrics = [
            accuracy_metric, mcc_metric, recall_metric, roc_auc_metric
        ]
        # Metric handed to the hyperparameter search.
        metric = roc_auc_metric
        model_class = RandomForestClassifier
    else:
        raise ValueError("Invalid mode %s" % mode)

    def rf_model_builder(model_params, model_dir):
        """Wrap a new ``model_class`` estimator in a ``SklearnModel``."""
        return SklearnModel(model_class(**model_params), model_dir)

    params_dict = {
        "n_estimators": [10, 100],
        "max_features": ["auto", "sqrt", "log2", None],
    }

    optimizer = HyperparamOpt(rf_model_builder, verbosity="low")
    # Only the first returned value (the model) is used below.
    best_rf, _, _ = optimizer.hyperparam_search(
        params_dict, train_dataset, valid_dataset, transformers, metric=metric)

    # (file-name tag, dataset, message template).  The templates are kept
    # verbatim -- including the missing "scores" in the last two -- so the
    # printed output is unchanged.
    evaluations = [
        ("train", train_dataset, "RF Train set scores: %s"),
        ("valid", valid_dataset, "RF Valid set scores: %s"),
        ("test", test_dataset, "RF Test set: %s"),
        ("crystal", crystal_dataset, "RF Crystal set: %s"),
    ]
    for tag, dataset, message in evaluations:
        if len(dataset) == 0:
            # Empty subsets are skipped.
            continue
        evaluator = Evaluator(best_rf,
                              dataset,
                              transformers,
                              verbosity=verbosity)
        csv_out = "rf_%s_%s_%s.csv" % (mode, split, tag)
        stats_out = "rf_%s_%s_%s_stats.txt" % (mode, split, tag)
        scores = evaluator.compute_model_performance(
            all_metrics, csv_out=csv_out, stats_out=stats_out)
        print(message % (str(scores)))
예제 #2
0
def bace_rf_model(mode="classification", verbosity="high", split="20-80"):
  """Train random forests on the BACE dataset and score every subset.

  A small hyperparameter grid is handed to
  ``HyperparamOpt.hyperparam_search`` together with the train and validation
  datasets.  The model it returns is then evaluated on each non-empty dataset
  produced by ``load_bace``; for every evaluated subset the scores are
  printed and ``rf_<mode>_<split>_<subset>.csv`` and
  ``rf_<mode>_<split>_<subset>_stats.txt`` are passed to
  ``compute_model_performance`` as output paths.

  Args:
    mode: ``"regression"`` or ``"classification"``.  Selects the
      scikit-learn estimator class and the metric set.
    verbosity: Forwarded to every ``Metric`` and ``Evaluator``.
    split: Forwarded to ``load_bace`` and embedded in output file names.

  Returns:
    None.

  Raises:
    ValueError: If ``mode`` is neither of the two supported values.
  """
  (_, train_dataset, valid_dataset, test_dataset, crystal_dataset,
   transformers) = load_bace(mode=mode, transform=False, split=split)

  if mode == "regression":
    r2_metric = Metric(metrics.r2_score, verbosity=verbosity)
    rms_metric = Metric(metrics.rms_score, verbosity=verbosity)
    mae_metric = Metric(metrics.mae_score, verbosity=verbosity)
    all_metrics = [r2_metric, rms_metric, mae_metric]
    # Metric handed to the hyperparameter search.
    metric = r2_metric
    model_class = RandomForestRegressor
  elif mode == "classification":
    roc_auc_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
    accuracy_metric = Metric(metrics.accuracy_score, verbosity=verbosity)
    mcc_metric = Metric(metrics.matthews_corrcoef, verbosity=verbosity)
    # Note sensitivity = recall
    recall_metric = Metric(metrics.recall_score, verbosity=verbosity)
    all_metrics = [accuracy_metric, mcc_metric, recall_metric, roc_auc_metric]
    # Metric handed to the hyperparameter search.
    metric = roc_auc_metric
    model_class = RandomForestClassifier
  else:
    raise ValueError("Invalid mode %s" % mode)

  def rf_model_builder(model_params, model_dir):
    """Wrap a new ``model_class`` estimator in a ``SklearnModel``."""
    return SklearnModel(model_class(**model_params), model_dir)

  params_dict = {
      "n_estimators": [10, 100],
      "max_features": ["auto", "sqrt", "log2", None],
  }

  optimizer = HyperparamOpt(rf_model_builder, verbosity="low")
  # Only the first returned value (the model) is used below.
  best_rf, _, _ = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, transformers,
      metric=metric)

  # (file-name tag, dataset, message template).  The templates are kept
  # verbatim -- including the missing "scores" in the last two -- so the
  # printed output is unchanged.
  evaluations = [
      ("train", train_dataset, "RF Train set scores: %s"),
      ("valid", valid_dataset, "RF Valid set scores: %s"),
      ("test", test_dataset, "RF Test set: %s"),
      ("crystal", crystal_dataset, "RF Crystal set: %s"),
  ]
  for tag, dataset, message in evaluations:
    if len(dataset) == 0:
      # Empty subsets are skipped.
      continue
    evaluator = Evaluator(best_rf, dataset, transformers,
                          verbosity=verbosity)
    csv_out = "rf_%s_%s_%s.csv" % (mode, split, tag)
    stats_out = "rf_%s_%s_%s_stats.txt" % (mode, split, tag)
    scores = evaluator.compute_model_performance(
        all_metrics, csv_out=csv_out, stats_out=stats_out)
    print(message % (str(scores)))
예제 #3
0
                params_model["batch_size"] = [50]

            if model_name == "Weave":
                params_model = params_dict[model_name]
                params_model["n_tasks"] = [len(wang_tasks)]
                params_model["batch_size"] = [50]

            def model_builder(model_params, model_dir):
                """Instantiate ``model_obj`` for one hyperparameter combination.

                ``model_params`` is expanded into keyword arguments and
                ``model_dir`` is passed through as the ``model_dir`` keyword.
                """
                built_model = model_obj(**model_params, model_dir=model_dir)
                return built_model

            # Pearson metric won't be applicable for small scaffolds!
            # For smaller size scaffolds use rms
            metric = dc.metrics.Metric(dc.metrics.mae_score)
            opt = HyperparamOpt(model_builder)
            model, score_best, all_results = opt.hyperparam_search(
                params_model, wang_train, wang_valid, wang_transformers,
                metric)

            logging.info(f"Best score for {model_name}: {score_best}")
            logging.info(f"All results for {model_name}: {all_results}")
            #logging.info(f"Best params for {model_name}: {params_best}")
            params_save[model_name] = str(deepcopy(score_best))
            #model = model_obj(**params_best)

            logging.info(f"Fitting the best model model: {model_name}")
            model.fit(wang_train, nb_epoch=10)

            train_scores = model.evaluate(wang_train, [metric],
                                          wang_transformers)
            valid_scores = generate_scaffold_metrics(model, wang_valid, metric,
                                                     wang_transformers)
예제 #4
0
def bace_dnn_model(mode="classification", verbosity="high", split="20-80"):
  """Train fully-connected DNNs on the BACE dataset and score every subset.

  Five random learning rates and five random decay values are drawn and
  handed to ``HyperparamOpt.hyperparam_search`` together with the train and
  validation datasets.  The model it returns is then evaluated on each
  non-empty dataset produced by ``load_bace``; for every evaluated subset the
  scores are printed and ``dnn_<mode>_<split>_<subset>.csv`` and
  ``dnn_<mode>_<split>_<subset>_stats.txt`` are passed to
  ``compute_model_performance`` as output paths.

  Args:
    mode: ``"regression"`` or ``"classification"``.  Selects the metric set
      and is passed to ``MultiTaskDNN`` as its third positional argument.
    verbosity: Forwarded to every ``Metric``.
    split: Forwarded to ``load_bace`` and embedded in output file names.

  Returns:
    None.

  Raises:
    ValueError: If ``mode`` is neither of the two supported values.
  """
  (bace_tasks, train_dataset, valid_dataset, test_dataset, crystal_dataset,
   transformers) = load_bace(mode=mode, transform=True, split=split)

  if mode == "regression":
    r2_metric = Metric(metrics.r2_score, verbosity=verbosity)
    rms_metric = Metric(metrics.rms_score, verbosity=verbosity)
    mae_metric = Metric(metrics.mae_score, verbosity=verbosity)
    all_metrics = [r2_metric, rms_metric, mae_metric]
    # Metric handed to the hyperparameter search.
    metric = r2_metric
  elif mode == "classification":
    roc_auc_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
    accuracy_metric = Metric(metrics.accuracy_score, verbosity=verbosity)
    mcc_metric = Metric(metrics.matthews_corrcoef, verbosity=verbosity)
    # Note sensitivity = recall
    recall_metric = Metric(metrics.recall_score, verbosity=verbosity)
    all_metrics = [accuracy_metric, mcc_metric, recall_metric, roc_auc_metric]
    # Metric handed to the hyperparameter search.
    metric = roc_auc_metric
  else:
    raise ValueError("Invalid mode %s" % mode)

  # Log-uniform samples: learning rate in [1e-5, 1e-3), decay in [1e-6, 1e-4).
  # Not seeded here, so the grid differs between runs.
  params_dict = {
      "learning_rate": np.power(10., np.random.uniform(-5, -3, size=5)),
      "decay": np.power(10., np.random.uniform(-6, -4, size=5)),
      "nb_epoch": [40],
  }

  n_features = train_dataset.get_data_shape()[0]

  def model_builder(model_params, model_dir):
    """Build a ``MultiTaskDNN`` for ``mode`` wrapped in a ``KerasModel``."""
    # Pass the requested mode instead of a hard-coded "classification" so
    # that regression runs do not build a classification network.
    keras_model = MultiTaskDNN(
        len(bace_tasks), n_features, mode, dropout=.5,
        **model_params)
    return KerasModel(keras_model, model_dir)

  optimizer = HyperparamOpt(model_builder, verbosity="low")
  # Only the first returned value (the model) is used below.
  best_dnn, _, _ = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, transformers,
      metric=metric)

  # (file-name tag, label used in the printed message, dataset).
  evaluations = [
      ("train", "Train", train_dataset),
      ("valid", "Valid", valid_dataset),
      ("test", "Test", test_dataset),
      ("crystal", "Crystal", crystal_dataset),
  ]
  for tag, label, dataset in evaluations:
    if len(dataset) == 0:
      # Empty subsets are skipped.
      continue
    # NOTE(review): verbosity is not forwarded to Evaluator in this
    # function -- confirm whether that is intentional.
    evaluator = Evaluator(best_dnn, dataset, transformers)
    csv_out = "dnn_%s_%s_%s.csv" % (mode, split, tag)
    stats_out = "dnn_%s_%s_%s_stats.txt" % (mode, split, tag)
    scores = evaluator.compute_model_performance(
        all_metrics, csv_out=csv_out, stats_out=stats_out)
    print("DNN %s set %s: %s" % (label, metric.name, str(scores)))