Exemplo n.º 1
0
        }
    }
    # Inputs and numeric targets that are later passed to rf.fit below; the
    # label map sends 'cooperative' to 1 and 'additive' to 0.
    train = ct.get_feature_all(feature_dict)
    label = ct.get_numeric_label({'cooperative': 1, 'additive': 0})

    # Grid handed to BestModel as param_grid. Every entry holds a single value,
    # so only one combination is listed; the trailing comments presumably
    # record the wider ranges tried earlier -- TODO confirm.
    rf_param_grid = {
        'n_estimators': [1000],  #[500,750,1000],
        'max_depth': [10],  #[5,10,15],
        "min_samples_leaf": [10],  #[5,10,15],
        "min_samples_split": [10]  #[5,10,15]
    }

    # Display name -> result of BestModel(...).run_all(), using the training
    # frame built from the same feature_dict as `train` above.
    best_models = {
        "all":
        BestModel(clf="sklearn.ensemble.RandomForestClassifier",
                  param_grid=rf_param_grid,
                  train_data=ct.get_training_df(feature_dict)).run_all()
    }
    # Plot AUC-based metrics for the models above. cvfold=10 is presumably the
    # number of cross-validation folds -- verify against pl.plot_model_metrics.
    pl.plot_model_metrics(best_models,
                          cvfold=10,
                          score_type="auc",
                          varyline=True,
                          title="Average ROC Curves for Runx1-Ets1")

    # This is usually made based on the best model.
    # The hyperparameters below mirror the single values in rf_param_grid.
    rf = ensemble.RandomForestClassifier(n_estimators=1000,
                                         max_depth=10,
                                         min_samples_leaf=10,
                                         min_samples_split=10)
    rf.fit(train, label)
    # NOTE(review): model_name is not used in the visible lines; presumably it
    # is the file the fitted model is saved to -- confirm.
    model_name = "dist_ori_12merimads.sav"
    # Hyperparameter grid passed as param_grid to the BestModel
    # (RandomForestClassifier) calls that follow.
    rf_param_grid = {
        # NOTE(review): 100 breaks the otherwise ascending 500/.../1500 pattern
        # and may be a typo for 1000 -- confirm before relying on this grid.
        'n_estimators': [500, 100, 1500],
        'max_depth': [5, 10, 15],
        "min_samples_leaf": [10, 15, 20],
        "min_samples_split": [10, 15, 20]
    }

    # TODO: choose per orientation
    best_models = {
        "dist,ori":
        BestModel(clf="sklearn.ensemble.RandomForestClassifier",
                  param_grid=rf_param_grid,
                  train_data=cooptr.get_training_df({
                      "distance": {
                          "type": "numerical"
                      },
                      "orientation": {
                          "positive_cores": ["GGAA", "GGAT"],
                          "one_hot": True
                      }
                  }),
                  topn=10).run_all(),
        "dist,shape_inout":
        BestModel(clf="sklearn.ensemble.RandomForestClassifier",
                  param_grid=rf_param_grid,
                  train_data=cooptr.get_training_df({
                      "distance": {
                          "type": "numerical"
                      },
                      "shape_in": {
                          "seqin": 4,
                          "smode": "strength",
Exemplo n.º 3
0
    # Rename the 'independent' label value to 'additive' before building the
    # trainer (assumes df is a pandas DataFrame -- TODO confirm).
    df['label'] = df['label'].replace('independent', 'additive')
    ct = CoopTrain(df)
    # None removes the column limit when pandas prints a frame.
    pd.set_option("display.max_columns", None)

    # Grid handed to BestModel as param_grid. Every entry holds a single value;
    # the trailing comments presumably record the wider ranges tried earlier
    # -- TODO confirm.
    rf_param_grid = {
        'n_estimators': [500],  #[500,750,1000],
        'max_depth': [5],  #[5,10,15],
        "min_samples_leaf": [5],  #[5,10,15],
        "min_samples_split": [5]  #[5,10,15]
    }

    best_models = {
        "strength":
        BestModel(clf="sklearn.ensemble.RandomForestClassifier",
                  param_grid=rf_param_grid,
                  train_data=ct.get_training_df(
                      {"affinity": {
                          "colnames": ("ets_score", "runx_score")
                      }})).run_all(),
        "distance":
        BestModel(clf="sklearn.ensemble.RandomForestClassifier",
                  param_grid=rf_param_grid,
                  train_data=ct.get_training_df(
                      {"distance": {
                          "type": "numerical"
                      }})).run_all(),
        "orientation":
        BestModel(clf="sklearn.ensemble.RandomForestClassifier",
                  param_grid=rf_param_grid,
                  train_data=ct.get_training_df({
                      "orientation": {
                          "relative": False,
Exemplo n.º 4
0
    # None removes the column limit when pandas prints a frame.
    pd.set_option("display.max_columns", None)

    # Grid handed to BestModel as param_grid. Every entry holds a single value;
    # the trailing comments presumably record the wider ranges tried earlier
    # -- TODO confirm.
    rf_param_grid = {
        'n_estimators': [1000],  #[500,750,1000],
        'max_depth': [10],  #[5,10,15],
        "min_samples_leaf": [5],  #[5,10,15],
        "min_samples_split": [5]  #[5,10,15]
    }

    # Display name -> result of BestModel(...).run_all(); each visible entry
    # trains on a single "affinity" column with 'cooperative' mapped to 1 and
    # 'independent' mapped to 0.
    best_models = {
        # NOTE(review): the "Weaker site strength" entry reads "site_str_score"
        # while "Stronger site strength" reads "site_wk_score". If "str"/"wk"
        # abbreviate stronger/weaker, the display names are swapped -- confirm.
        "Weaker site strength":
        BestModel(clf="sklearn.ensemble.RandomForestClassifier",
                  param_grid=rf_param_grid,
                  train_data=ct.get_training_df(
                      {"affinity": {
                          "colnames": ["site_str_score"]
                      }},
                      label_map={
                          'cooperative': 1,
                          'independent': 0
                      })).run_all(),
        "Stronger site strength":
        BestModel(clf="sklearn.ensemble.RandomForestClassifier",
                  param_grid=rf_param_grid,
                  train_data=ct.get_training_df(
                      {"affinity": {
                          "colnames": ["site_wk_score"]
                      }},
                      label_map={
                          'cooperative': 1,
                          'independent': 0
                      })).run_all(),
Exemplo n.º 5
0
        "min_samples_leaf": [10],  #[5,10,15],
        "min_samples_split": [20],  #[5,10,15]
    }

    best_models = {
        "distance,orientation":
        BestModel(clf="sklearn.ensemble.RandomForestClassifier",
                  param_grid=rf_param_grid,
                  train_data=ct.get_training_df(
                      {
                          "distance": {
                              "type": "numerical"
                          },
                          "orientation": {
                              "relative": rel_ori,
                              "one_hot": one_hot_ori,
                              "pos_cols": {
                                  "%s_pos" % s1: "%s_ori" % s1,
                                  "%s_pos" % s2: "%s_ori" % s2
                              }
                          },
                      },
                      label_map={
                          'cooperative': 1,
                          'independent': 0
                      })).run_all(),
        "distance,orientation,sequence":
        BestModel(clf="sklearn.ensemble.RandomForestClassifier",
                  param_grid=rf_param_grid,
                  train_data=ct.get_training_df(
                      {
                          "distance": {
    # Grid passed as param_grid to the BestModel calls that follow. Every entry
    # holds a single value; the trailing comments presumably record the wider
    # ranges tried earlier -- TODO confirm.
    rf_param_grid = {
        'n_estimators': [500],  #[500,750,1000],
        'max_depth': [10],  #[5,10,15],
        "min_samples_leaf": [10],  #[5,10,15],
        "min_samples_split": [20],  #[5,10,15]
    }

    best_models = {
        "distance,orientation":
        BestModel(clf="sklearn.ensemble.RandomForestClassifier",
                  param_grid=rf_param_grid,
                  train_data=ct.get_training_df({
                      "distance": {
                          "type": "numerical"
                      },
                      "orientation": {
                          "relative": False,
                          "pos_cols": {
                              "ets_pos": "ets_ori",
                              "runx_pos": "runx_ori"
                          }
                      }
                  })).run_all(),
        "distance,orientation,sequence":
        BestModel(clf="sklearn.ensemble.RandomForestClassifier",
                  param_grid=rf_param_grid,
                  train_data=ct.get_training_df({
                      "distance": {
                          "type": "numerical"
                      },
                      "orientation": {
                          "relative": False,
Exemplo n.º 7
0
    # using custom imads model
    # Build one iMADSModel per (model file, core) pair from the "w8" files,
    # passing 8 and [1, 2, 3] as the remaining arguments, then wrap the list in
    # an iMADS object.
    # NOTE(review): 0.19 is presumably a score threshold/cutoff and the trailing
    # 0.2128 an earlier value -- confirm against iMADS.
    # NOTE(review): imads8 is not referenced in the visible lines; only imads12
    # is used further down.
    imads8_paths = ["input/site_models/imads_models/Ets1_w8_GGAA.model", "input/site_models/imads_models/Ets1_w8_GGAT.model"]
    imads8_cores = ["GGAA", "GGAT"]
    imads8_models = [iMADSModel(path, core, 8, [1, 2, 3]) for path, core in zip(imads8_paths, imads8_cores)]
    imads8 = iMADS(imads8_models, 0.19) # 0.2128

    # Same construction for the "w12" model files, passing 12 instead of 8.
    imads12_paths = ["input/site_models/imads_models/Ets1_w12_GGAA.model", "input/site_models/imads_models/Ets1_w12_GGAT.model"]
    imads12_cores = ["GGAA", "GGAT"]
    imads12_models = [iMADSModel(path, core, 12, [1, 2, 3]) for path, core in zip(imads12_paths, imads12_cores)]
    imads12 = iMADS(imads12_models, 0.19) # 0.2128

    best_models = {
        "distance":
            BestModel(clf="sklearn.ensemble.RandomForestClassifier",
              param_grid=rf_param_grid,
              train_data=cooptr.get_training_df({
                    "distance":{"type":"numerical"}
                })
            ).run_all(),
        "orientation":
            BestModel(clf="sklearn.ensemble.RandomForestClassifier",
              param_grid=rf_param_grid,
              train_data=cooptr.get_training_df({
                    "orientation": {"positive_cores":["GGAA","GGAT"], "one_hot":True}
                })
            ).run_all(),
        "strength":
            BestModel(clf="sklearn.ensemble.RandomForestClassifier",
              param_grid=rf_param_grid,
              train_data=cooptr.get_training_df({
                    "affinity": {"imads": imads12}
                })
Exemplo n.º 8
0
        'n_estimators': [500, 1000, 1500],
        'max_depth': [5, 10, 15],
        "min_samples_leaf": [5, 10, 15],
        "min_samples_split": [5, 10, 15]
    }

    orientations = ["HH", "TT", "HT/TH"]
    for ori in orientations:
        curdf = df[df["orientation"] == ori]
        curct = CoopTrain(curdf, corelen=4, positive_cores=["GGAA", "GGAT"])
        best_models = {
            "distance":
            BestModel(
                clf="sklearn.ensemble.RandomForestClassifier",
                param_grid=rf_param_grid,
                train_data=curct.get_training_df(
                    {"distance": {
                        "type": "numerical"
                    }}),
            ).run_all(score_type=score_type),
            "shape":
            BestModel(
                clf="sklearn.ensemble.RandomForestClassifier",
                param_grid=rf_param_grid,
                train_data=curct.get_training_df({
                    "shape_in": {
                        "seqin": 4,
                        "smode": "positional",
                        "direction": "inout"
                    },  # maximum seqin is 4
                    "shape_out": {
                        "seqin": -4,