Exemplo n.º 1
0
                    "distance":{"type":"numerical"},
                    "affinity": {"colnames": ("ets1_score","runx1_score")}
                }, label_map={'cooperative': 1, 'independent': 0})
            ).run_all(),
        "distance,orientation,strength":
            BestModel(clf="sklearn.ensemble.RandomForestClassifier",
              param_grid=rf_param_grid,
              train_data=ct.get_training_df({
                    "distance":{"type":"numerical"},
                    "affinity": {"colnames": ("ets1_score","runx1_score")},
                    "orientation": {"relative":False, "pos_cols": {"ets1_pos":"ets1_ori", "runx1_pos":"runx1_ori"}}
                }, label_map={'cooperative': 1, 'independent': 0})
            ).run_all(),
    }

    # Plot precision-recall metrics (score_type="pr", cvfold=10) for every entry
    # in best_models. The title previously read "ROC" even though this call
    # scores with "pr" and targets pr.png, which mislabeled the figure.
    pl.plot_model_metrics(best_models, path="%s/model/pr.png" % basepath, cvfold=10, score_type="pr", varyline=True, title="Average PR Curves for Ets1-Runx1")

    # Feature specification used to refit the chosen model; it is the same
    # specification as the "distance,orientation,strength" entry of best_models.
    feature_dict = {
        "distance":{"type":"numerical"},
        "affinity": {"colnames": ("ets1_score","runx1_score")},
        "orientation": {"relative":False, "pos_cols": {"ets1_pos":"ets1_ori", "runx1_pos":"runx1_ori"}}
    }
    train = ct.get_feature_all(feature_dict)
    label = ct.get_numeric_label({'cooperative': 1, 'independent': 0})
    # Element 1 of the run_all() result is used as the estimator to refit.
    # NOTE(review): presumably the best classifier found by the search - confirm.
    rf = best_models["distance,orientation,strength"][1]
    rf.fit(train,label)
    model_name = "%s/model/ets1_runx1_rfmodel.sav" % basepath
    # Use a context manager so the file handle is flushed and closed
    # deterministically instead of being left open until garbage collection.
    with open(model_name, 'wb') as model_file:
        pickle.dump(rf, model_file)
    print("Model saved in %s" % model_name)

    # tree.export_graphviz(m.estimators_[5], out_file='tree.dot',
Exemplo n.º 2
0
    # Hyper-parameter grid handed to BestModel. Each list holds a single value;
    # the trailing comments keep wider candidate lists for reference.
    rf_param_grid = {
        'n_estimators': [1000],  #[500,750,1000],
        'max_depth': [10],  #[5,10,15],
        "min_samples_leaf": [10],  #[5,10,15],
        "min_samples_split": [10]  #[5,10,15]
    }

    # Single candidate model built from feature_dict (defined outside this span).
    best_models = {
        "all":
        BestModel(clf="sklearn.ensemble.RandomForestClassifier",
                  param_grid=rf_param_grid,
                  train_data=ct.get_training_df(feature_dict)).run_all()
    }
    pl.plot_model_metrics(best_models,
                          cvfold=10,
                          score_type="auc",
                          varyline=True,
                          title="Average ROC Curves for Runx1-Ets1")

    # The final classifier is normally configured from the best model; here the
    # hyper-parameters are hard-coded to the same values as rf_param_grid.
    rf = ensemble.RandomForestClassifier(n_estimators=1000,
                                         max_depth=10,
                                         min_samples_leaf=10,
                                         min_samples_split=10)
    # train and label are defined outside this span.
    rf.fit(train, label)
    model_name = "dist_ori_12merimads.sav"
    # Use a context manager so the file handle is flushed and closed
    # deterministically instead of being left open until garbage collection.
    with open(model_name, 'wb') as model_file:
        pickle.dump(rf, model_file)
    print("Model saved in %s" % model_name)

    # tree.export_graphviz(m.estimators_[5], out_file='tree.dot',
    #         feature_names = xt_df.columns,
        "dist,ori,shape_ori":
        BestModel(clf="sklearn.ensemble.RandomForestClassifier",
                  param_grid=rf_param_grid,
                  train_data=cooptr.get_training_df({
                      "distance": {
                          "type": "numerical"
                      },
                      "shape_in": {
                          "seqin": 4,
                          "smode": "strength",
                          "direction": "orientation",
                          "positive_cores": ["GGAA", "GGAT"]
                      },
                      "shape_ou": {
                          "seqin": -4,
                          "smode": "strength",
                          "direction": "orientation",
                          "positive_cores": ["GGAA", "GGAT"]
                      },
                      "orientation": {
                          "positive_cores": ["GGAA", "GGAT"],
                          "one_hot": True
                      }
                  }),
                  topn=10).run_all()
    }
    # Plot AUC-scored metrics (cvfold=10) for the models in best_models.
    # No path or title is passed, so the plotting helper's defaults apply.
    pl.plot_model_metrics(best_models,
                          cvfold=10,
                          score_type="auc",
                          varyline=True)
Exemplo n.º 4
0
                              "one_hot": one_hot_ori,
                              "pos_cols": {
                                  "%s_pos" % s1: "%s_ori" % s1,
                                  "%s_pos" % s2: "%s_ori" % s2
                              }
                          },
                      },
                      label_map={
                          'cooperative': 1,
                          'independent': 0
                      })).run_all(),
    }

    # Plot AUC-scored metrics (cvfold=10) for the models in best_models.
    # NOTE(review): path is presumably the output image location under
    # <basepath>/model - confirm against pl.plot_model_metrics.
    pl.plot_model_metrics(best_models,
                          path="%s/model/auc_posfeatures.png" % basepath,
                          cvfold=10,
                          score_type="auc",
                          varyline=True,
                          title="AUC Shape features")

    # Take element 1 of the "distance,orientation,shape" run_all() result.
    # NOTE(review): presumably the selected estimator - confirm.
    rf = best_models["distance,orientation,shape"][1]
    train = ct.get_feature_all({
        "distance": {
            "type": "numerical"
        },
        "orientation": {
            "relative": rel_ori,
            "one_hot": one_hot_ori,
            "pos_cols": {
                "%s_pos" % s1: "%s_ori" % s1,
                "%s_pos" % s2: "%s_ori" % s2
            }
                              "one_hot": True,
                              "pos_cols": {
                                  "site_str_pos": "site_str_ori",
                                  "site_wk_pos": "site_wk_ori"
                              }
                          }
                      },
                      label_map={
                          'cooperative': 1,
                          'independent': 0
                      })).run_all()
    }

    # Plot AUC-scored metrics (cvfold=10) for the models in best_models.
    # NOTE(review): path is hard-coded relative to the working directory and is
    # presumably the output image location - confirm.
    pl.plot_model_metrics(best_models,
                          path="output/Ets1Ets1/model/auc_all.png",
                          cvfold=10,
                          score_type="auc",
                          varyline=True,
                          title="Average ROC Curves for Ets1-Ets1")

    # Take element 1 of the "distance,orientation,strength" run_all() result.
    # NOTE(review): presumably the selected estimator - confirm.
    rf = best_models["distance,orientation,strength"][1]

    train = ct.get_feature_all({
        "distance": {
            "type": "numerical"
        },
        "affinity": {
            "colnames": ("site_str_score", "site_wk_score")
        },
        "orientation": {
            "relative": True,
            "one_hot": True,
                          "colnames": ("ets_score", "runx_score")
                      },
                      "orientation": {
                          "relative": False,
                          "pos_cols": {
                              "ets_pos": "ets_ori",
                              "runx_pos": "runx_ori"
                          }
                      }
                  })).run_all(),
    }

    # Plot AUC-scored metrics (cvfold=10) for the models in best_models.
    # The title contains a newline, giving a two-line heading; no path is passed.
    pl.plot_model_metrics(
        best_models,
        cvfold=10,
        score_type="auc",
        varyline=True,
        title=
        "Average ROC Curves for Ets1-Runx1\n(using shape and sequence features)"
    )

    # feature_dict = {
    #     "distance":{"type":"numerical"},
    #     "affinity": {"colnames": ("ets_score","runx_score")},
    #     "orientation": {"relative":False, "pos_cols": {"ets_pos":"ets_ori", "runx_pos":"runx_ori"}}
    # }
    # train = ct.get_feature_all(feature_dict)
    # label = ct.get_numeric_label({'cooperative': 1, 'additive': 0})
    # rf = ensemble.RandomForestClassifier(n_estimators=500, max_depth=5, min_samples_leaf=10, min_samples_split=10)
    # m = rf.fit(train.values.tolist(),label)
    #
    # tree.export_graphviz(m.estimators_[5], out_file='tree.dot',
Exemplo n.º 7
0
                        "direction": "inout"
                    }
                }),
            ).run_all(score_type=score_type),
            "dist,shape-top10":
            BestModel(
                clf="sklearn.ensemble.RandomForestClassifier",
                param_grid=rf_param_grid,
                train_data=curct.get_training_df({
                    "distance": {
                        "type": "numerical"
                    },
                    "shape_in": {
                        "seqin": 4,
                        "smode": "positional",
                        "direction": "inout"
                    },  # maximum seqin is 4
                    "shape_out": {
                        "seqin": -4,
                        "smode": "positional",
                        "direction": "inout"
                    }
                }),
                topn=10).run_all(score_type=score_type),
        }
        # Remove every "/" from ori so the value can be embedded in the plot
        # file name below.
        oriname = ori.replace("/", "")
        # Plot metrics for this ori's models using the caller-selected
        # score_type; the file name is "auc_<oriname>.png" whatever score_type is.
        pl.plot_model_metrics(best_models,
                              cvfold=10,
                              score_type=score_type,
                              plotname="auc_%s.png" % oriname)