"distance":{"type":"numerical"}, "affinity": {"colnames": ("ets1_score","runx1_score")} }, label_map={'cooperative': 1, 'independent': 0}) ).run_all(), "distance,orientation,strength": BestModel(clf="sklearn.ensemble.RandomForestClassifier", param_grid=rf_param_grid, train_data=ct.get_training_df({ "distance":{"type":"numerical"}, "affinity": {"colnames": ("ets1_score","runx1_score")}, "orientation": {"relative":False, "pos_cols": {"ets1_pos":"ets1_ori", "runx1_pos":"runx1_ori"}} }, label_map={'cooperative': 1, 'independent': 0}) ).run_all(), } pl.plot_model_metrics(best_models, path="%s/model/pr.png" % basepath, cvfold=10, score_type="pr", varyline=True, title="Average ROC Curves for Ets1-Runx1") feature_dict = { "distance":{"type":"numerical"}, "affinity": {"colnames": ("ets1_score","runx1_score")}, "orientation": {"relative":False, "pos_cols": {"ets1_pos":"ets1_ori", "runx1_pos":"runx1_ori"}} } train = ct.get_feature_all(feature_dict) label = ct.get_numeric_label({'cooperative': 1, 'independent': 0}) rf = best_models["distance,orientation,strength"][1] rf.fit(train,label) model_name = "%s/model/ets1_runx1_rfmodel.sav" % basepath pickle.dump(rf, open(model_name, 'wb')) print("Model saved in %s" % model_name) # tree.export_graphviz(m.estimators_[5], out_file='tree.dot',
# Hyper-parameter grid handed to BestModel.  Each entry holds a single value;
# the alternative values are kept in the trailing comments.
rf_param_grid = {
    'n_estimators': [1000],       # [500,750,1000]
    'max_depth': [10],            # [5,10,15]
    "min_samples_leaf": [10],     # [5,10,15]
    "min_samples_split": [10]     # [5,10,15]
}

# Run the model search on the full feature set described by feature_dict.
best_models = {
    "all": BestModel(
        clf="sklearn.ensemble.RandomForestClassifier",
        param_grid=rf_param_grid,
        train_data=ct.get_training_df(feature_dict)
    ).run_all()
}

pl.plot_model_metrics(
    best_models,
    cvfold=10,
    score_type="auc",
    varyline=True,
    title="Average ROC Curves for Runx1-Ets1"
)

# This is usually made based on the best model
# NOTE(review): no random_state is passed, so the fitted (and saved) forest
# is not reproducible between runs -- consider pinning a seed.
rf = ensemble.RandomForestClassifier(
    n_estimators=1000,
    max_depth=10,
    min_samples_leaf=10,
    min_samples_split=10
)
rf.fit(train, label)

# Persist the fitted model.  The file is opened in a `with` block so the
# handle is flushed and closed even if pickling raises.
model_name = "dist_ori_12merimads.sav"
with open(model_name, 'wb') as model_file:
    pickle.dump(rf, model_file)
print("Model saved in %s" % model_name)

# tree.export_graphviz(m.estimators_[5], out_file='tree.dot',
#                      feature_names = xt_df.columns,
"dist,ori,shape_ori": BestModel(clf="sklearn.ensemble.RandomForestClassifier", param_grid=rf_param_grid, train_data=cooptr.get_training_df({ "distance": { "type": "numerical" }, "shape_in": { "seqin": 4, "smode": "strength", "direction": "orientation", "positive_cores": ["GGAA", "GGAT"] }, "shape_ou": { "seqin": -4, "smode": "strength", "direction": "orientation", "positive_cores": ["GGAA", "GGAT"] }, "orientation": { "positive_cores": ["GGAA", "GGAT"], "one_hot": True } }), topn=10).run_all() } pl.plot_model_metrics(best_models, cvfold=10, score_type="auc", varyline=True)
"one_hot": one_hot_ori, "pos_cols": { "%s_pos" % s1: "%s_ori" % s1, "%s_pos" % s2: "%s_ori" % s2 } }, }, label_map={ 'cooperative': 1, 'independent': 0 })).run_all(), } pl.plot_model_metrics(best_models, path="%s/model/auc_posfeatures.png" % basepath, cvfold=10, score_type="auc", varyline=True, title="AUC Shape features") rf = best_models["distance,orientation,shape"][1] train = ct.get_feature_all({ "distance": { "type": "numerical" }, "orientation": { "relative": rel_ori, "one_hot": one_hot_ori, "pos_cols": { "%s_pos" % s1: "%s_ori" % s1, "%s_pos" % s2: "%s_ori" % s2 }
"one_hot": True, "pos_cols": { "site_str_pos": "site_str_ori", "site_wk_pos": "site_wk_ori" } } }, label_map={ 'cooperative': 1, 'independent': 0 })).run_all() } pl.plot_model_metrics(best_models, path="output/Ets1Ets1/model/auc_all.png", cvfold=10, score_type="auc", varyline=True, title="Average ROC Curves for Ets1-Ets1") rf = best_models["distance,orientation,strength"][1] train = ct.get_feature_all({ "distance": { "type": "numerical" }, "affinity": { "colnames": ("site_str_score", "site_wk_score") }, "orientation": { "relative": True, "one_hot": True,
"colnames": ("ets_score", "runx_score") }, "orientation": { "relative": False, "pos_cols": { "ets_pos": "ets_ori", "runx_pos": "runx_ori" } } })).run_all(), } pl.plot_model_metrics( best_models, cvfold=10, score_type="auc", varyline=True, title= "Average ROC Curves for Ets1-Runx1\n(using shape and sequence features)" ) # feature_dict = { # "distance":{"type":"numerical"}, # "affinity": {"colnames": ("ets_score","runx_score")}, # "orientation": {"relative":False, "pos_cols": {"ets_pos":"ets_ori", "runx_pos":"runx_ori"}} # } # train = ct.get_feature_all(feature_dict) # label = ct.get_numeric_label({'cooperative': 1, 'additive': 0}) # rf = ensemble.RandomForestClassifier(n_estimators=500, max_depth=5, min_samples_leaf=10, min_samples_split=10) # m = rf.fit(train.values.tolist(),label) # # tree.export_graphviz(m.estimators_[5], out_file='tree.dot',
"direction": "inout" } }), ).run_all(score_type=score_type), "dist,shape-top10": BestModel( clf="sklearn.ensemble.RandomForestClassifier", param_grid=rf_param_grid, train_data=curct.get_training_df({ "distance": { "type": "numerical" }, "shape_in": { "seqin": 4, "smode": "positional", "direction": "inout" }, # maximum seqin is 4 "shape_out": { "seqin": -4, "smode": "positional", "direction": "inout" } }), topn=10).run_all(score_type=score_type), } oriname = ori.replace("/", "") pl.plot_model_metrics(best_models, cvfold=10, score_type=score_type, plotname="auc_%s.png" % oriname)