# Exploratory script: reads probe/array tables, writes the de-duplicated
# sequence columns of the first chamber file to "seqsorig.csv", then stops.
# `pd`, `basepath`, `arr`, `get_wtdf`, `get_wtmt`, `iMADSModel`, `iMADS` and
# `SitesPlotter` are expected to be provided elsewhere in this file.

# --- Tables used to build the wild-type frame --------------------------------
seqlbled_path = "%s/chip2probe/output/homotypic/training/seqlbled.csv" % basepath
seqlbled = pd.read_csv(seqlbled_path)

custom_probes_path = (
    "%s/chip2probe/output/array_design_files/Coop2Ets_validation/custom_probes_selected.csv"
    % basepath
)
wtdf = get_wtdf(custom_probes_path, seqlbled)

# --- First chamber file (read with key "Coop1Ets") ---------------------------
orig_chamber_path = (
    "%s/probedata/191030_coop-PBM_Ets1_v1_2nd/2.processed_gpr/20191004_258614510001_ETS1_550_5_1-4_alldata.txt"
    % basepath
)
origdf, neg_orig = arr.read_chamber_file(
    orig_chamber_path,
    seqcols=["Name", "type", "rep", "ori"],
    negcols=["Name", "rep", "ori"],
    key="Coop1Ets",
)

# Dump the distinct (Sequence, type, ori) rows, without the index column.
orig_unique_rows = origdf[["Sequence", "type", "ori"]].drop_duplicates()
orig_unique_rows.to_csv("seqsorig.csv", index=False)

# Hard stop: the interpreter exits here, so nothing below this point runs
# unless these two lines are removed.
import sys
sys.exit()

# --- Chamber files read with key "Coop2Ets" (10nM / 20nM / 30nM file names) --
cust10_path = (
    "%s/probedata/201128_validation_array_ets1_v2_1/10nMEts1_alexa488_550_20_alldata.txt"
    % basepath
)
cust10df, neg10_cust = arr.read_chamber_file(cust10_path, key="Coop2Ets")

cust20_path = (
    "%s/probedata/210102_validation_array_ets1_v2_2/20nMEts1_alexa488_550_10_alldata.txt"
    % basepath
)
cust20df, neg20_cust = arr.read_chamber_file(cust20_path, key="Coop2Ets")

cust30_path = (
    "%s/probedata/210102_validation_array_ets1_v2_2/30nMEts1_alexa488_550_10_alldata.txt"
    % basepath
)
cust30df, neg30_cust = arr.read_chamber_file(cust30_path, key="Coop2Ets")

# --- iMADS models: one model file per core, paired by position ---------------
imads_paths = [
    "%s/chip2probe/input/site_models/imads_models/Ets1_w12_GGAA.model" % basepath,
    "%s/chip2probe/input/site_models/imads_models/Ets1_w12_GGAT.model" % basepath,
]
imads_cores = ["GGAA", "GGAT"]
imads_models = [
    iMADSModel(model_path, core_seq, 12, [1, 2, 3])
    for model_path, core_seq in zip(imads_paths, imads_cores)
]
# NOTE(review): the meaning of the second argument is not shown here; the
# original author left 0.2128 next to it, presumably as an alternative value.
imads = iMADS(imads_models, 0.19)  # 0.2128

# --- Predictions and plot data for the first chamber and the 10nM chamber ----
orig_wtmt = get_wtmt(origdf, wtdf)
orig_pred = imads.predict_sequences(
    orig_wtmt,
    key_colname="id_numeric",
    sequence_colname="Sequence",
)
orig_plot = imads.make_plot_data(orig_pred)

cust_wtmt = get_wtmt(cust10df, wtdf)
cust_pred = imads.predict_sequences(
    cust_wtmt,
    key_colname="id_numeric",
    sequence_colname="Sequence",
)
cust_plot = imads.make_plot_data(cust_pred)

# One PDF per data set.
sp = SitesPlotter()
sp.plot_seq_combine([orig_plot], filepath="origplot.pdf")
sp.plot_seq_combine([cust_plot], filepath="custplot.pdf")

# Second hard stop, kept as in the original script.
import sys
sys.exit()
df = pd.read_csv(trainingpath, sep="\t") # select only genomic (i.e. non-custom) sequences # df = df[~df['name'].str.contains("dist|weak")] ct = CoopTrain(df, corelen=4, flip_th=True, positive_cores=["GGAA", "GGAT"]) # using custom imads model imads_paths = [ "input/site_models/imads_models/Ets1_w12_GGAA.model", "input/site_models/imads_models/Ets1_w12_GGAT.model" ] imads_cores = ["GGAA", "GGAT"] imads_models = [ iMADSModel(path, core, 12, [1, 2, 3]) for path, core in zip(imads_paths, imads_cores) ] imads = iMADS(imads_models, 0.19) # 0.2128 # get the features from the CoopTrain class feature_dict = { "distance": { "type": "numerical" }, "orientation": { "positive_cores": ["GGAA", "GGAT"], "one_hot": True }, "affinity": { "imads": imads
df = pd.read_csv(trainingpath) #, sep="\t") # select only genomic (i.e. non-custom) sequences df = df[~df['name'].str.contains("dist|weak")] cooptr = CoopTrain(df, corelen=4) rf_param_grid = { 'n_estimators': [500], #, 1000, 1500], 'max_depth':[5], # 10, 15], "min_samples_leaf" : [10], # 15, 20], "min_samples_split" :[10], # 15 ,20] } # using custom imads model imads8_paths = ["input/site_models/imads_models/Ets1_w8_GGAA.model", "input/site_models/imads_models/Ets1_w8_GGAT.model"] imads8_cores = ["GGAA", "GGAT"] imads8_models = [iMADSModel(path, core, 8, [1, 2, 3]) for path, core in zip(imads8_paths, imads8_cores)] imads8 = iMADS(imads8_models, 0.19) # 0.2128 imads12_paths = ["input/site_models/imads_models/Ets1_w12_GGAA.model", "input/site_models/imads_models/Ets1_w12_GGAT.model"] imads12_cores = ["GGAA", "GGAT"] imads12_models = [iMADSModel(path, core, 12, [1, 2, 3]) for path, core in zip(imads12_paths, imads12_cores)] imads12 = iMADS(imads12_models, 0.19) # 0.2128 best_models = { "distance": BestModel(clf="sklearn.ensemble.RandomForestClassifier", param_grid=rf_param_grid, train_data=cooptr.get_training_df({ "distance":{"type":"numerical"} }) ).run_all(),