import learner_functions as lf
import load_data as ld
import feature_selection as fs
import hard_vote as hv
import soft_vote as sv

# Script: load the project data, pick a subset of proteomic columns with
# fs.univariate (scored against the gender labels), and train the k-NN
# (lf.train_knn) and "lr" (lf.train_lr) learners on that subset.

data = ld.LoadData()

# Create test and train labels.
gender_labels = data.clinical['gender'].tolist()
MSI_labels = data.clinical['msi'].tolist()
test_gender_labels = data.test_clinical['gender'].tolist()
test_MSI_labels = data.test_clinical['msi'].tolist()

# Feature selection and subsetted data.
protein_sub_set = fs.univariate(data.proteomic, gender_labels)
selected_protein_columns = list(protein_sub_set.columns.values)
# Restrict the test table to the columns chosen on the training table.
# `.loc` replaces `.ix`, which was deprecated in pandas 0.20 and removed in
# pandas 1.0; the selection here is purely by column label.
# NOTE(review): `.loc` raises KeyError if a selected column is absent from the
# test table, where old `.ix` could silently produce NaN columns -- confirm
# both tables share their columns.
test_protein_sub_set = data.test_proteomic.loc[:, selected_protein_columns]

print('knn')
knn_params = {
    # Found by parameter optimization in knn-optimization.py
    "n_neighbors": 11
}

# Train learners for gender and msi here. Each trainer call is unpacked into
# a (learner, score) pair.
knn_gender, knn_gender_score = lf.train_knn(protein_sub_set, gender_labels, **knn_params)
knn_msi, knn_msi_score = lf.train_knn(protein_sub_set, MSI_labels, **knn_params)

print('lr')
lr_gender, lr_gender_score = lf.train_lr(protein_sub_set, gender_labels)
] `y` is mismatch labels: if x[0][i] matches x[1][i], y[i] == 1.0, else 0.0 """ self.model.fit(x, [y], **kwargs) def predict(self, x): """`x` is the same as in the `fit()` method. Returns a list `y` of probabilities of x[0][i] matching x[1][i] """ return self.model.predict(x) if __name__ == "__main__": data = LoadData() pro_data = feature_selection.univariate(data.proteomic, data.clinical) rna_data = feature_selection.univariate(data.rna, data.clinical) prot_x = pd.concat([pro_data, pro_data]) shuffled_rna = rna_data.sample(frac=1) rna_x = pd.concat([rna_data, shuffled_rna]) labels = [1.0] * 80 + [0.0] * 80 network = SiameseNet([(pro_data.shape[-1], ), (rna_data.shape[-1], )]) network.fit([prot_x, rna_x], labels, epochs=500, batch_size=5) truth = pd.read_csv("./data/tidy/sum_tab_2.csv") truth['Score'] = network.predict([pro_data, rna_data]) truth.to_csv("./data/tidy/output/siamese_scores.csv", index=False)
import learner_functions as lf
import load_data as ld
import feature_selection as fs
import find_mismatch as fm
import pandas as pd

# Script: load the project data, build the label lists, select RNA and
# proteomic features against the mismatch labels, and run lf.train_rf on the
# combined training table.

data = ld.LoadData()

# Train and test label lists, pulled from the clinical tables column by column.
gender_labels, MSI_labels = (
    data.clinical[column].tolist() for column in ('gender', 'msi')
)
test_gender_labels, test_MSI_labels = (
    data.test_clinical[column].tolist() for column in ('gender', 'msi')
)
mismatch_labels = data.mismatch['mismatch'].tolist()

# Feature subsets scored against the mismatch labels (proteomic first, then RNA).
protein_sub_set = fs.univariate(data.proteomic, mismatch_labels)
rna_sub_set = fs.univariate(data.rna, mismatch_labels)

# n_neighbors was found by parameter optimization in knn-optimization.py
knn_params = dict(n_neighbors=11)

# Train models for predicting mislabels based on all 3 data sets; missing
# values are replaced with 0 before training.
lf.train_rf(data.train_all.fillna(0), data.mislabel_labels)

# ************************************
# TESTING TO FIND MISMATCH INDICES
# knn_rna, knn_rna_score = lf.train_knn(rna_sub_set, mismatch_labels, **knn_params)
# knn_protein, knn_protein_score = lf.train_knn(protein_sub_set, mismatch_labels, **knn_params)
#
# lr_rna, lr_rna_score = lf.train_lr(rna_sub_set,mismatch_labels)
def select_features(self):
    """Replace the training tables with their `univariate` selections.

    `self.rna` and `self.proteomic` are each overwritten with the result of
    `univariate(table, self.clinical)`. The matching test tables
    (`self.test_rna`, `self.test_proteomic`) are then indexed by the columns
    of the new training tables so both share the same columns.
    """
    table_names = ("rna", "proteomic")

    # Pass 1: run the selection on both training tables, rna first.
    for name in table_names:
        setattr(self, name, univariate(getattr(self, name), self.clinical))

    # Pass 2: keep only the surviving columns in each test table.
    for name in table_names:
        test_name = "test_" + name
        setattr(
            self,
            test_name,
            getattr(self, test_name)[getattr(self, name).columns],
        )