Пример #1
0
import learner_functions as lf
import load_data as ld
import feature_selection as fs
import hard_vote as hv
import soft_vote as sv

# Load the data sets through the project's loader.
data = ld.LoadData()

# Train and test label lists, read from the clinical tables.
gender_labels = data.clinical['gender'].tolist()
MSI_labels = data.clinical['msi'].tolist()
test_gender_labels = data.test_clinical['gender'].tolist()
test_MSI_labels = data.test_clinical['msi'].tolist()

# Run feature selection on the training proteomic data against the gender
# labels, then restrict the test proteomic data to the same columns.
protein_sub_set = fs.univariate(data.proteomic, gender_labels)
selected_protein_columns = list(protein_sub_set.columns.values)
# `DataFrame.ix` was deprecated in pandas 0.20 and removed in 1.0; `.loc` is
# the label-based indexer that replaces it for selecting columns by name.
test_protein_sub_set = data.test_proteomic.loc[:, selected_protein_columns]

print('knn')
# n_neighbors was found by parameter optimization in knn-optimization.py
knn_params = dict(n_neighbors=11)

# Train one KNN learner per target (gender and MSI) on the protein subset.
knn_gender, knn_gender_score = lf.train_knn(
    protein_sub_set, gender_labels, **knn_params)
knn_msi, knn_msi_score = lf.train_knn(
    protein_sub_set, MSI_labels, **knn_params)

print('lr')
lr_gender, lr_gender_score = lf.train_lr(protein_sub_set, gender_labels)
Пример #2
0
        ]
        `y` is mismatch labels:
            if x[0][i] matches x[1][i], y[i] == 1.0, else 0.0
        """
        self.model.fit(x, [y], **kwargs)

    def predict(self, x):
        """Score input pairs with the wrapped model.

        `x` has the same layout as the `x` given to `fit()`.
        Returns a list `y` of probabilities of x[0][i] matching x[1][i].
        """
        scores = self.model.predict(x)
        return scores


if __name__ == "__main__":
    # Train the siamese network on matched and shuffled proteomic/RNA pairs,
    # then score the original pairing and write the scores to CSV.
    data = LoadData()
    pro_data = feature_selection.univariate(data.proteomic, data.clinical)
    rna_data = feature_selection.univariate(data.rna, data.clinical)

    # First half: every proteomic row next to its own RNA row (label 1.0).
    # Second half: the same proteomic rows next to shuffled RNA rows
    # (label 0.0).
    prot_x = pd.concat([pro_data, pro_data])
    shuffled_rna = rna_data.sample(frac=1)
    rna_x = pd.concat([rna_data, shuffled_rna])
    # Derive the label counts from the data rather than hard-coding 80, so
    # the labels always line up with the concatenated inputs.
    # NOTE(review): a random shuffle can leave some rows in their original
    # position, so a few pairs labelled 0.0 may really be matches - confirm
    # whether that is acceptable.
    n_samples = len(pro_data)
    labels = [1.0] * n_samples + [0.0] * n_samples

    network = SiameseNet([(pro_data.shape[-1], ), (rna_data.shape[-1], )])
    network.fit([prot_x, rna_x], labels, epochs=500, batch_size=5)

    truth = pd.read_csv("./data/tidy/sum_tab_2.csv")
    truth['Score'] = network.predict([pro_data, rna_data])

    truth.to_csv("./data/tidy/output/siamese_scores.csv", index=False)
Пример #3
0
import learner_functions as lf
import load_data as ld
import feature_selection as fs
import find_mismatch as fm
import pandas as pd

# Load the data sets through the project's loader.
data = ld.LoadData()

# Label lists for the train and test splits, plus the mismatch labels.
gender_labels = data.clinical['gender'].tolist()
MSI_labels = data.clinical['msi'].tolist()
test_gender_labels = data.test_clinical['gender'].tolist()
test_MSI_labels = data.test_clinical['msi'].tolist()
mismatch_labels = data.mismatch['mismatch'].tolist()

# Feature selection for each data type against the mismatch labels.
protein_sub_set = fs.univariate(data.proteomic, mismatch_labels)
rna_sub_set = fs.univariate(data.rna, mismatch_labels)

# n_neighbors was found by parameter optimization in knn-optimization.py
knn_params = dict(n_neighbors=11)

# Train the mislabel-prediction model on the combined training table,
# with missing values replaced by 0.
lf.train_rf(
    data.train_all.fillna(0),
    data.mislabel_labels,
)

# ************************************
# TESTING TO FIND MISMATCH INDICES
# knn_rna, knn_rna_score = lf.train_knn(rna_sub_set, mismatch_labels, **knn_params)
# knn_protein, knn_protein_score = lf.train_knn(protein_sub_set, mismatch_labels, **knn_params)
#
# lr_rna, lr_rna_score = lf.train_lr(rna_sub_set,mismatch_labels)
Пример #4
0
 def select_features(self):
     """Apply `univariate` to the RNA and proteomic training tables.

     Each training table is replaced by the result of `univariate` called
     with the clinical table, and the matching test table is then reduced
     to the columns of that result.
     """
     selected_rna = univariate(self.rna, self.clinical)
     self.rna = selected_rna
     selected_proteomic = univariate(self.proteomic, self.clinical)
     self.proteomic = selected_proteomic

     # Keep the test tables aligned with the columns kept for training.
     rna_columns = selected_rna.columns
     self.test_rna = self.test_rna[rna_columns]
     proteomic_columns = selected_proteomic.columns
     self.test_proteomic = self.test_proteomic[proteomic_columns]