Exemplo n.º 1
0
 def oned_svm(self):
     """Fit a one-feature linear SVM per score column and save the scores.

     Every tab-separated file in ``self.combo_dp`` contributes one
     classifier per column other than 'Protein ID' and 'y'; every file in
     ``self.solo_dp`` contributes one classifier on its 'Norm Scores'
     column. Each classifier is scored on the same rows it was fitted on,
     and the resulting (Label, Accuracy) table is written as TSV to
     ``self.oned_fpo``.

     Labels are the file name with its first five and last four characters
     removed (presumably a fixed prefix and the file extension -- confirm
     against the naming scheme of the input directories).
     """
     def fit_and_score(df, column):
         # One feature as an (n_samples, 1) matrix against the 'y' column.
         # svms.linear_svc is assumed to return a fitted classifier that
         # exposes ``score`` -- it is defined outside this block.
         X = np.array([df[column]]).T
         y = np.array(df['y'])
         clf = svms.linear_svc(X, y)
         return clf.score(X, y)

     # os.listdir returns entries in arbitrary order; sorting makes the
     # row order of the output table reproducible between runs.
     solo_fns = sorted(os.listdir(self.solo_dp))
     combo_fns = sorted(os.listdir(self.combo_dp))

     # Label and accuracy are stored together as one row so the two
     # output columns can never fall out of step.
     rows = []
     for combo_fn in combo_fns:
         fpi = os.path.join(self.combo_dp, combo_fn)
         combo_df = pd.read_csv(fpi, sep='\t', index_col=0)
         for label in combo_df.columns:
             if label in ('Protein ID', 'y'):
                 continue
             full_label = '{} {}'.format(combo_fn[5:-4], label)
             accuracy = fit_and_score(combo_df, label)
             print(full_label)
             rows.append((full_label, accuracy))
     for solo_fn in solo_fns:
         full_label = solo_fn[5:-4]
         print(full_label)
         fpi = os.path.join(self.solo_dp, solo_fn)
         solo_df = pd.read_csv(fpi, sep='\t', index_col=0)
         rows.append((full_label, fit_and_score(solo_df, 'Norm Scores')))
     df_out = pd.DataFrame(rows, columns=['Label', 'Accuracy'])
     df_out.to_csv(self.oned_fpo, sep='\t')
Exemplo n.º 2
0
 def svm_len(self):
     """Fit a linear SVM on the 'Length' column and print its score.

     Reads the tab-separated training table at ``self.train_fp`` and scores
     the classifier on the same rows it was fitted on.
     """
     training = pd.read_csv(self.train_fp, sep='\t', index_col=0)
     targets = np.array(training['y'])
     # Double brackets keep the single feature as an (n_samples, 1) matrix.
     lengths = np.array(training[['Length']])
     classifier = svms.linear_svc(lengths, targets)
     accuracy = classifier.score(lengths, targets)
     print("The accuracy score for the length is {}".format(accuracy))
Exemplo n.º 3
0
 def find_hyperplane(self):
     """Print the first grid point whose SVM decision value is negative.

     A linear SVM is fitted on the ``lc_norm_score`` of the sequences in
     the training table at ``self.train_fpi``. Its decision function is
     then evaluated on a grid running from -2 up to (but excluding) 2 in
     steps of 0.01, scanned in ascending order. Nothing is printed when no
     grid point has a negative decision value.

     Original intent, as stated by the author: show that the hyperplane
     for the set intercept is close to 0.
     """
     train_df = pd.read_csv(self.train_fpi, sep='\t', index_col=0)
     sequences = list(train_df['Sequence'])
     scorer = NormScore()
     features = np.array([scorer.lc_norm_score(sequences)]).transpose()
     targets = np.array(train_df['y'])
     model = svms.linear_svc(features, targets)

     # Column vector of candidate feature values, one per row.
     grid = np.arange(-2, 2, 0.01).reshape(-1, 1)
     decision_values = list(model.decision_function(grid))
     crossing = next(
         (point for point, value in zip(grid, decision_values) if value < 0),
         None,
     )
     if crossing is not None:
         print(crossing)
Exemplo n.º 4
0
 def check_one_charge(self):
     """Print the training score of a linear SVM fitted on charged-LCA counts.

     Author's note carried over from the original: removing K, R and E
     drops the classification accuracy to 0.71. The hypothesis is that the
     LCAs containing K/R/E matter most for classification, so this
     experiment counts only LCAs with a charged residue.

     The counts come from ``self.count_lca_charge`` applied to the
     'Sequence' column of the tab-separated table at ``self.train_fpi``.
     """
     train_df = pd.read_csv(self.train_fpi, sep='\t', index_col=0)
     sequences = list(train_df['Sequence'])
     charged_counts = self.count_lca_charge(sequences)
     # Wrap-and-transpose turns the per-sequence counts into a column.
     features = np.array([charged_counts]).transpose()
     targets = np.array(train_df['y'])
     model = svms.linear_svc(features, targets)
     print(model.score(features, targets))
Exemplo n.º 5
0
 def raw_svm(self, fpi, fpo):
     """Score a one-feature linear SVM for every feature column of a table.

     Each column other than 'Protein ID', 'Length' and 'y' is used on its
     own to fit a linear SVM against the 'y' column; the classifier is
     scored on the same rows it was fitted on. The index and name of each
     column are printed as it is processed.

     Args:
         fpi: Path of the tab-separated input table (first column is read
             as the index). It must contain a 'y' column.
         fpo: Path the tab-separated (Label, SVM score) table is written
             to.
     """
     excluded = {'Protein ID', 'Length', 'y'}
     df_in = pd.read_csv(fpi, sep='\t', index_col=0)
     # The target vector is identical for every feature column, so build
     # it once instead of once per iteration.
     y = np.array(df_in['y'])
     k_lcs = [lab for lab in df_in.columns if lab not in excluded]

     # Label and score are kept together as one row so the two output
     # columns can never fall out of step.
     rows = []
     for i, k_lc in enumerate(k_lcs):
         print(i)
         print(k_lc)
         # Double brackets keep the single feature as an (n_samples, 1)
         # matrix.
         X = np.array(df_in[[k_lc]])
         clf = svms.linear_svc(X, y)
         rows.append((k_lc, clf.score(X, y)))
     df = pd.DataFrame(rows, columns=['Label', 'SVM score'])
     df.to_csv(fpo, sep='\t')
Exemplo n.º 6
0
 def svm_comp(self):
     """Print SVM scores for amino-acid composition features.

     Reads the tab-separated table at ``self.comp_fp``. First, one linear
     SVM is fitted per entry of ``self.aas`` using that column alone, and
     the mean and standard deviation of the resulting scores are printed.
     Then a single ``svms.smooth_rbf`` classifier is fitted on all
     ``self.aas`` columns together and its score is printed. Every score
     is computed on the same rows the classifier was fitted on.
     """
     composition = pd.read_csv(self.comp_fp, sep='\t', index_col=0)
     targets = np.array(composition['y'])

     single_scores = []
     for residue in self.aas:
         # One column at a time, kept 2-D as (n_samples, 1).
         feature = np.array(composition[[residue]])
         single_clf = svms.linear_svc(feature, targets)
         single_scores.append(single_clf.score(feature, targets))
     print("The mean accuracy is {} and standard deviation is {} for the "
           "fraction of each amino acid used separately to "
           "classify.".format(np.mean(single_scores), np.std(single_scores)))

     every_residue = list(self.aas)
     all_features = np.array(composition[every_residue])
     combined_clf = svms.smooth_rbf(all_features, targets)
     combined_score = combined_clf.score(all_features, targets)
     print("The accuracy score for the fraction of all amino acids used "
           "to classify is {}".format(combined_score))
 def run_svm(self, motif_sum, y):
     """Fit a linear SVM on one feature and return its score on that data.

     ``motif_sum`` is wrapped in a list and transposed before fitting, so a
     flat sequence of per-sample values becomes a single-column matrix.
     ``y`` is passed to ``svms.linear_svc`` unchanged. The returned value
     is whatever the fitted classifier's ``score`` gives for the same
     inputs it was fitted on.
     """
     features = np.transpose(np.array([motif_sum]))
     classifier = svms.linear_svc(features, y)
     training_score = classifier.score(features, y)
     return training_score