def _pathway_classification(self, genelist, common_scale=False):
    '''
    Score a gene list (e.g. the genes of one pathway) with a standard (L2) SVM.

    Each gene name is mapped to its column position in self.feature_names and
    the classifier is then trained on self.traindat / evaluated on
    self.testdat using only those columns.

    Names missing from the feature list are sent to ifr.get_official_name.
    Successful substitutions are remembered in self.known_aliases and names
    that cannot be resolved are remembered in self.known_bad, so later calls
    do not repeat the query.

    @param genelist: Iterable of gene names to use as features.
    @param common_scale: Forwarded unchanged to ifr.svm_engine.
    @return: Whatever ifr.svm_engine returns.
    '''
    train_data, train_labels = self.traindat
    test_data, test_labels = self.testdat
    names = self.feature_names

    columns = []
    for gene in genelist:
        # An alias may itself be the key of another alias, so keep following
        # the chain until no further substitution is recorded.
        while gene in self.known_aliases:
            gene = self.known_aliases[gene]

        # Names already known to be unresolvable are skipped silently.
        if gene in self.known_bad:
            continue

        try:
            col = names.index(gene)
        except ValueError:
            col = None
            print("Gene %s is not a known feature name. Querying for official name..." % gene)
            official = ifr.get_official_name(gene, entrezgene=True)
            unresolved = (official is None) or (official == gene)
            if unresolved:
                self.known_bad.append(gene)
                print("No other official name found, gene will be skipped.")
            else:
                print("Substituting %s for %s." % (official, gene))
                self.known_aliases[gene] = official
                try:
                    col = names.index(official)
                except ValueError:
                    # An official name was found, but it is not a feature either.
                    col = None

        if col is not None:
            columns.append(col)

    return ifr.svm_engine(
        (train_data, train_labels),
        (test_data, test_labels),
        perm=columns,
        common_scale=common_scale,
        verbose=False,
        no_normalization=False,
        loss="L2",
        penalty="L2",
        C=1.0,
    )
def pathway_classification(genelist, feature_names=None, traindat=None, testdat=None,
                           return_err_idxs=False, common_scale=False, verbose=True):
    '''
    Given a genelist from the flu data set, such as a collection of genes
    belonging to a specific pathway, this function computes the accuracy of a
    standard (L2) SVM classifier restricted to those genes.

    Gene names that are not found in the feature names are looked up through
    ifr.get_official_name. As a side effect, successful substitutions are
    cached in the module-level PC_KNOWN_ALIASES dict and unresolvable names
    are cached in the module-level PC_KNOWN_BAD list, so repeated calls do not
    repeat the query.

    @param genelist: The list of genes forming the pathway
    @param feature_names: The list of feature (gene) names associated with the
    columns of test/train data. The feature_names must contain the genes in
    genelist, or a lookup will be performed to find the matching alias. If
    None, then feature names will be the genes from the Duke Influenza data.
    @param traindat: If None, then H3N2 data will be used. Else, specify the
    tuple (D,L) where D is the data matrix (samples in rows) and L is the
    label vector
    @param testdat: If None, then H1N1 data will be used. Else, specify
    (D2,L2) tuple for test data.
    @param return_err_idxs: If true, then the indexes in the test set where
    the classifier is wrong will be returned.
    @param common_scale: If true, then the test data will be scaled using the
    training data mean/std, else it will be scaled using its own mean/std.
    @param verbose: If true, more output will be displayed. If false, no
    diagnostic messages are printed by this function.
    @return: Either returns rc or rc, err_set, where rc is the return from the
    svm engine, which is the tuple (test_accuracy, factors, clf,
    train_accuracy). See L{ifr.svm_engine}. err_set is the set of test-set
    indexes where the prediction differs from the label.
    '''
    global PC_KNOWN_BAD
    global PC_KNOWN_ALIASES

    if traindat is None:
        (D, L, _, _) = ifr.load_flu_mat()
    else:
        (D, L) = traindat

    if testdat is None:
        (D2, L2, _, _) = ifr.load_H1N1_mat()
    else:
        (D2, L2) = testdat

    if feature_names is not None:
        G = feature_names
    else:
        G = ifr.load_gene_ids(short_name_only=True)

    idxs = []
    for x in genelist:
        # An alias could be the key to another alias, so go down the chain
        # until we stop finding substitutions.
        while x in PC_KNOWN_ALIASES:
            if verbose:
                print("%s is known alias to %s." % (x, PC_KNOWN_ALIASES[x]))
            x = PC_KNOWN_ALIASES[x]

        if x in PC_KNOWN_BAD:
            if verbose:
                print("%s is known bad, skipping gene." % x)
            continue

        try:
            idx = G.index(x)
        except ValueError:
            # x is probably an alias to a gene name in G
            idx = None
            if verbose:
                print("Gene %s is not a known feature name. Querying for official name..." % x)
            xn = ifr.get_official_name(x, entrezgene=True)
            if (xn is None) or (xn == x):
                if verbose:
                    print("No other official name found, gene will be skipped.")
                PC_KNOWN_BAD.append(x)
            else:
                # This message used to be printed even with verbose=False;
                # it now honours the flag like every other diagnostic here.
                if verbose:
                    print("Substituting %s for %s." % (xn, x))
                PC_KNOWN_ALIASES[x] = xn
                try:
                    idx = G.index(xn)
                except ValueError:
                    # happens when official name is found, but still not in list
                    idx = None

        if idx is not None:
            idxs.append(idx)

    rc = ifr.svm_engine((D, L), (D2, L2), perm=idxs, common_scale=common_scale,
                        verbose=False, no_normalization=False,
                        loss="L2", penalty="L2", C=1.0)

    if return_err_idxs:
        IX = sp.array(range(len(L2)))
        # NOTE(review): rc[4] is compared against the test labels, so it is
        # presumably the vector of test-set predictions; the docstring lists
        # only four tuple entries -- confirm against ifr.svm_engine.
        errs = IX[rc[4] != L2]  # indexes where prediction is not correct
        err_set = set(errs)
        return rc, err_set
    else:
        return rc