def get_model_RFE_top_features(self,expression_file,ic50_file,target_features,drug):
    """
    Select features for one drug with recursive feature elimination (RFE).

    Loads the normalized, trimmed expression frame and IC50 series for
    ``drug`` through ``dfm``, fits an ``RFE`` selector wrapped around
    ``self.model`` and returns the labels of ``expression_frame.index``
    whose entry in ``selector.support_`` is true.

    :param expression_file: passed through to the ``dfm`` loader.
    :param ic50_file: passed through to the ``dfm`` loader.
    :param target_features: number of features to keep; coerced with ``int``.
    :param drug: drug identifier passed through to the ``dfm`` loader.
    :return: list of the selected labels, in the order of ``expression_frame.index``.
    """
    expression_frame, ic50_series = dfm.get_expression_frame_and_ic50_series_for_drug(
        expression_file, ic50_file, drug, normalized=True, trimmed=True, threshold=None)
    scikit_data, scikit_target = dfm.get_scikit_data_and_target(expression_frame, ic50_series)

    # Step size is one hundredth of the first row's length, rounded down,
    # plus one so that it is never zero.
    step_length = len(scikit_data.tolist()[0]) // 100 + 1

    # n_features_to_select is passed by keyword: recent scikit-learn releases
    # no longer accept it as a positional argument, older ones accept both.
    selector = RFE(self.model, n_features_to_select=int(target_features), step=step_length)
    selector.fit(scikit_data, scikit_target)

    # Pair each index label with its support flag; this replaces an index
    # loop built on xrange, which does not exist on Python 3.
    return [feature
            for feature, is_selected in zip(expression_frame.index, selector.support_)
            if is_selected]
def get_model_coefficients_threshold(self,expression_file,ic50_file,threshold,drug):
    """
    Fit the linear SVM for one drug and return its feature coefficients.

    :param expression_file: passed through to the ``dfm`` loader.
    :param ic50_file: passed through to the ``dfm`` loader.
    :param threshold: forwarded as ``threshold`` to the ``dfm`` loader.
    :param drug: drug identifier passed through to the ``dfm`` loader.
    :return: tuple ``(expression_frame.index, self.model.coef_[0])``.
    :raises Exception: when ``self.model_type`` is not ``'svm'`` or
        ``self.kernel`` is not ``'linear'``.
    """
    is_linear_svm = self.model_type == 'svm' and self.kernel == 'linear'
    if not is_linear_svm:
        raise Exception("Method only defined for the SVM linear model")

    frame_and_series = dfm.get_expression_frame_and_ic50_series_for_drug(
        expression_file, ic50_file, drug,
        normalized=True, trimmed=True, threshold=threshold)
    frame, series = frame_and_series

    features, target = dfm.get_scikit_data_and_target(frame, series)
    self.model.fit(features, target)

    # Only the first row of coef_ is handed back, next to the frame's index.
    first_coefficient_row = self.model.coef_[0]
    return frame.index, first_coefficient_row
def get_predictions_full_CCLE_dataset_threshold(self,expression_file,ic50_file,threshold,drug):
    """
    Train ``self.model`` on one drug's data and predict for the full expression set.

    The training frame is loaded normalized and trimmed with the given
    ``threshold``. The data to predict on comes from
    ``dfm.get_normalized_full_expression_identifiers_and_data``, called with
    the same expression file and the training frame's index.

    :param expression_file: passed through to both ``dfm`` loaders.
    :param ic50_file: passed through to the training-data loader.
    :param threshold: forwarded as ``threshold`` to the training-data loader.
    :param drug: drug identifier passed through to the training-data loader.
    :return: tuple ``(cell_lines, predictions)``, i.e. the identifiers from
        the ``dfm`` helper and the output of ``self.model.predict``.
    """
    frame, series = dfm.get_expression_frame_and_ic50_series_for_drug(
        expression_file, ic50_file, drug,
        normalized=True, trimmed=True, threshold=threshold)
    fit_data, fit_target = dfm.get_scikit_data_and_target(frame, series)

    # The prediction inputs are requested with the training frame's index.
    identifiers_and_data = dfm.get_normalized_full_expression_identifiers_and_data(
        expression_file, frame.index)
    cell_lines, full_data = identifiers_and_data

    self.model.fit(fit_data, fit_target)
    return cell_lines, self.model.predict(full_data)
def get_predictions_full_CCLE_dataset_top_features(self,expression_file,ic50_file,num_features,drug):
    """
    Train ``self.model`` on the top-ranked features and predict for the full expression set.

    Loads the normalized, trimmed expression frame for ``drug``, asks
    ``dfm.get_pval_top_n_features`` for ``num_features`` features, restricts
    the frame to those rows, fits ``self.model`` and predicts on the data
    returned by ``dfm.get_normalized_full_expression_identifiers_and_data``
    for the restricted index.

    :param expression_file: passed through to both ``dfm`` loaders.
    :param ic50_file: passed through to the training-data loader.
    :param num_features: number of features requested from ``dfm.get_pval_top_n_features``.
    :param drug: drug identifier passed through to the training-data loader.
    :return: tuple ``(cell_lines, predictions, list(top_features))``.
    """
    expression_frame, ic50_series = dfm.get_expression_frame_and_ic50_series_for_drug(
        expression_file, ic50_file, drug, normalized=True, trimmed=True)
    top_features = dfm.get_pval_top_n_features(expression_frame, ic50_series, num_features)

    # .loc replaces the .ix indexer, which pandas deprecated in 0.20 and
    # removed in 1.0.
    # NOTE(review): assumes top_features holds index labels rather than
    # positions -- confirm against dfm.get_pval_top_n_features.
    expression_frame = expression_frame.loc[top_features]

    scikit_data, scikit_target = dfm.get_scikit_data_and_target(expression_frame, ic50_series)
    cell_lines, testing_data = dfm.get_normalized_full_expression_identifiers_and_data(
        expression_file, expression_frame.index)

    self.model.fit(scikit_data, scikit_target)
    predictions = self.model.predict(testing_data)
    return cell_lines, predictions, list(top_features)
def backward_step(model,expression_frame,ic50_series,backward_step_size,forward_features_selected,backward_features_removed):
    """
    Mark up to ``backward_step_size`` more features as removed.

    The model is fitted on every feature not yet removed. Features are then
    visited in ascending order of the absolute value of their coefficient in
    ``model.coef_[0]``, and the first ones that are neither forward-selected
    nor already removed are appended to ``backward_features_removed``.

    :param model: estimator exposing ``fit`` and, after fitting, ``coef_``.
    :param expression_frame: frame whose ``index`` holds the feature labels.
    :param ic50_series: target values handed to ``dfm.get_scikit_data_and_target``.
    :param backward_step_size: maximum number of features to remove in this call.
    :param forward_features_selected: features that must not be removed.
    :param backward_features_removed: features already removed; extended in place.
    :return: the same ``backward_features_removed`` list object.
    """
    all_features = set(expression_frame.index)
    already_removed = set(backward_features_removed)
    removable = all_features - set(forward_features_selected) - already_removed

    # Fit on everything that has not been removed in earlier steps.
    remaining_frame = dfm.get_expression_frame_with_features(
        expression_frame, all_features - already_removed)
    data, target = dfm.get_scikit_data_and_target(remaining_frame, ic50_series)
    model.fit(data, target)

    # Order (coefficient, label) pairs by coefficient magnitude, smallest first.
    ranked_pairs = sorted(zip(model.coef_[0], list(remaining_frame.index)),
                          key=lambda pair: math.fabs(pair[0]))
    _, ranked_names = zip(*ranked_pairs)

    removed_now = 0
    for name in ranked_names:
        if removed_now >= backward_step_size:
            break
        if name in removable:
            backward_features_removed.append(name)
            removed_now += 1
    return backward_features_removed
def forward_step(model,expression_frame,ic50_series,forward_features_selected,backward_features_removed):
    """
    Greedily add the single best remaining feature to the selected set.

    Every feature in ``expression_frame.index`` that is neither selected nor
    removed is tried in turn: a copy of ``model`` is cross-validated (5 folds)
    on the already-selected features plus that candidate, and the candidate
    with the highest mean score is appended to ``forward_features_selected``.

    :param model: estimator passed to ``cv.cross_val_score``; a shallow copy
        is used for each candidate and the argument itself is never rebound.
    :param expression_frame: frame whose ``index`` holds the feature labels.
    :param ic50_series: target values handed to ``dfm.get_scikit_data_and_target``.
    :param forward_features_selected: features chosen so far; extended in place.
    :param backward_features_removed: features that may not be chosen.
    :return: the same ``forward_features_selected`` list object, with at most
        one feature appended.
    """
    excluded = set(forward_features_selected) | set(backward_features_removed)
    # Walk the index rather than a set so candidates are tried in a stable
    # order and ties are broken deterministically (first in the index wins).
    candidates = [feature for feature in expression_frame.index if feature not in excluded]

    # NOTE(review): a candidate whose mean score is <= -1 can never win, so
    # nothing is added if all candidates score that low -- confirm this is intended.
    max_score = -1
    best_feature = None
    for feature in candidates:
        candidate_model = copy.copy(model)
        # Union with a one-element set: set(feature) would split a string
        # label into characters, and '&' would intersect instead of adding.
        model_features = set(forward_features_selected) | {feature}
        # Keep the subset in its own variable so the full frame stays
        # available for the next candidate.
        candidate_frame = dfm.get_expression_frame_with_features(expression_frame, model_features)
        scikit_data, scikit_target = dfm.get_scikit_data_and_target(candidate_frame, ic50_series)
        score = cv.cross_val_score(candidate_model, scikit_data, scikit_target, cv=5).mean()
        if score > max_score:
            max_score = score
            best_feature = feature

    # Compare against None explicitly so falsy labels (0, "") can be selected.
    if best_feature is not None:
        forward_features_selected.append(best_feature)
    return forward_features_selected