def get_predictions_full_CCLE_dataset_top_features(self, expression_file, ic50_file, num_features, drug):
    """
    Fit self.model on the top features for one drug, then predict on the
    full expression data restricted to those same features.

    Parameters:
        expression_file: passed through to the dfm loaders as the expression source.
        ic50_file: passed through to the dfm loader as the IC50 source.
        num_features: number of features requested from
            dfm.get_pval_top_n_features.
        drug: drug identifier passed to the dfm loader.

    Returns:
        A tuple (cell_lines, predictions, top_features) where cell_lines and
        the prediction inputs come from
        dfm.get_normalized_full_expression_identifiers_and_data, predictions
        is the output of self.model.predict, and top_features is a list of
        the selected feature labels.
    """
    expression_frame, ic50_series = dfm.get_expression_frame_and_ic50_series_for_drug(
        expression_file, ic50_file, drug, normalized=True, trimmed=True)
    top_features = dfm.get_pval_top_n_features(expression_frame, ic50_series, num_features)

    # Keep only the selected feature rows. Label-based .loc replaces the
    # .ix indexer, which was removed in pandas 1.0.
    expression_frame = expression_frame.loc[top_features]

    scikit_data, scikit_target = dfm.get_scikit_data_and_target(expression_frame, ic50_series)

    # The prediction inputs are built from the same feature labels (the
    # trimmed frame's index) that the model is trained on.
    cell_lines, testing_data = dfm.get_normalized_full_expression_identifiers_and_data(
        expression_file, expression_frame.index)

    self.model.fit(scikit_data, scikit_target)
    predictions = self.model.predict(testing_data)
    return cell_lines, predictions, list(top_features)
def trim_X_num_features(X, y, train, num_features):
    """
    Trim X to the top num_features features based on p-value rank.

    The features are chosen using only the training subset (as selected by
    get_training_samples_labels from `train`), but the returned array holds
    every sample of X restricted to the chosen feature columns.

    Parameters:
        X: sample data; wrapped in a pandas DataFrame (rows are samples,
            columns are features).
        y: labels for X; wrapped in a pandas Series.
        train: training selector forwarded to get_training_samples_labels.
        num_features: number of features requested from
            dfm.get_pval_top_n_features.

    Returns:
        A numpy array with one row per sample of X and one column per
        selected feature.
    """
    all_samples = pd.DataFrame(X)
    all_labels = pd.Series(y)
    train_samples, train_labels = get_training_samples_labels(all_samples, all_labels, train)

    # The ranking helper is given the training samples transposed, so that
    # features are the rows of the frame it receives.
    features = dfm.get_pval_top_n_features(train_samples.T, train_labels, num_features)
    trimmed_all_samples = all_samples[features]

    # Materialise the frame's 2-D values in one step instead of looking up
    # each row through the .ix indexer (removed in pandas 1.0). np.array
    # copies, so the result does not alias the DataFrame's storage.
    return np.array(trimmed_all_samples.values)