# NOTE(review): this chunk was collapsed onto one physical line; reformatted
# with conventional line breaks. Two genuine fixes: `spearmanr` and
# `VarianceThreshold` were used below but never imported.
from scipy.stats import spearmanr
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression, VarianceThreshold
# NOTE(review): sklearn.cross_validation is removed in modern scikit-learn
# (moved to sklearn.model_selection with a different ShuffleSplit signature);
# kept as-is because unseen code in this file may rely on the old API.
from sklearn.cross_validation import ShuffleSplit
from sklearn.metrics import make_scorer
from pandas import DataFrame, Series
from dream_2014_functions import read_data_sets, save_gct_data, submit_solution, write_features_sc3, ev_code_sc3


def spearm_cor_func(expected, pred):
    """Return the Spearman rank correlation coefficient between *expected* and *pred*."""
    return spearmanr(expected, pred)[0]


# Folders
submission_filename_prefix = 'sc3_emanuel_phase2_'

# Import data
train_exp, train_cnv, train_ess, leader_exp, leader_cnv, prioritized_genes = read_data_sets()

X_train_pre = train_exp
X_test_pre = leader_exp

# Drop low-variance expression features; apply the same column mask to the
# leaderboard set so train and test stay aligned.
var_thres = VarianceThreshold(0.65).fit(X_train_pre)
X_train_pre = X_train_pre.loc[:, var_thres.get_support()]
X_test_pre = X_test_pre.loc[:, var_thres.get_support()]

# Prepare features
features = X_train_pre.columns
important_features = []

# NOTE(review): this loop is truncated in the visible chunk -- its body
# continues past this point.
for gene in prioritized_genes:
    # Assemble prediction variables
    X_train = X_train_pre
return 1 / ((np.median(matrix, axis=0) / matrix) ** hill_coef + 1) def count_outliers(matrix): outliers_counts = [] for i in range(len(matrix.columns)): Q1 = np.percentile(matrix.ix[:, i], 25) Q3 = np.percentile(matrix.ix[:, i], 75) IQR = Q3 - Q1 outliers_counts.append(sum(matrix.ix[:, i] < (Q1 - 1.5 * IQR)) + sum(matrix.ix[:, i] > (Q3 + 1.5 * IQR))) return outliers_counts # Import data-sets exp, cnv, ess, leader_exp, leader_cnv, prioritized_genes = read_data_sets() # Split training data-set in two train_exp = exp.loc[training_cell_lines, ] train_cnv = cnv.loc[training_cell_lines, ] train_ess = ess.loc[training_cell_lines, ] pred_exp = exp.loc[leader_board_cell_lines, ] pred_cnv = cnv.loc[leader_board_cell_lines, ] pred_ess = ess.loc[leader_board_cell_lines, ].T # Configurations predictions = DataFrame(None, index=prioritized_genes, columns=pred_ess.axes[1]) spearman = make_scorer(spearm_cor_func, greater_is_better=True) predictions_features = {}