def main() -> None:
    """Collapse redundant features in each candidate-features dataset.

    For every dataset listed in datasets/candidate_features/dataset_info.txt:
    compute a feature-feature similarity matrix, group features whose
    |similarity| exceeds a threshold, replace each group by one representative
    column (a precomputed row-stat column if one belongs to the group,
    otherwise a similarity-weighted average), then write the reduced dataset
    and its updated metadata to datasets/nonredundant_features.

    NOTE(review): relies on project-local modules `datasetIO` and the
    datamatrix objects it returns (methods `tosimilarity`, `reorder`,
    `select`, `discard`); their exact contracts are not visible here.
    """
    # load dataset info
    print('loading dataset info...', flush=True)
    dataset_info_path = 'datasets/candidate_features/dataset_info.txt'
    dataset_infos = datasetIO.load_datasetinfo(dataset_info_path)

    # specify results folder
    print('specifying results folder...', flush=True)
    results_folder = 'datasets/nonredundant_features'
    if not os.path.exists(results_folder):
        os.mkdir(results_folder)

    # iterate over datasets
    print('iterating over datasets...', flush=True)
    for dataset_info in dataset_infos:

        # # just work with hpatissuesmrna for testing/debugging the pipeline
        # if dataset_info['abbreviation'] != 'hpatissuesmrna_cleaned':
        #     print('skipping {0}. not in testing set...'.format(dataset_info['abbreviation']), flush=True)
        #     continue

        # check if another python instance is already working on this dataset
        # NOTE(review): the marker file is the only inter-process lock;
        # exists-then-create is race-prone, but matches the project convention.
        if os.path.exists('{0}/{1}_in_progress.txt'.format(
                results_folder, dataset_info['abbreviation'])):
            print('skipping {0}. already in progress...'.format(
                dataset_info['abbreviation']), flush=True)
            continue

        # log start of processing
        with open('{0}/{1}_in_progress.txt'.format(
                results_folder, dataset_info['abbreviation']),
                  mode='wt',
                  encoding='utf-8',
                  errors='surrogateescape') as fw:
            print('working on {0}...'.format(dataset_info['abbreviation']),
                  flush=True)
            fw.write('working on {0}...'.format(dataset_info['abbreviation']))

        # load dataset
        print('loading dataset...', flush=True)
        gene_atb = datasetIO.load_datamatrix(datasetpath=dataset_info['path'])
        # metadata comes back as strings; go through int64 because strings
        # like '0'/'1' cannot be cast to bool directly
        gene_atb.columnmeta['isrowstat'] = gene_atb.columnmeta[
            'isrowstat'].astype('int64').astype('bool')

        # decide feature similarity metric
        print('deciding feature similarity metric...', flush=True)
        if ('standardized' in dataset_info['abbreviation']
                or 'cleaned' in dataset_info['abbreviation']
                ) and (gene_atb.matrix == 0).sum() / gene_atb.size <= 0.5:
            # dataset is many-valued and filled-in (at most half zeros)
            print(' dataset is many-valued and filled-in...', flush=True)
            print(' using spearman for similarity...', flush=True)
            dataset_info['feature_similarity_metric'] = 'spearman'
            # sqrt(0.5): |correlation| above this means >50% shared variance
            dataset_info['feature_similarity_threshold'] = np.sqrt(0.5)
        else:
            # dataset is binary or tertiary or sparse
            print(' dataset is binary, tertiary, or sparse...', flush=True)
            print(' using cosine for similarity...', flush=True)
            dataset_info['feature_similarity_metric'] = 'cosine'
            dataset_info['feature_similarity_threshold'] = np.sqrt(0.5)

        # calculate feature similarity
        print('calculating feature similarity...', flush=True)
        atb_atb = gene_atb.tosimilarity(
            axis=1, metric=dataset_info['feature_similarity_metric'])

        # prioritize feature groups
        print('prioritizing feature groups...', flush=True)
        # a feature's "group" is every feature whose |similarity| to it
        # exceeds the threshold (includes itself, so group size >= 1)
        are_similar_features = np.abs(
            atb_atb.matrix) > dataset_info['feature_similarity_threshold']
        feature_group_size = are_similar_features.sum(1).astype('float64')
        feature_group_score = (np.abs(
            atb_atb.matrix) * are_similar_features).sum(1) / feature_group_size
        feature_priority = np.zeros(gene_atb.shape[1], dtype='float64')
        feature_priority[gene_atb.columnlabels == 'mean'] = 1.0
        feature_priority[gene_atb.columnlabels == 'stdv'] = 0.5
        feature_infos = list(
            zip(np.arange(gene_atb.shape[1], dtype='int64'),
                gene_atb.columnlabels.copy(), feature_group_size.copy(),
                feature_priority.copy(), feature_group_score.copy()))
        # three stable sorts give a lexicographic order: group size first,
        # then priority ('mean' before 'stdv'), then mean group similarity
        feature_infos.sort(key=itemgetter(4), reverse=True)
        feature_infos.sort(key=itemgetter(3), reverse=True)
        feature_infos.sort(key=itemgetter(2), reverse=True)
        # for feature_info in feature_infos:
        #     print('{0:1.3g}, {1}, {2:1.3g}, {3:1.3g}, {4:1.3g}'.format(feature_info[0], feature_info[1], feature_info[2], feature_info[3], feature_info[4]))
        sorted_feature_indices = np.array(
            [feature_info[0] for feature_info in feature_infos], dtype='int64')
        atb_atb.reorder(sorted_feature_indices, axis=0)
        atb_atb.reorder(sorted_feature_indices, axis=1)
        gene_atb.reorder(sorted_feature_indices, axis=1)
        # keep the boolean similarity mask aligned with the reordered matrices
        are_similar_features = are_similar_features[
            sorted_feature_indices, :][:, sorted_feature_indices]

        # group similar features
        print('grouping similar features...', flush=True)
        tobediscarded = np.zeros(gene_atb.shape[1], dtype='bool')
        gene_atb.columnmeta['similar_features'] = np.full(gene_atb.shape[1],
                                                          '', dtype='object')
        gene_atb.columnmeta['preferred_rowstat'] = np.full(gene_atb.shape[1],
                                                           '', dtype='object')
        rowstats = gene_atb.columnlabels[gene_atb.columnmeta['isrowstat']]
        with open('{0}/{1}_feature_groups.txt'.format(
                results_folder, dataset_info['abbreviation']),
                  mode='wt',
                  encoding='utf-8',
                  errors='surrogateescape') as fw:
            for i, feature in enumerate(gene_atb.columnlabels):
                # ~ works here because tobediscarded is a numpy bool array
                if ~tobediscarded[i]:
                    # find similar features
                    print(' finding features similar to feature "{0}"...'.
                          format(feature), flush=True)
                    similarity_hit = are_similar_features[i, :]
                    similarity_hit = np.logical_and(
                        similarity_hit, ~tobediscarded)  # just what's new
                    # only look forward: earlier columns already own a group
                    similarity_hit[:i] = False
                    similar_features = gene_atb.columnlabels[similarity_hit]
                    similarity_values = atb_atb.matrix[i, similarity_hit]
                    rowstat_is_in_group = np.in1d(rowstats, similar_features)
                    gene_atb.columnmeta['similar_features'][i] = '|'.join(
                        similar_features.tolist())
                    if rowstat_is_in_group.any():
                        # replace feature with summary stat
                        gene_atb.columnmeta['preferred_rowstat'][i] = rowstats[
                            rowstat_is_in_group.nonzero()[0][0]]
                        gene_atb.matrix[:, i] = gene_atb.select(
                            [], gene_atb.columnmeta['preferred_rowstat'][i])
                        print(
                            ' replacing feature "{0}" with summary stat "{1}"...'
                            .format(
                                feature,
                                gene_atb.columnmeta['preferred_rowstat'][i]),
                            flush=True)
                    elif similarity_hit.sum() > 1:
                        # replace feature with group average, weighted by
                        # (signed) similarity so anti-correlated members flip
                        print(
                            ' replacing feature "{0}" with average of {1!s} features...'
                            .format(feature, similarity_hit.sum()),
                            flush=True)
                        feature_weight = atb_atb.matrix[i, similarity_hit]
                        feature_weight = feature_weight / np.sum(
                            np.abs(feature_weight))
                        gene_atb.matrix[:, i] = (
                            gene_atb.matrix[:, similarity_hit] *
                            (feature_weight.reshape(1, -1))).sum(1)
                    else:
                        print(' no similar features...', flush=True)
                    # one line per kept feature: "member|similarity" pairs
                    fw.write('\t'.join([
                        '{0}|{1:1.6g}'.format(f, v)
                        for f, v in zip(similar_features, similarity_values)
                    ]) + '\n')
                    # keep the representative itself; discard the rest
                    similarity_hit[i] = False
                    tobediscarded = np.logical_or(tobediscarded,
                                                  similarity_hit)

        # discard features absorbed into group features
        print('discarding features absorbed into group features...',
              flush=True)
        if tobediscarded.any():
            # discard features
            print(' discarding {0!s} features. {1!s} features remaining...'.
                  format(tobediscarded.sum(), (~tobediscarded).sum()),
                  flush=True)
            gene_atb.discard(tobediscarded, axis=1)
        else:
            # keep all features
            print(' no features to discard. {0!s} features remaining...'.
                  format(gene_atb.shape[1]), flush=True)

        # save if dataset has content
        print('saving if dataset has content...', flush=True)
        if gene_atb.shape[0] == 0 or gene_atb.shape[1] == 0:
            # no content
            print(' nothing to save...', flush=True)
        else:
            # save nonredundant features
            print(' saving {0!s} nonredundant features...'.format(
                gene_atb.shape[1]), flush=True)
            dataset_info['path'] = '{0}/{1}.txt.gz'.format(
                results_folder, dataset_info['abbreviation'])
            dataset_info['nonredundant_genes'] = gene_atb.shape[0]
            dataset_info['nonredundant_features'] = gene_atb.shape[1]
            datasetIO.save_datamatrix(dataset_info['path'], gene_atb)
            datasetIO.append_datasetinfo(
                '{0}/dataset_info.txt'.format(results_folder), dataset_info)

    print('done.', flush=True)
def main() -> None:
    """Build candidate feature matrices for target classification.

    For every harmonizome dataset: optionally z-score many-valued features,
    label each gene row as positive/negative/unknown from the pharmaprojects
    example lists, append per-gene mean and stdv columns as extra features,
    drop features carrying almost no information about the labelled examples,
    and write the result plus metadata to datasets/candidate_features.

    NOTE(review): relies on project-local modules `datasetIO` and
    `dataclasses` (a datamatrix class, not stdlib dataclasses); their exact
    contracts are not visible here.
    """
    # load class examples
    print('loading class examples...', flush=True)
    class_examples_folder = 'targets/pharmaprojects'
    class_examples = {
        'positive':
        datasetIO.load_examples(
            '{0}/positive.txt'.format(class_examples_folder)),
        'negative':
        datasetIO.load_examples(
            '{0}/negative.txt'.format(class_examples_folder)),
        'unknown':
        datasetIO.load_examples(
            '{0}/unknown.txt'.format(class_examples_folder))
    }

    # load dataset info
    print('loading dataset info...', flush=True)
    dataset_info_path = 'datasets/harmonizome/dataset_info.txt'
    dataset_infos = datasetIO.load_datasetinfo(dataset_info_path)

    # specify results folder
    print('specifying results folder...', flush=True)
    results_folder = 'datasets/candidate_features'
    if not os.path.exists(results_folder):
        os.mkdir(results_folder)

    # iterate over datasets
    print('iterating over datasets...', flush=True)
    for dataset_info in dataset_infos:

        # # just work with hpatissuesmrna for testing/debugging the pipeline
        # if dataset_info['abbreviation'] != 'hpatissuesmrna_cleaned':
        #     print('skipping {0}. not in testing set...'.format(dataset_info['abbreviation']), flush=True)
        #     continue

        # check if another python instance is already working on this dataset
        # NOTE(review): marker-file locking is race-prone across processes,
        # but matches the convention used throughout this pipeline.
        if os.path.exists('{0}/{1}_in_progress.txt'.format(
                results_folder, dataset_info['abbreviation'])):
            print('skipping {0}. already in progress...'.format(
                dataset_info['abbreviation']), flush=True)
            continue

        # log start of processing
        with open('{0}/{1}_in_progress.txt'.format(
                results_folder, dataset_info['abbreviation']),
                  mode='wt',
                  encoding='utf-8',
                  errors='surrogateescape') as fw:
            print('working on {0}...'.format(dataset_info['abbreviation']),
                  flush=True)
            fw.write('working on {0}...'.format(dataset_info['abbreviation']))

        # load dataset
        print('loading dataset...', flush=True)
        gene_atb = datasetIO.load_datamatrix(datasetpath=dataset_info['path'])
        dataset_info['original_genes'] = gene_atb.shape[0]
        dataset_info['original_features'] = gene_atb.shape[1]

        # decide feature normalization
        print('deciding feature normalization...', flush=True)
        if ('standardized' in dataset_info['abbreviation']
                or 'cleaned' in dataset_info['abbreviation']
                ) and (gene_atb.matrix == 0).sum() / gene_atb.size <= 0.5:
            # dataset is many-valued and filled-in (at most half zeros):
            # z-score each feature column, keeping mean/stdv for inversion
            print(' dataset is many-valued and filled-in...', flush=True)
            print(' z-scoring features...', flush=True)
            dataset_info['feature_normalization'] = 'z-score'
            mnv = np.nanmean(gene_atb.matrix, axis=0, keepdims=True)
            sdv = np.nanstd(gene_atb.matrix, axis=0, keepdims=True)
            gene_atb.matrix = (gene_atb.matrix - mnv) / sdv
            gene_atb.columnmeta['mean'] = mnv.reshape(-1)
            gene_atb.columnmeta['stdv'] = sdv.reshape(-1)
        else:
            # dataset is binary or tertiary or sparse
            print(' dataset is binary, tertiary, or sparse...', flush=True)
            print(' no feature normalization...', flush=True)
            dataset_info['feature_normalization'] = 'none'

        # assign class labels to genes
        # default is 'unknown'; positive/negative lists override it
        print('assigning class labels to genes...', flush=True)
        gene_atb.rowmeta['class'] = np.full(gene_atb.shape[0], 'unknown',
                                            dtype='object')
        gene_atb.rowmeta['class'][np.in1d(
            gene_atb.rowlabels, list(class_examples['positive']))] = 'positive'
        gene_atb.rowmeta['class'][np.in1d(
            gene_atb.rowlabels, list(class_examples['negative']))] = 'negative'

        # add dataset mean and stdv as features
        print('adding dataset mean and stdv as features...', flush=True)
        gene_stat = dataclasses.datamatrix(
            rowname=gene_atb.rowname,
            rowlabels=gene_atb.rowlabels.copy(),
            rowmeta=copy.deepcopy(gene_atb.rowmeta),
            columnname=gene_atb.columnname,
            columnlabels=np.array(['mean', 'stdv'], dtype='object'),
            columnmeta={},
            matrixname=gene_atb.matrixname,
            matrix=np.append(gene_atb.matrix.mean(1, keepdims=True),
                             gene_atb.matrix.std(1, keepdims=True), 1))
        gene_atb.append(gene_stat, 1)
        # mark which columns are per-gene summary stats vs real features
        gene_atb.columnmeta['isrowstat'] = np.in1d(gene_atb.columnlabels,
                                                   gene_stat.columnlabels)
        del gene_stat

        # identify features with little information about labelled examples
        # discard a feature if, among labelled (non-unknown) genes, it has
        # fewer than 3 non-zero values, fewer than 3 non-one values, or any NaN
        print(
            'identifying features with little information about labelled examples...',
            flush=True)
        isunknown = gene_atb.rowmeta['class'] == 'unknown'
        tobediscarded = np.logical_or.reduce(
            ((gene_atb.matrix[~isunknown, :] != 0).sum(axis=0) < 3,
             (gene_atb.matrix[~isunknown, :] != 1).sum(axis=0) < 3,
             np.isnan(gene_atb.matrix[~isunknown, :]).any(axis=0)))
        if tobediscarded.any():
            # discard features
            print(' discarding {0!s} features. {1!s} features remaining...'.
                  format(tobediscarded.sum(), (~tobediscarded).sum()),
                  flush=True)
            gene_atb.discard(tobediscarded, axis=1)
        else:
            # keep all features
            print(' no features to discard. {0!s} features remaining...'.
                  format(gene_atb.shape[1]), flush=True)

        # save if dataset has content
        print('saving if dataset has content...', flush=True)
        if gene_atb.shape[0] == 0 or gene_atb.shape[1] == 0:
            # no content
            print(' nothing to save...', flush=True)
        else:
            # save candidate features
            print(' saving {0!s} candidate features...'.format(
                gene_atb.shape[1]), flush=True)
            dataset_info['path'] = '{0}/{1}.txt.gz'.format(
                results_folder, dataset_info['abbreviation'])
            dataset_info['candidate_genes'] = gene_atb.shape[0]
            dataset_info['candidate_features'] = gene_atb.shape[1]
            dataset_info['positive_examples'] = (
                gene_atb.rowmeta['class'] == 'positive').sum()
            dataset_info['negative_examples'] = (
                gene_atb.rowmeta['class'] == 'negative').sum()
            dataset_info['unknown_examples'] = (
                gene_atb.rowmeta['class'] == 'unknown').sum()
            datasetIO.save_datamatrix(dataset_info['path'], gene_atb)
            datasetIO.append_datasetinfo(
                '{0}/dataset_info.txt'.format(results_folder), dataset_info)

    print('done.', flush=True)
def main(validation_rep: int = 0, validation_fold: int = 0) -> None:
    """Select useful features by incremental feature elimination with CV.

    Pipeline: load the merged-features dataset for one validation rep/fold;
    repeatedly cross-validate a random forest and a logistic regression,
    dropping the least-important feature each round; pick the simplest model
    whose mean(auroc, auprc) is within 95% of the best; rebuild the design
    matrix for the selected features from the per-dataset source files;
    min-max normalize; refit the selected model on train/test examples; score
    validation and unlabelled genes; save all matrices and metadata to
    datasets/useful_features/rep{rep}_fold{fold}.

    Args:
        validation_rep: index of the validation repetition (selects input
            and output folders).
        validation_fold: index of the validation fold.

    NOTE(review): relies on project-local modules `datasetIO`, `dataclasses`
    (datamatrix class) and `modelevaluation`; their contracts are not
    visible here.
    """
    # load dataset info
    print('loading dataset info...', flush=True)
    dataset_info_path = 'datasets/merged_features/rep{0!s}_fold{1!s}/dataset_info.txt'.format(
        validation_rep, validation_fold)
    # [0]: the merged-features folder is expected to describe a single dataset
    dataset_info = datasetIO.load_datasetinfo(dataset_info_path)[0]

    # load validation examples
    print('loading validation examples...', flush=True)
    validation_examples_path = 'targets/validation_examples/rep{0!s}_fold{1!s}.txt'.format(
        validation_rep, validation_fold)
    with open(validation_examples_path, mode='rt', encoding='utf-8',
              errors='surrogateescape') as fr:
        validation_examples = fr.read().split('\n')

    # specify results folder
    # create each path component in turn (equivalent of os.makedirs)
    print('specifying results folder...', flush=True)
    results_folder = 'datasets/useful_features/rep{0!s}_fold{1!s}'.format(
        validation_rep, validation_fold)
    results_folder_parts = results_folder.split('/')
    for i in range(len(results_folder_parts)):
        results_folder_part = '/'.join(results_folder_parts[:i + 1])
        if not os.path.exists(results_folder_part):
            os.mkdir(results_folder_part)

    # load dataset
    print('loading dataset {0}...'.format(dataset_info['abbreviation']),
          flush=True)
    gene_atb = datasetIO.load_datamatrix(datasetpath=dataset_info['path'])

    # specify cross-validation parameters
    print('specifying cross-validation parameters...', flush=True)
    reps = 20
    folds = 5
    rf_trees = 1000
    include_logistic_regression = True
    # shuffle without a fixed random_state: fold splits differ per rep
    skf = StratifiedKFold(n_splits=folds, shuffle=True)
    print(' reps: {0!s}'.format(reps))
    print(' folds: {0!s}'.format(folds))

    # initialize models
    print('initializing models...', flush=True)
    rfmodel = RandomForestClassifier(n_estimators=rf_trees,
                                     oob_score=False,
                                     n_jobs=-1,
                                     class_weight='balanced')
    print(rfmodel)
    lrmodel = LogisticRegression(penalty='l2',
                                 dual=False,
                                 tol=0.0001,
                                 C=1e3,
                                 fit_intercept=True,
                                 intercept_scaling=1e3,
                                 class_weight='balanced',
                                 random_state=None,
                                 solver='liblinear',
                                 max_iter=100,
                                 multi_class='ovr',
                                 verbose=0,
                                 warm_start=False,
                                 n_jobs=1)
    print(lrmodel)

    # initialize data matrices for collecting model feature importances and
    # cross-validation performance stats
    print(
        'initializing data matrices for collecting model feature importances and cross-validation performance stats...',
        flush=True)
    classifier_stats = np.array([
        'p', 'n', 'ap', 'an', 'pp', 'pn', 'tp', 'fp', 'tn', 'fn', 'tpr',
        'fpr', 'auroc', 'fnr', 'tnr', 'mcr', 'acc', 'fdr', 'ppv', 'auprc',
        'fomr', 'npv', 'plr', 'nlr', 'dor', 'drr', 'darr', 'mrr', 'marr',
        'f1s', 'mcc', 'fnlp'
    ], dtype='object')
    # template: one column per elimination round ("model" M0, M1, ...)
    sm = dataclasses.datamatrix(
        rowname='classifier_performance_stat',
        rowlabels=classifier_stats.copy(),
        rowmeta={},
        columnname='model',
        columnlabels=np.array(['M' + str(x) for x in range(gene_atb.shape[1])],
                              dtype='object'),
        columnmeta={
            'num_features': np.zeros(gene_atb.shape[1], dtype='int64'),
            'features': np.full(gene_atb.shape[1], '', dtype='object'),
            'oob_score': np.zeros(gene_atb.shape[1], dtype='float64')
        },
        matrixname='crossvalidation_classifier_performance_stats_vs_models',
        matrix=np.zeros((classifier_stats.size, gene_atb.shape[1]),
                        dtype='float64'))
    stat_model_rf_mean = copy.deepcopy(sm)
    stat_model_rf_stdv = copy.deepcopy(sm)
    stat_model_lr_mean = copy.deepcopy(sm)
    stat_model_lr_stdv = copy.deepcopy(sm)
    del sm
    # template: features x models importance matrix
    fm = dataclasses.datamatrix(
        rowname=gene_atb.columnname,
        rowlabels=gene_atb.columnlabels.copy(),
        rowmeta=copy.deepcopy(gene_atb.columnmeta),
        columnname='model',
        columnlabels=np.array(['M' + str(x) for x in range(gene_atb.shape[1])],
                              dtype='object'),
        columnmeta={
            'num_features': np.zeros(gene_atb.shape[1], dtype='int64'),
            'features': np.full(gene_atb.shape[1], '', dtype='object'),
            'oob_score': np.zeros(gene_atb.shape[1], dtype='float64')
        },
        matrixname='model_feature_importances',
        matrix=np.zeros((gene_atb.shape[1], gene_atb.shape[1]),
                        dtype='float64'))
    feature_model_rf = copy.deepcopy(fm)
    feature_model_lr = copy.deepcopy(fm)
    del fm

    # exclude validation and unlabeled examples from cross-validation loop
    print(
        'excluding validation and unlabeled examples from cross-validation loop...',
        flush=True)
    isvalidation = np.in1d(gene_atb.rowlabels, validation_examples)
    isunknown = gene_atb.rowmeta['class'] == 'unknown'
    istraintest = ~np.logical_or(isvalidation, isunknown)
    Y = (gene_atb.rowmeta['class'][istraintest] == 'positive')
    #X = gene_atb.matrix[istraintest,:]

    # perform incremental feature elimination with cross-validation
    # round i keeps the features whose RF importance in round i-1 exceeded
    # the smallest positive importance, i.e. drops the weakest feature(s)
    print(
        'performing incremental feature elimination with cross-validation...',
        flush=True)
    for i in range(gene_atb.shape[1]):
        print(' features: {0!s}...'.format(gene_atb.shape[1] - i), flush=True)
        if i == 0:
            hit_rf = np.ones(gene_atb.shape[1], dtype='bool')
            hit_lr = np.ones(gene_atb.shape[1], dtype='bool')
        else:
            hit_rf = feature_model_rf.matrix[:, i - 1] > feature_model_rf.matrix[
                feature_model_rf.matrix[:, i - 1] > 0, i - 1].min()
            #hit_lr = feature_model_lr.matrix[:,i-1] > feature_model_lr.matrix[feature_model_lr.matrix[:,i-1] > 0,i-1].min()
            # LR is evaluated on the RF-selected feature set, not its own
            hit_lr = hit_rf
        X_rf = gene_atb.matrix[istraintest, :][:, hit_rf]
        X_lr = gene_atb.matrix[istraintest, :][:, hit_lr]
        stat_rep_rf = np.zeros((classifier_stats.size, reps), dtype='float64')
        stat_rep_lr = np.zeros((classifier_stats.size, reps), dtype='float64')
        fi_rep_rf = np.zeros((X_rf.shape[1], reps), dtype='float64')
        fi_rep_lr = np.zeros((X_lr.shape[1], reps), dtype='float64')
        for rep in range(reps):
            print(' rep {0!s} of {1!s}...'.format(rep + 1, reps), flush=True)
            # out-of-fold probability predictions for every train/test example
            Ptest_rf = np.zeros(Y.size, dtype='float64')
            Ptest_lr = np.zeros(Y.size, dtype='float64')
            fi_fold_rf = np.zeros((X_rf.shape[1], folds), dtype='float64')
            fi_fold_lr = np.zeros((X_lr.shape[1], folds), dtype='float64')
            for fold, (train_indices,
                       test_indices) in enumerate(skf.split(X_rf, Y)):
                print(' fold {0!s} of {1!s}...'.format(fold + 1, folds),
                      flush=True)
                Y_train = Y[train_indices]
                X_rf_train = X_rf[train_indices]
                X_lr_train = X_lr[train_indices]
                #Y_test = Y[test_indices]
                X_rf_test = X_rf[test_indices]
                X_lr_test = X_lr[test_indices]
                rfmodel.fit(X_rf_train, Y_train)
                Ptest_rf[test_indices] = rfmodel.predict_proba(
                    X_rf_test)[:, rfmodel.classes_ == 1].reshape(-1)
                fi_fold_rf[:, fold] = rfmodel.feature_importances_
                lrmodel.fit(X_lr_train, Y_train)
                Ptest_lr[test_indices] = lrmodel.predict_proba(
                    X_lr_test)[:, lrmodel.classes_ == 1].reshape(-1)
                # |coef| as an importance proxy for logistic regression
                fi_fold_lr[:, fold] = np.abs(lrmodel.coef_.reshape(-1))
            fi_rep_rf[:, rep] = fi_fold_rf.mean(1)
            # performance at the 'p50_cutoff' operating point chosen by
            # modelevaluation (exact semantics defined in that module)
            stat_cut = modelevaluation.get_classifier_performance_stats(
                Y=Y,
                P=Ptest_rf,
                classifier_stats=classifier_stats,
                plot_curves=False,
                get_priority_cutoffs=True)
            stat_rep_rf[:, rep] = stat_cut.matrix[:, stat_cut.columnmeta[
                'p50_cutoff']].reshape(-1)
            fi_rep_lr[:, rep] = fi_fold_lr.mean(1)
            stat_cut = modelevaluation.get_classifier_performance_stats(
                Y=Y,
                P=Ptest_lr,
                classifier_stats=classifier_stats,
                plot_curves=False,
                get_priority_cutoffs=True)
            stat_rep_lr[:, rep] = stat_cut.matrix[:, stat_cut.columnmeta[
                'p50_cutoff']].reshape(-1)
        # record per-round importances and stats (mean and stdv across reps)
        feature_model_rf.matrix[hit_rf, i] = fi_rep_rf.mean(1)
        feature_model_rf.columnmeta['num_features'][i] = gene_atb.shape[1] - i
        feature_model_rf.columnmeta['features'][i] = '|'.join(
            gene_atb.columnlabels[hit_rf].tolist())
        stat_model_rf_mean.matrix[:, i] = stat_rep_rf.mean(1)
        stat_model_rf_mean.columnmeta['num_features'][
            i] = gene_atb.shape[1] - i
        stat_model_rf_mean.columnmeta['features'][i] = '|'.join(
            gene_atb.columnlabels[hit_rf].tolist())
        stat_model_rf_stdv.matrix[:, i] = stat_rep_rf.std(1)
        stat_model_rf_stdv.columnmeta['num_features'][
            i] = gene_atb.shape[1] - i
        stat_model_rf_stdv.columnmeta['features'][i] = '|'.join(
            gene_atb.columnlabels[hit_rf].tolist())
        feature_model_lr.matrix[hit_lr, i] = fi_rep_lr.mean(1)
        feature_model_lr.columnmeta['num_features'][i] = gene_atb.shape[1] - i
        feature_model_lr.columnmeta['features'][i] = '|'.join(
            gene_atb.columnlabels[hit_lr].tolist())
        stat_model_lr_mean.matrix[:, i] = stat_rep_lr.mean(1)
        stat_model_lr_mean.columnmeta['num_features'][
            i] = gene_atb.shape[1] - i
        stat_model_lr_mean.columnmeta['features'][i] = '|'.join(
            gene_atb.columnlabels[hit_lr].tolist())
        stat_model_lr_stdv.matrix[:, i] = stat_rep_lr.std(1)
        stat_model_lr_stdv.columnmeta['num_features'][
            i] = gene_atb.shape[1] - i
        stat_model_lr_stdv.columnmeta['features'][i] = '|'.join(
            gene_atb.columnlabels[hit_lr].tolist())

    # concatenate data matrices with model feature importances
    print('concatenating data matrices with model feature importances...',
          flush=True)
    feature_model_rf.columnlabels += '_rf'
    feature_model_rf.columnmeta['model_type'] = np.full(
        feature_model_rf.shape[1], 'random_forest', dtype='object')
    feature_model_lr.columnlabels += '_lr'
    feature_model_lr.columnmeta['model_type'] = np.full(
        feature_model_lr.shape[1], 'logistic_regression', dtype='object')
    feature_model_rf.append(feature_model_lr, 1)
    feature_model = feature_model_rf
    del feature_model_rf, feature_model_lr

    # concatenate data matrices with model cross-validation performance stats
    print(
        'concatenating data matrices with model cross-validation performance stats...',
        flush=True)
    stat_model_rf_mean.rowlabels += '_mean'
    stat_model_rf_stdv.rowlabels += '_stdv'
    stat_model_rf_mean.append(stat_model_rf_stdv, 0)
    stat_model_rf_mean.columnlabels += '_rf'
    stat_model_rf_mean.columnmeta['model_type'] = np.full(
        stat_model_rf_mean.shape[1], 'random_forest', dtype='object')
    stat_model_lr_mean.rowlabels += '_mean'
    stat_model_lr_stdv.rowlabels += '_stdv'
    stat_model_lr_mean.append(stat_model_lr_stdv, 0)
    stat_model_lr_mean.columnlabels += '_lr'
    stat_model_lr_mean.columnmeta['model_type'] = np.full(
        stat_model_lr_mean.shape[1], 'logistic_regression', dtype='object')
    stat_model_rf_mean.append(stat_model_lr_mean, 1)
    stat_model = stat_model_rf_mean
    del stat_model_rf_mean

    # select simplest model (fewest features) with auroc and auprc within 95% of max
    # columns are ordered largest-to-smallest feature set, so [-1] (the last
    # qualifying index) is the model with the fewest features
    print(
        'selecting simplest model (fewest features) with auroc and auprc within 95% of max...',
        flush=True)
    model_scores = 0.5 * (stat_model.select('auroc_mean', []) +
                          stat_model.select('auprc_mean', []))
    if include_logistic_regression:
        selected_model_index = np.where(
            model_scores >= 0.95 * model_scores.max())[0][-1]
    else:
        selected_model_index = np.where(
            np.logical_and(
                model_scores >= 0.95 *
                model_scores[stat_model.columnmeta['model_type'] ==
                             'random_forest'].max(),
                stat_model.columnmeta['model_type'] ==
                'random_forest'))[0][-1]
    selected_model_name = stat_model.columnlabels[selected_model_index]
    selected_model_features = feature_model.rowlabels[
        feature_model.matrix[:, selected_model_index] != 0]
    selected_model_type = stat_model.columnmeta['model_type'][
        selected_model_index]
    selected_model = rfmodel if selected_model_type == 'random_forest' else lrmodel
    gene_atb = gene_atb.tolabels(columnlabels=selected_model_features)
    feature_model_selected = feature_model.tolabels(
        columnlabels=selected_model_name)
    stat_model_selected = stat_model.tolabels(columnlabels=selected_model_name)
    print(' selected_model_name: {0}'.format(selected_model_name), flush=True)
    print(' selected_model_features: {0}'.format(
        '|'.join(selected_model_features)), flush=True)

    # iterate over selected features to rebuild design matrix
    # pull each selected feature back from its source (generalizable_features)
    # dataset and merge on the intersection of gene labels
    print('iterating over selected features to rebuild design matrix...',
          flush=True)
    for i, (selected_feature, dataset_abbreviation) in enumerate(
            zip(gene_atb.columnlabels,
                gene_atb.columnmeta['dataset_abbreviation'])):
        # load dataset
        print(' loading dataset {0}...'.format(dataset_abbreviation),
              flush=True)
        dataset_path = 'datasets/generalizable_features/rep{0!s}_fold{1!s}/{2}.txt.gz'.format(
            validation_rep, validation_fold, dataset_abbreviation)
        gene_atb_i = datasetIO.load_datamatrix(dataset_path)
        gene_atb_i.columnmeta[
            'generalizability_pvalues_corrected'] = gene_atb_i.columnmeta[
                'generalizability_pvalues_corrected'].astype('float64')
        gene_atb_i.columnmeta['dataset_abbreviation'] = np.full(
            gene_atb_i.shape[1], dataset_abbreviation, dtype='object')
        gene_atb_i.columnmeta[
            'dataset_feature'] = gene_atb_i.columnlabels.copy()
        # suffix labels with the dataset so they match gene_atb.columnlabels
        gene_atb_i.columnlabels += '_' + dataset_abbreviation
        gene_atb_i.rowname = 'GeneSym'
        gene_atb_i.columnname = 'Feature'
        if dataset_abbreviation == 'gtextissue_cleaned':
            gene_atb_i.discard(gene_atb_i.rowlabels == 'C12ORF55',
                               0)  # pesky duplicate row
        print(gene_atb_i)
        # select feature
        print(' selecting feature {0}...'.format(selected_feature),
              flush=True)
        gene_atb_i.discard(gene_atb_i.columnlabels != selected_feature, 1)
        # merge dataset
        print(' merging dataset...', flush=True)
        if i == 0:
            gene_atb_selected = copy.deepcopy(gene_atb_i)
            gene_atb_selected.matrixname = 'merged_target_features'
            print(' first dataset, no merge...', flush=True)
        else:
            common_genes = np.intersect1d(gene_atb_selected.rowlabels,
                                          gene_atb_i.rowlabels)
            gene_atb_selected = gene_atb_selected.tolabels(
                rowlabels=common_genes)
            gene_atb_i = gene_atb_i.tolabels(rowlabels=common_genes)
            gene_atb_selected.append(gene_atb_i, 1)
            print(' common_genes: {0!s}...'.format(common_genes.size),
                  flush=True)

    # normalize features
    # min-max scale each column to [0, 1], keeping min/max for inversion
    print('normalizing features...', flush=True)
    gene_atb_selected.columnmeta['min'] = gene_atb_selected.matrix.min(0)
    gene_atb_selected.columnmeta['max'] = gene_atb_selected.matrix.max(0)
    gene_atb_selected.matrix = (
        gene_atb_selected.matrix -
        gene_atb_selected.columnmeta['min'].reshape(
            1, -1)) / (gene_atb_selected.columnmeta['max'].reshape(1, -1) -
                       gene_atb_selected.columnmeta['min'].reshape(1, -1))

    # update metadata
    print('updating metadata...', flush=True)
    assert (gene_atb.columnlabels == gene_atb_selected.columnlabels).all()
    for field, values in gene_atb.columnmeta.items():
        if field not in gene_atb_selected.columnmeta:
            gene_atb_selected.columnmeta[field] = values
    print('old_num_genes:{0!s}\tnew_num_genes:{1!s}'.format(
        gene_atb.shape[0], gene_atb_selected.shape[0]), flush=True)
    del gene_atb

    # refit selected model
    # recompute masks: the rebuilt matrix may have a different gene set
    print('refitting selected model...', flush=True)
    isvalidation = np.in1d(gene_atb_selected.rowlabels, validation_examples)
    isunknown = gene_atb_selected.rowmeta['class'] == 'unknown'
    istraintest = ~np.logical_or(isvalidation, isunknown)
    selected_model.fit(
        gene_atb_selected.matrix[istraintest, :],
        gene_atb_selected.rowmeta['class'][istraintest] == 'positive')

    # get predictions for validation and unlabelled examples
    print('getting predictions for validation and unlabelled examples...',
          flush=True)
    gene_model_selected = dataclasses.datamatrix(
        rowname=gene_atb_selected.rowname,
        rowlabels=gene_atb_selected.rowlabels.copy(),
        rowmeta=copy.deepcopy(gene_atb_selected.rowmeta),
        columnname=stat_model_selected.columnname,
        columnlabels=stat_model_selected.columnlabels.copy(),
        columnmeta=copy.deepcopy(stat_model_selected.columnmeta),
        matrixname=
        'success_probabilities_for_validation_and_unlabelled_examples',
        matrix=selected_model.predict_proba(
            gene_atb_selected.matrix)[:, selected_model.classes_ == 1])
    # keep only rows the model was NOT trained on
    gene_model_selected.discard(istraintest, 0)

    # save results
    print('saving {0!s} useful features and model results...'.format(
        gene_atb_selected.shape[1]), flush=True)
    dataset_info['path'] = '{0}/{1}.txt.gz'.format(
        results_folder, dataset_info['abbreviation'])
    dataset_info['selected_model_name'] = selected_model_name
    dataset_info['selected_model_features'] = '|'.join(selected_model_features)
    dataset_info['selected_model_type'] = selected_model_type
    dataset_info['crossvalidation_reps'] = reps
    dataset_info['crossvalidation_folds'] = folds
    dataset_info['rf_trees'] = rf_trees
    dataset_info['include_logistic_regression'] = include_logistic_regression
    for stat_name, stat_values in zip(stat_model_selected.rowlabels,
                                      stat_model_selected.matrix):
        dataset_info[stat_name] = stat_values.item()
    datasetIO.save_datamatrix(dataset_info['path'], gene_atb_selected)
    datasetIO.save_datamatrix('{0}/stat_model.txt.gz'.format(results_folder),
                              stat_model)
    datasetIO.save_datamatrix(
        '{0}/feature_model.txt.gz'.format(results_folder), feature_model)
    datasetIO.save_datamatrix(
        '{0}/stat_model_selected.txt.gz'.format(results_folder),
        stat_model_selected)
    datasetIO.save_datamatrix(
        '{0}/feature_model_selected.txt.gz'.format(results_folder),
        feature_model_selected)
    datasetIO.save_datamatrix(
        '{0}/gene_model_selected.txt.gz'.format(results_folder),
        gene_model_selected)
    datasetIO.append_datasetinfo('{0}/dataset_info.txt'.format(results_folder),
                                 dataset_info)
    print('done.', flush=True)
def main(validation_rep: int = 0, validation_fold: int = 0) -> None:
    """Select features that generalize across gene families.

    For every dataset listed in datasets/nonredundant_features/dataset_info.txt,
    tests each feature for association with the 'positive' class using a
    group(cluster)-preserved permutation test, applies multiple hypothesis
    testing correction, writes a per-feature report, and saves only the
    generalizable features under
    datasets/generalizable_features/rep<validation_rep>_fold<validation_fold>/.

    Held-out validation examples (one gene symbol per line in
    targets/validation_examples/rep<rep>_fold<fold>.txt) and genes with
    class 'unknown' are excluded from the significance calculation.

    Parameters
    ----------
    validation_rep : int
        Cross-validation repetition index; selects the validation example file
        and the results subfolder.
    validation_fold : int
        Cross-validation fold index; selects the validation example file and
        the results subfolder.
    """
    # load target clusters
    # maps gene symbol -> integer family/cluster id; used so permutations
    # shuffle labels within gene families rather than freely
    print('loading target cluster assignments...', flush=True)
    target_cluster_path = 'targets/clusters/gene_cluster_byfamily.pickle'
    gene_cluster = datasetIO.load_clusterassignments(target_cluster_path)
    # load dataset info
    print('loading dataset info...', flush=True)
    dataset_info_path = 'datasets/nonredundant_features/dataset_info.txt'
    dataset_infos = datasetIO.load_datasetinfo(dataset_info_path)
    # load validation examples
    print('loading validation examples...', flush=True)
    validation_examples_path = 'targets/validation_examples/rep{0!s}_fold{1!s}.txt'.format(
        validation_rep, validation_fold)
    with open(validation_examples_path,
              mode='rt',
              encoding='utf-8',
              errors='surrogateescape') as fr:
        validation_examples = fr.read().split('\n')
    # specify results folder
    # creates each path component in turn (pre-makedirs idiom); safe if parts
    # already exist
    print('specifying results folder...', flush=True)
    results_folder = 'datasets/generalizable_features/rep{0!s}_fold{1!s}'.format(
        validation_rep, validation_fold)
    results_folder_parts = results_folder.split('/')
    for i in range(len(results_folder_parts)):
        results_folder_part = '/'.join(results_folder_parts[:i + 1])
        if not os.path.exists(results_folder_part):
            os.mkdir(results_folder_part)
    # iterate over datasets
    print('iterating over datasets...', flush=True)
    for dataset_info in dataset_infos:
        # # just work with hpatissuesmrna for testing/debugging the pipeline
        # if dataset_info['abbreviation'] != 'hpatissuesmrna_cleaned':
        #     print('skipping {0}. not in testing set...'.format(dataset_info['abbreviation']), flush=True)
        #     continue
        # check if another python instance is already working on this dataset
        # NOTE(review): the _in_progress marker is created below but never
        # removed in this function — presumably cleaned up externally; a crash
        # mid-dataset leaves that dataset permanently skipped. Confirm.
        if os.path.exists('{0}/{1}_in_progress.txt'.format(
                results_folder, dataset_info['abbreviation'])):
            print('skipping {0}. already in progress...'.format(
                dataset_info['abbreviation']), flush=True)
            continue
        # log start of processing
        with open('{0}/{1}_in_progress.txt'.format(
                results_folder, dataset_info['abbreviation']),
                mode='wt',
                encoding='utf-8',
                errors='surrogateescape') as fw:
            print('working on {0}...'.format(dataset_info['abbreviation']),
                  flush=True)
            fw.write('working on {0}...'.format(dataset_info['abbreviation']))
        # load dataset
        print('loading dataset...', flush=True)
        gene_atb = datasetIO.load_datamatrix(datasetpath=dataset_info['path'])
        # specify feature generalizability test parameters
        print('specifying feature generalizability test parameters...',
              flush=True)
        dataset_info[
            'feature_generalizability_test_function'] = featureselection.univariate_grouppreserved_permtest
        dataset_info[
            'feature_generalizability_test_permutations'] = 10000  # 100000
        dataset_info[
            'feature_generalizability_test_targetclusterpath'] = target_cluster_path
        dataset_info[
            'multiple_hypothesis_testing_correction_function'] = featureselection.multiple_hypothesis_testing_correction
        # 'fdr_by' = Benjamini-Yekutieli false discovery rate control
        dataset_info['multiple_hypothesis_testing_correction_method'] = 'fdr_by'
        dataset_info['multiple_hypothesis_testing_correction_threshold'] = 0.05
        print('    feature_generalizability_test_function: {0}'.format(
            dataset_info['feature_generalizability_test_function']),
              flush=True)
        print('    feature_generalizability_test_permutations: {0!s}'.format(
            dataset_info['feature_generalizability_test_permutations']),
              flush=True)
        print('    feature_generalizability_test_targetclusterpath: {0}'.format(
            dataset_info['feature_generalizability_test_targetclusterpath']),
              flush=True)
        print('    multiple_hypothesis_testing_correction_function: {0}'.format(
            dataset_info['multiple_hypothesis_testing_correction_function']),
              flush=True)
        print('    multiple_hypothesis_testing_correction_method: {0}'.format(
            dataset_info['multiple_hypothesis_testing_correction_method']),
              flush=True)
        print('    multiple_hypothesis_testing_correction_threshold: {0!s}'.
              format(dataset_info[
                  'multiple_hypothesis_testing_correction_threshold']),
              flush=True)
        # exclude validation and unlabeled examples from significance calculation
        print(
            'excluding validation and unlabeled examples from significance calculation...',
            flush=True)
        isvalidation = np.in1d(gene_atb.rowlabels, validation_examples)
        isunknown = gene_atb.rowmeta['class'] == 'unknown'
        istraintest = ~np.logical_or(isvalidation, isunknown)
        # compute feature generalizability with multiple hypothesis testing correction
        print(
            'computing feature generalizability with multiple hypothesis testing correction...',
            flush=True)
        # genes missing from the cluster map get sentinel cluster -1
        gene_atb.rowmeta['cluster'] = np.array([
            gene_cluster[g] if g in gene_cluster else -1
            for g in gene_atb.rowlabels
        ],
                                               dtype='int64')
        gene_atb.columnmeta[
            'generalizability_test_statistic_values'], gene_atb.columnmeta[
                'generalizability_pvalues'] = dataset_info[
                    'feature_generalizability_test_function'](
                        X=gene_atb.matrix[istraintest, :],
                        Y=(gene_atb.rowmeta['class'][istraintest] == 'positive'
                           ),
                        G=gene_atb.rowmeta['cluster'][istraintest],
                        numperm=dataset_info[
                            'feature_generalizability_test_permutations'])
        gene_atb.columnmeta['is_generalizable'], gene_atb.columnmeta[
            'generalizability_pvalues_corrected'] = dataset_info[
                'multiple_hypothesis_testing_correction_function'](
                    gene_atb.columnmeta['generalizability_pvalues'],
                    alpha=dataset_info[
                        'multiple_hypothesis_testing_correction_threshold'],
                    method=dataset_info[
                        'multiple_hypothesis_testing_correction_method'])
        gene_atb.columnmeta['generalizability_correlation_sign'] = np.sign(
            gene_atb.columnmeta['generalizability_test_statistic_values'])
        # a pvalue below 1/numperm cannot be resolved by the permutation test
        if (gene_atb.columnmeta['generalizability_pvalues'] <
                1 / dataset_info['feature_generalizability_test_permutations']
            ).any():
            print(
                '    warning: not enough permutations to establish all pvalues...',
                flush=True)
        # drop features whose (corrected) pvalue came back NaN
        tobediscarded = np.logical_or(
            np.isnan(gene_atb.columnmeta['generalizability_pvalues']),
            np.isnan(
                gene_atb.columnmeta['generalizability_pvalues_corrected']))
        if tobediscarded.any():
            gene_atb.discard(tobediscarded, axis=1)
        # prioritize features
        # sort columns by ascending corrected pvalue (most generalizable first)
        print('prioritizing features...', flush=True)
        sortedindices = np.argsort(
            gene_atb.columnmeta['generalizability_pvalues_corrected'])
        gene_atb.reorder(sortedindices, axis=1)
        # save feature generalizability info
        # tab-delimited report: one row per feature, in sorted order
        print('saving feature generalizability info...', flush=True)
        with open('{0}/{1}_feature_generalizability_info.txt'.format(
                results_folder, dataset_info['abbreviation']),
                  mode='wt',
                  encoding='utf-8',
                  errors='surrogateescape') as fw:
            writelist = [
                'dataset', 'abbreviation', 'feature',
                'generalizability_test_statistic', 'generalizability_pvalue',
                'generalizability_pvalue_corrected', 'is_generalizable',
                'generalizability_correlation_sign', 'preferred_rowstat',
                'similar_features'
            ]
            fw.write('\t'.join(writelist) + '\n')
            for j, feature in enumerate(gene_atb.columnlabels):
                writelist = [
                    dataset_info['name'], dataset_info['abbreviation'],
                    feature,
                    '{0:1.5g}'.format(gene_atb.columnmeta[
                        'generalizability_test_statistic_values'][j]),
                    '{0:1.5g}'.format(
                        gene_atb.columnmeta['generalizability_pvalues'][j]),
                    '{0:1.5g}'.format(
                        gene_atb.
                        columnmeta['generalizability_pvalues_corrected'][j]),
                    '{0:1.5g}'.format(
                        gene_atb.columnmeta['is_generalizable'][j]),
                    '{0:1.5g}'.format(
                        gene_atb.
                        columnmeta['generalizability_correlation_sign'][j]),
                    gene_atb.columnmeta['preferred_rowstat'][j],
                    gene_atb.columnmeta['similar_features'][j]
                ]
                fw.write('\t'.join(writelist) + '\n')
        # discard features that are not generalizable
        print('discarding features that are not generalizable...', flush=True)
        tobediscarded = ~gene_atb.columnmeta['is_generalizable']
        if tobediscarded.any():
            # discard features
            print('    discarding {0!s} features. {1!s} features remaining...'.
                  format(tobediscarded.sum(), (~tobediscarded).sum()),
                  flush=True)
            gene_atb.discard(tobediscarded, axis=1)
        else:
            # keep all features
            print('    no features to discard. {0!s} features remaining...'.
                  format(gene_atb.shape[1]), flush=True)
        # save if dataset has content
        print('saving if dataset has content...', flush=True)
        if gene_atb.shape[0] == 0 or gene_atb.shape[1] == 0:
            # no content
            print('    nothing to save...', flush=True)
        else:
            # save generalizable features
            print('    saving {0!s} generalizable features...'.format(
                gene_atb.shape[1]), flush=True)
            dataset_info['path'] = '{0}/{1}.txt.gz'.format(
                results_folder, dataset_info['abbreviation'])
            dataset_info['generalizable_genes'] = gene_atb.shape[0]
            dataset_info['generalizable_features'] = gene_atb.shape[1]
            # replace function objects with their dotted names so dataset_info
            # stays serializable as plain text
            dataset_info[
                'feature_generalizability_test_function'] = 'featureselection.univariate_grouppreserved_permtest'
            dataset_info[
                'multiple_hypothesis_testing_correction_function'] = 'featureselection.multiple_hypothesis_testing_correction'
            datasetIO.save_datamatrix(dataset_info['path'], gene_atb)
            datasetIO.append_datasetinfo(
                '{0}/dataset_info.txt'.format(results_folder), dataset_info)
    print('done.', flush=True)
def main(validation_rep: int = 0, validation_fold: int = 0) -> None:
    """Merge generalizable features from multiple datasets and deduplicate.

    Loads every dataset saved by the generalizable-features step for the given
    rep/fold, merges them on common genes into one matrix, min-max normalizes
    each feature, then collapses groups of mutually similar features (|cosine|
    > sqrt(0.5)), keeping from each group the feature with the best corrected
    generalizability pvalue. Results go to
    datasets/merged_features/rep<validation_rep>_fold<validation_fold>/.

    Parameters
    ----------
    validation_rep : int
        Cross-validation repetition index; selects source and results folders.
    validation_fold : int
        Cross-validation fold index; selects source and results folders.
    """
    # load dataset info
    print('loading dataset info...', flush=True)
    dataset_info_path = 'datasets/generalizable_features/rep{0!s}_fold{1!s}/dataset_info.txt'.format(
        validation_rep, validation_fold)
    dataset_infos = datasetIO.load_datasetinfo(dataset_info_path)
    # specify results folder
    # creates each path component in turn (pre-makedirs idiom)
    print('specifying results folder...', flush=True)
    results_folder = 'datasets/merged_features/rep{0!s}_fold{1!s}'.format(
        validation_rep, validation_fold)
    results_folder_parts = results_folder.split('/')
    for i in range(len(results_folder_parts)):
        results_folder_part = '/'.join(results_folder_parts[:i + 1])
        if not os.path.exists(results_folder_part):
            os.mkdir(results_folder_part)
    # exclude mouse and small datasets
    # generalizable_genes is stored as a string in dataset_info.txt, hence int()
    print('excluding mouse datasets and datasets with few genes...',
          flush=True)
    dataset_infos = [
        dataset_info for dataset_info in dataset_infos
        if 'mouse' not in dataset_info['abbreviation']
        and int(dataset_info['generalizable_genes']) > 1900
    ]
    # exclude brain atlas datasets unless they're the only choice
    not_brainatlas = [
        'brainatlas' not in dataset_info['abbreviation']
        for dataset_info in dataset_infos
    ]
    if sum(not_brainatlas) > 0:
        print('excluding brain atlas datasets...', flush=True)
        dataset_infos = [
            dataset_info
            for dataset_info, nba in zip(dataset_infos, not_brainatlas) if nba
        ]
    # iterate over datasets
    print('iterating over datasets...', flush=True)
    for i, dataset_info in enumerate(dataset_infos):
        # load dataset
        print('loading dataset {0}...'.format(dataset_info['abbreviation']),
              flush=True)
        gene_atb_i = datasetIO.load_datamatrix(
            datasetpath=dataset_info['path'])
        gene_atb_i.columnmeta[
            'generalizability_pvalues_corrected'] = gene_atb_i.columnmeta[
                'generalizability_pvalues_corrected'].astype('float64')
        # remember provenance, then suffix column labels with the dataset
        # abbreviation so labels stay unique after the merge
        gene_atb_i.columnmeta['dataset_abbreviation'] = np.full(
            gene_atb_i.shape[1], dataset_info['abbreviation'], dtype='object')
        gene_atb_i.columnmeta[
            'dataset_feature'] = gene_atb_i.columnlabels.copy()
        gene_atb_i.columnlabels += '_' + dataset_info['abbreviation']
        gene_atb_i.rowname = 'GeneSym'
        gene_atb_i.columnname = 'Feature'
        if dataset_info['abbreviation'] == 'gtextissue_cleaned':
            gene_atb_i.discard(gene_atb_i.rowlabels == 'C12ORF55',
                               0)  # pesky duplicate row
        print(gene_atb_i)
        # merge dataset
        # intersect on gene symbols, then append columns
        print('merging dataset...', flush=True)
        if i == 0:
            gene_atb = copy.deepcopy(gene_atb_i)
            gene_atb.matrixname = 'merged_target_features'
            print('    first dataset, no merge...', flush=True)
        else:
            common_genes = np.intersect1d(gene_atb.rowlabels,
                                          gene_atb_i.rowlabels)
            gene_atb = gene_atb.tolabels(rowlabels=common_genes)
            gene_atb_i = gene_atb_i.tolabels(rowlabels=common_genes)
            gene_atb.append(gene_atb_i, 1)
            print('    common_genes: {0!s}...'.format(common_genes.size),
                  flush=True)
    # specify merged dataset info
    print('specifying merged dataset info...', flush=True)
    dataset_info = {
        'abbreviation': 'merged',
        'name': 'Merged Generalizable Target Features',
        'path': '{0}/{1}.txt.gz'.format(results_folder, 'merged'),
        'feature_normalization': 'min-max',
        'feature_similarity_metric': 'cosine',
        'feature_similarity_threshold': np.sqrt(0.5),
        'genes': gene_atb.shape[0],
        'features': gene_atb.shape[1],
        'positives': (gene_atb.rowmeta['class'] == 'positive').sum(),
        'negatives': (gene_atb.rowmeta['class'] == 'negative').sum(),
        'unknowns': (gene_atb.rowmeta['class'] == 'unknown').sum()
    }
    for field, entry in dataset_info.items():
        print('    {0}: {1!s}'.format(field, entry), flush=True)
    # normalize features
    # min-max scale each column to [0, 1]
    # NOTE(review): a constant column (max == min) would divide by zero here —
    # presumably upstream filtering prevents constant features; confirm.
    print('normalizing features...', flush=True)
    gene_atb.columnmeta['min'] = gene_atb.matrix.min(0)
    gene_atb.columnmeta['max'] = gene_atb.matrix.max(0)
    gene_atb.matrix = (gene_atb.matrix - gene_atb.columnmeta['min'].reshape(
        1, -1)) / (gene_atb.columnmeta['max'].reshape(1, -1) -
                   gene_atb.columnmeta['min'].reshape(1, -1))
    # prioritize features
    print('prioritizing features by generalizability_pvalues_corrected...',
          flush=True)
    sortedindices = np.argsort(
        gene_atb.columnmeta['generalizability_pvalues_corrected'])
    gene_atb.reorder(sortedindices, axis=1)
    # calculate feature similarity
    print('calculating feature similarity...', flush=True)
    atb_atb = gene_atb.tosimilarity(
        axis=1, metric=dataset_info['feature_similarity_metric'])
    # prioritize feature groups
    # group size = number of similar features; score = mean |similarity|
    # within the group (self-similarity guarantees size >= 1)
    print('prioritizing feature groups...', flush=True)
    are_similar_features = np.abs(
        atb_atb.matrix) > dataset_info['feature_similarity_threshold']
    feature_group_size = are_similar_features.sum(1).astype('float64')
    feature_group_score = (np.abs(atb_atb.matrix) *
                           are_similar_features).sum(1) / feature_group_size
    # summary stats 'mean' and 'stdv' get priority within their groups
    feature_priority = np.zeros(gene_atb.shape[1], dtype='float64')
    feature_priority[gene_atb.columnmeta['dataset_feature'] == 'mean'] = 1.0
    feature_priority[gene_atb.columnmeta['dataset_feature'] == 'stdv'] = 0.5
    # stable multi-key sort: successive sorts give precedence to the LAST
    # key applied, i.e. group size, then priority, then group score
    feature_infos = list(
        zip(np.arange(gene_atb.shape[1], dtype='int64'),
            gene_atb.columnlabels.copy(), feature_group_size.copy(),
            feature_priority.copy(), feature_group_score.copy()))
    feature_infos.sort(key=itemgetter(4), reverse=True)
    feature_infos.sort(key=itemgetter(3), reverse=True)
    feature_infos.sort(key=itemgetter(2), reverse=True)
    # for feature_info in feature_infos:
    #     print('{0:1.3g}, {1}, {2:1.3g}, {3:1.3g}, {4:1.3g}'.format(feature_info[0], feature_info[1], feature_info[2], feature_info[3], feature_info[4]))
    sorted_feature_indices = np.array(
        [feature_info[0] for feature_info in feature_infos], dtype='int64')
    atb_atb.reorder(sorted_feature_indices, axis=0)
    atb_atb.reorder(sorted_feature_indices, axis=1)
    gene_atb.reorder(sorted_feature_indices, axis=1)
    are_similar_features = are_similar_features[
        sorted_feature_indices, :][:, sorted_feature_indices]
    # group similar features
    # walk columns in priority order; each surviving column absorbs every
    # not-yet-discarded similar column to its right, keeps the group member
    # with the best corrected pvalue, and marks the rest for discard
    print('grouping similar features...', flush=True)
    tobediscarded = np.zeros(gene_atb.shape[1], dtype='bool')
    gene_atb.columnmeta['similar_features'] = np.full(gene_atb.shape[1],
                                                      '',
                                                      dtype='object')
    with open('{0}/{1}_feature_groups.txt'.format(
            results_folder, dataset_info['abbreviation']),
              mode='wt',
              encoding='utf-8',
              errors='surrogateescape') as fw:
        for i, feature in enumerate(gene_atb.columnlabels):
            if ~tobediscarded[i]:
                # find similar features
                print(
                    '    finding features similar to feature "{0}"...'.format(
                        feature),
                    flush=True)
                similarity_hit = are_similar_features[i, :]
                similarity_hit = np.logical_and(
                    similarity_hit, ~tobediscarded)  # just what's new
                similarity_hit[:i] = False
                similar_features = gene_atb.columnlabels[similarity_hit]
                similarity_values = atb_atb.matrix[i, similarity_hit]
                generalizability_pvalues_corrected = gene_atb.columnmeta[
                    'generalizability_pvalues_corrected'][similarity_hit]
                # order group members by ascending corrected pvalue (best first)
                si = np.argsort(generalizability_pvalues_corrected)
                similar_features = similar_features[si]
                similarity_values = similarity_values[si]
                generalizability_pvalues_corrected = generalizability_pvalues_corrected[
                    si]
                print(
                    '        similar_feature, similarity_value, generalizability_pvalue_corrected',
                    flush=True)
                for similar_feature, similarity_value, generalizability_pvalue_corrected in zip(
                        similar_features, similarity_values,
                        generalizability_pvalues_corrected):
                    print('        {0}, {1:1.3g}, {2:1.3g}'.format(
                        similar_feature, similarity_value,
                        generalizability_pvalue_corrected),
                          flush=True)
                # replace feature with best similar feature
                j = np.where(
                    gene_atb.columnlabels == similar_features[0])[0][0]
                gene_atb.columnmeta['similar_features'][j] = '|'.join(
                    similar_features.tolist())
                print(
                    '        replacing feature "{0}" with best similar feature "{1}"...'
                    .format(feature, gene_atb.columnlabels[j]),
                    flush=True)
                # copy the best member's data and metadata into slot i;
                # the source slot j (if different) is discarded below via
                # similarity_hit
                gene_atb.matrix[:, i] = gene_atb.matrix[:, j]
                gene_atb.columnlabels[i] = gene_atb.columnlabels[j]
                for field in gene_atb.columnmeta.keys():
                    gene_atb.columnmeta[field][i] = gene_atb.columnmeta[field][
                        j]
                fw.write('\t'.join([
                    '{0}|{1:1.6g}|{2:1.6g}'.format(f, s, p)
                    for f, s, p in zip(similar_features, similarity_values,
                                       generalizability_pvalues_corrected)
                ]) + '\n')
                similarity_hit[i] = False
                tobediscarded = np.logical_or(tobediscarded, similarity_hit)
    # discard features absorbed into group features
    print('discarding features absorbed into group features...', flush=True)
    if tobediscarded.any():
        # discard features
        print('    discarding {0!s} features. {1!s} features remaining...'.
              format(tobediscarded.sum(), (~tobediscarded).sum()),
              flush=True)
        gene_atb.discard(tobediscarded, axis=1)
    else:
        # keep all features
        print('    no features to discard. {0!s} features remaining...'.format(
            gene_atb.shape[1]),
              flush=True)
    # save if dataset has content
    print('saving if dataset has content...', flush=True)
    if gene_atb.shape[0] == 0 or gene_atb.shape[1] == 0:
        # no content
        print('    nothing to save...', flush=True)
    else:
        # save merged nonredundant features
        print('    saving {0!s} merged nonredundant features...'.format(
            gene_atb.shape[1]),
              flush=True)
        dataset_info['nonredundant_genes'] = gene_atb.shape[0]
        dataset_info['nonredundant_features'] = gene_atb.shape[1]
        datasetIO.save_datamatrix(dataset_info['path'], gene_atb)
        datasetIO.append_datasetinfo(
            '{0}/dataset_info.txt'.format(results_folder), dataset_info)
    print('done.', flush=True)
def main() -> None:
    """Select statistically significant features within each dataset.

    For every dataset listed in datasets/nonredundant_features/dataset_info.txt,
    tests each feature for association with the 'positive' class using a
    univariate permutation test on the labeled (non-'unknown') genes, applies
    multiple hypothesis testing correction (Benjamini-Yekutieli FDR at 0.05),
    writes a per-feature report, and saves only the significant features to
    datasets/significant_features/.
    """
    # load dataset info
    print('loading dataset info...', flush=True)
    dataset_info_path = 'datasets/nonredundant_features/dataset_info.txt'
    dataset_infos = datasetIO.load_datasetinfo(dataset_info_path)
    # specify results folder
    print('specifying results folder...', flush=True)
    results_folder = 'datasets/significant_features'
    if not os.path.exists(results_folder):
        os.mkdir(results_folder)
    # iterate over datasets
    print('iterating over datasets...', flush=True)
    for dataset_info in dataset_infos:
        # # just work with hpatissuesmrna for testing/debugging the pipeline
        # if dataset_info['abbreviation'] != 'hpatissuesmrna_cleaned':
        #     print('skipping {0}. not in testing set...'.format(dataset_info['abbreviation']), flush=True)
        #     continue
        # check if another python instance is already working on this dataset
        # NOTE(review): the _in_progress marker is created below but never
        # removed in this function — presumably cleaned up externally; a crash
        # mid-dataset leaves that dataset permanently skipped. Confirm.
        if os.path.exists('{0}/{1}_in_progress.txt'.format(
                results_folder, dataset_info['abbreviation'])):
            print('skipping {0}. already in progress...'.format(
                dataset_info['abbreviation']), flush=True)
            continue
        # log start of processing
        with open('{0}/{1}_in_progress.txt'.format(
                results_folder, dataset_info['abbreviation']),
                mode='wt',
                encoding='utf-8',
                errors='surrogateescape') as fw:
            print('working on {0}...'.format(dataset_info['abbreviation']),
                  flush=True)
            fw.write('working on {0}...'.format(dataset_info['abbreviation']))
        # load dataset
        print('loading dataset...', flush=True)
        gene_atb = datasetIO.load_datamatrix(datasetpath=dataset_info['path'])
        # specify feature significance test parameters
        print('specifying feature significance test parameters...', flush=True)
        dataset_info[
            'feature_significance_test_function'] = featureselection.univariate_permtest
        dataset_info['feature_significance_test_permutations'] = 100000
        dataset_info[
            'multiple_hypothesis_testing_correction_function'] = featureselection.multiple_hypothesis_testing_correction
        # 'fdr_by' = Benjamini-Yekutieli false discovery rate control
        dataset_info[
            'multiple_hypothesis_testing_correction_method'] = 'fdr_by'
        dataset_info['multiple_hypothesis_testing_correction_threshold'] = 0.05
        print('    feature_significance_test_function: {0}'.format(
            dataset_info['feature_significance_test_function']), flush=True)
        print('    feature_significance_test_permutations: {0!s}'.format(
            dataset_info['feature_significance_test_permutations']),
              flush=True)
        print('    multiple_hypothesis_testing_correction_function: {0}'.format(
            dataset_info['multiple_hypothesis_testing_correction_function']),
              flush=True)
        print('    multiple_hypothesis_testing_correction_method: {0}'.format(
            dataset_info['multiple_hypothesis_testing_correction_method']),
              flush=True)
        print('    multiple_hypothesis_testing_correction_threshold: {0!s}'.
              format(dataset_info[
                  'multiple_hypothesis_testing_correction_threshold']),
              flush=True)
        # compute feature significance with multiple hypothesis testing correction
        # only labeled genes (class 'positive'/'negative') enter the test
        print(
            'computing feature significance with multiple hypothesis testing correction...',
            flush=True)
        isunknown = gene_atb.rowmeta['class'] == 'unknown'
        gene_atb.columnmeta['test_statistic_values'], gene_atb.columnmeta[
            'pvalues'] = dataset_info['feature_significance_test_function'](
                X=gene_atb.matrix[~isunknown, :],
                Y=(gene_atb.rowmeta['class'][~isunknown] == 'positive'),
                numperm=dataset_info['feature_significance_test_permutations'])
        gene_atb.columnmeta['is_significant'], gene_atb.columnmeta[
            'pvalues_corrected'] = dataset_info[
                'multiple_hypothesis_testing_correction_function'](
                    gene_atb.columnmeta['pvalues'],
                    alpha=dataset_info[
                        'multiple_hypothesis_testing_correction_threshold'],
                    method=dataset_info[
                        'multiple_hypothesis_testing_correction_method'])
        gene_atb.columnmeta['correlation_sign'] = np.sign(
            gene_atb.columnmeta['test_statistic_values'])
        # a pvalue below 1/numperm cannot be resolved by the permutation test
        if (gene_atb.columnmeta['pvalues'] <
                1 / dataset_info['feature_significance_test_permutations']).any():
            print(
                '    warning: not enough permutations to establish all pvalues...',
                flush=True)
        # drop features whose (corrected) pvalue came back NaN
        tobediscarded = np.logical_or(
            np.isnan(gene_atb.columnmeta['pvalues']),
            np.isnan(gene_atb.columnmeta['pvalues_corrected']))
        if tobediscarded.any():
            gene_atb.discard(tobediscarded, axis=1)
        # prioritize features
        # sort columns by ascending corrected pvalue (most significant first)
        print('prioritizing features...', flush=True)
        sortedindices = np.argsort(gene_atb.columnmeta['pvalues_corrected'])
        gene_atb.reorder(sortedindices, axis=1)
        # save feature significance info
        # tab-delimited report: one row per feature, in sorted order
        print('saving feature significance info...', flush=True)
        with open('{0}/{1}_feature_significance_info.txt'.format(
                results_folder, dataset_info['abbreviation']),
                  mode='wt',
                  encoding='utf-8',
                  errors='surrogateescape') as fw:
            writelist = [
                'dataset', 'abbreviation', 'feature', 'test_statistic',
                'pvalue', 'pvalue_corrected', 'is_significant',
                'correlation_sign', 'preferred_rowstat', 'similar_features'
            ]
            fw.write('\t'.join(writelist) + '\n')
            for j, feature in enumerate(gene_atb.columnlabels):
                writelist = [
                    dataset_info['name'], dataset_info['abbreviation'],
                    feature,
                    '{0:1.5g}'.format(
                        gene_atb.columnmeta['test_statistic_values'][j]),
                    '{0:1.5g}'.format(gene_atb.columnmeta['pvalues'][j]),
                    '{0:1.5g}'.format(
                        gene_atb.columnmeta['pvalues_corrected'][j]),
                    '{0:1.5g}'.format(
                        gene_atb.columnmeta['is_significant'][j]),
                    '{0:1.5g}'.format(
                        gene_atb.columnmeta['correlation_sign'][j]),
                    gene_atb.columnmeta['preferred_rowstat'][j],
                    gene_atb.columnmeta['similar_features'][j]
                ]
                fw.write('\t'.join(writelist) + '\n')
        # discard features that are not significant
        print('discarding features that are not significant...', flush=True)
        tobediscarded = ~gene_atb.columnmeta['is_significant']
        if tobediscarded.any():
            # discard features
            print('    discarding {0!s} features. {1!s} features remaining...'.
                  format(tobediscarded.sum(), (~tobediscarded).sum()),
                  flush=True)
            gene_atb.discard(tobediscarded, axis=1)
        else:
            # keep all features
            print('    no features to discard. {0!s} features remaining...'.
                  format(gene_atb.shape[1]), flush=True)
        # save if dataset has content
        print('saving if dataset has content...', flush=True)
        if gene_atb.shape[0] == 0 or gene_atb.shape[1] == 0:
            # no content
            print('    nothing to save...', flush=True)
        else:
            # save significant features
            print('    saving {0!s} significant features...'.format(
                gene_atb.shape[1]), flush=True)
            dataset_info['path'] = '{0}/{1}.txt.gz'.format(
                results_folder, dataset_info['abbreviation'])
            dataset_info['significant_genes'] = gene_atb.shape[0]
            dataset_info['significant_features'] = gene_atb.shape[1]
            # replace function objects with their dotted names so dataset_info
            # stays serializable as plain text
            dataset_info[
                'feature_significance_test_function'] = 'featureselection.univariate_permtest'
            dataset_info[
                'multiple_hypothesis_testing_correction_function'] = 'featureselection.multiple_hypothesis_testing_correction'
            datasetIO.save_datamatrix(dataset_info['path'], gene_atb)
            datasetIO.append_datasetinfo(
                '{0}/dataset_info.txt'.format(results_folder), dataset_info)
    print('done.', flush=True)