def main():
    '''Run for each ontology and for three levels of term counts.'''
    # Load GO DAG
    go = GO("experimental", "computational", "curated",
            go_dag_path='$CEREVISIAEDATA/go.obo',
            associations_path='$CEREVISIAEDATA/gene_association.sgd')
    go.load_go_dag()
    go_dag = go.go_dag

    # Set `vmin` and `vmax` for each level of term counts
    ontology_sizes = [(101, 300), (31, 100), (11, 30)]

    # Names of the three ontologies
    ontologies = ['P', 'F', 'C']

    # Save arrays for each ontology and level
    associations_ontologies_levels = {}

    # Loop over ontologies
    for ontology in ontologies:
        print('Calculating for ontology', ontology)

        # Get dict mapping genes to GO terms
        associations = go.get_associations(ontology)

        # Add parent terms
        associations = propagate_parent_terms(associations, go_dag)

        # Get dicts for mapping to array indexes
        gene_indexes = get_gene_index()
        go_id_indexes = get_go_id_index(associations)

        # Loop over levels of term counts
        for idx, (vmin, vmax) in enumerate(ontology_sizes):
            print('Min/max term counts', vmin, vmax)

            # Get associations between genes and GO terms
            M = np.zeros((max(go_id_indexes.values()) + 1,
                          max(gene_indexes.values()) + 1))
            print(M.shape)
            M = fill_array_of_associations(
                M, associations, gene_indexes, go_id_indexes)
            M = get_subarray_by_term_counts(M, vmin, vmax)
            print('Shape before filtering terms by Jaccard similarity', M.shape)

            # Filter terms by Jaccard similarity
            # M = filter_similar_terms(M)
            print('Shape after filtering terms by Jaccard similarity', M.shape)

            # Save array
            associations_ontologies_levels[f'{ontology}_{idx + 1}'] = M.T

    # Save `associations_ontologies_levels` to a .mat file
    output_dir = os.path.join(
        os.path.expandvars('$CEREVISIAEDATA'), 'deepNF', 'annotations')
    directory_exists(output_dir)
    output_file = 'yeast_annotations.mat'
    io.savemat(
        os.path.join(output_dir, output_file),
        associations_ontologies_levels,
        do_compression=True)
    print(f'{output_file} saved to {output_dir}')
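# `propagate_parent_terms` is called above but not shown. A minimal sketch of
# what it might look like, assuming `go_dag` is a GOATOOLS GODag whose GOTerm
# entries expose `get_all_parents()` (the set of all ancestor term IDs):
def propagate_parent_terms(associations, go_dag):
    '''Extend each gene's GO terms with all ancestor terms (true path rule).'''
    propagated = {}
    for gene, go_ids in associations.items():
        terms = set(go_ids)
        for go_id in go_ids:
            if go_id in go_dag:
                # A gene annotated to a term is implicitly annotated
                # to every parent of that term
                terms |= go_dag[go_id].get_all_parents()
        propagated[gene] = terms
    return propagated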
def main():
    ######################
    # Prepare filesystem #
    ######################

    directory_exists(models_path)
    mkdir(results_path)

    ###################
    # Load embeddings #
    ###################

    embeddings_file = glob.glob(os.path.join(models_path, '*.mat'))[0]
    model_name = os.path.splitext(os.path.basename(embeddings_file))[0]
    print(model_name)
    stdout('Loading embeddings', embeddings_file)
    embeddings = load_embeddings(embeddings_file)
    embeddings = minmax_scale(embeddings)

    #######################
    # Load GO annotations #
    #######################

    annotation_dir = os.path.join(data_path, 'annotations')
    if validation == 'cerevisiae':
        annotation_file = os.path.join(annotation_dir,
                                       'cerevisiae_annotations.mat')
    else:
        annotation_file = os.path.join(annotation_dir, 'yeast_annotations.mat')
    stdout('Loading GO annotations', annotation_file)
    GO = sio.loadmat(annotation_file)

    ####################
    # Train classifier #
    ####################

    stdout('Running cross-validation for', level)
    annotations = GO[level]

    # Silence certain warning messages during cross-validation
    for w in (sklearn.exceptions.UndefinedMetricWarning, UserWarning,
              RuntimeWarning):
        warnings.filterwarnings("ignore", category=w)

    # Only use a subset of the data for testing purposes
    embeddings = embeddings[:test]
    annotations = annotations[:test]

    # performance = cross_validation(
    #     embeddings,
    #     annotations,
    #     n_trials=n_trials,
    #     n_jobs=n_jobs,
    #     n_threads=n_threads,
    #     random_state=random_state,
    #     clf_type=clf_type,
    #     max_depth=max_depth[level])

    performance = cross_validation(embeddings, annotations, n_trials=n_trials)
    performance['my_level'] = level
    pprint(performance)

    fout = f'{model_name}_{level}_{clf_type}_performance.json'
    with open(os.path.join(results_path, fout), 'w') as f:
        json.dump(performance, f)
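# `load_embeddings` is not shown above; since the embeddings file is a .mat,
# a minimal sketch might look like the following. The 'embeddings' key is an
# assumption, not the key used by the original code. (`minmax_scale` in the
# caller then rescales each embedding dimension to [0, 1].)
def load_embeddings(path):
    '''Read a node-embedding matrix from a .mat file.'''
    mat = sio.loadmat(path)
    return mat['embeddings']  # hypothetical key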
def main():
    # Prepare filesystem
    directory_exists(models_path)
    mkdir(results_path)

    # Load embeddings
    embeddings_file = glob.glob(os.path.join(models_path, '*.mat'))[0]
    model_name = os.path.splitext(
        os.path.basename(embeddings_file))[0].replace('_embeddings', '')
    stdout('Loading embeddings', embeddings_file)
    embeddings = load_embeddings(embeddings_file).astype('int32')

    # Load annotations
    annotation_dir = os.path.join(data_path, 'annotations')
    if validation == 'cerevisiae':
        annotation_file = os.path.join(
            annotation_dir, 'cerevisiae_annotations.mat')
    else:
        annotation_file = os.path.join(annotation_dir, 'yeast_annotations.mat')
    stdout('Loading GO annotations', annotation_file)
    annotation_file = sio.loadmat(annotation_file)

    # Train classifier
    stdout('Running cross-validation for', level)

    if validation == 'cv':
        if level in ('P', 'F', 'C'):
            annotations = np.hstack(
                [annotation_file[f'{level}_{i}'] for i in range(1, 4)])
        else:
            annotations = annotation_file[level]
    elif validation == 'cerevisiae':
        if level == 'all':
            annotations = np.hstack(
                [annotation_file[f'level{i}'] for i in range(1, 4)])
        else:
            annotations = annotation_file[level]

    annotations = annotations.astype('int32')

    # Silence certain warning messages during cross-validation
    for w in (sklearn.exceptions.UndefinedMetricWarning, UserWarning,
              RuntimeWarning):
        warnings.filterwarnings("ignore", category=w)

    # Remove genes with no annotations
    x = embeddings
    y = annotations
    del_rid = np.where(y.sum(axis=1) == 0)[0]
    x = np.delete(x, del_rid, axis=0)
    y = np.delete(y, del_rid, axis=0)

    # Set up CV
    performance_metrics = ('accuracy', 'm_AUPR', 'M_AUPR', 'f1')
    performance_repeats = defaultdict(dict)

    for repeat in range(1, repeats + 1):
        performance_repeats[f'repeat_{repeat}'] = defaultdict(dict)
        performance = performance_repeats[f'repeat_{repeat}']

        trials = ShuffleSplit(n_splits=n_trials, test_size=0.2,
                              random_state=random_state)
        iteration = 0

        # CV-folds
        for train_idx, test_idx in trials.split(x):
            iteration += 1
            x_train = x[train_idx]
            x_test = x[test_idx]
            y_train = y[train_idx]
            y_test = y[test_idx]

            # Define the MLP architecture
            model = MLP(x_train, y_train)
            model.compile('adam', 'binary_crossentropy', ['acc'])

            # Train the model
            callbacks = [EarlyStopping(min_delta=0., patience=20),
                         ModelCheckpoint('best_model.h5', save_best_only=True)]
            history = model.fit(x_train, y_train, batch_size=batch_size,
                                epochs=200, validation_split=0.2,
                                shuffle=True, callbacks=callbacks, verbose=2)

            performance['history'][iteration] = {}
            for tm in history.history:
                performance['history'][iteration][tm] = history.history[tm]

            # Load the best model from file (defined as the model that
            # minimizes the validation loss)
            model = load_model('best_model.h5')

            # Predict annotations
            y_score = model.predict(x_test)
            y_pred = y_score.copy()
            positive_threshold = .5
            y_pred[y_pred < positive_threshold] = 0
            y_pred[y_pred > 0] = 1
            performance_trial = _Performance(y_test, y_score, y_pred)

            for pm in performance_metrics:
                performance[pm][iteration] = getattr(performance_trial, pm)
                calculate_mean_std(performance, pm)

            dummy = DummyClassifier().fit(x_train, y_train).score(x_test, y_test)
            performance['dummy'][iteration] = dummy

    performance['level'] = level
    pprint(performance)

    # Save results and training history
    fout = f'{model_name}_{level}_{clf_type}'
    with open(os.path.join(results_path, f'{fout}.json'), 'w') as f:
        json.dump(performance_repeats, f)

    # Delete the best model file
    os.remove('best_model.h5')

    return None
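# `MLP` is called above as a factory that sizes the network from the training
# arrays and returns an uncompiled Keras model. A minimal sketch under that
# assumption; the hidden-layer width (500) and ReLU activation are assumptions,
# not taken from the original code.
from keras.models import Sequential
from keras.layers import Dense

def MLP(x_train, y_train):
    '''Build a one-hidden-layer multi-label classifier.'''
    model = Sequential([
        Dense(500, activation='relu', input_shape=(x_train.shape[1],)),
        Dense(y_train.shape[1], activation='sigmoid'),  # one unit per GO term
    ])
    return model
# The sigmoid output with binary cross-entropy treats each GO term as an
# independent binary label, which matches the multi-label setup above.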
    default='$AGAPEDATA/deepNF/networks',
    type=str)
parser.add_argument('-o', '--output-path',
                    default='$AGAPEDATA/deepNF/networks',
                    type=str)
parser.add_argument('--K', default=3, type=int)
parser.add_argument('--alpha', default=.98, type=float)
parser.add_argument('--genes', default=5100, type=int)
args = parser.parse_args()

######
# io #
######

input_path = directory_exists(args.input_path)
output_path = directory_exists(args.output_path)

########
# defs #
########

def _load_network(filename, mtrx='adj'):
    print(f"Loading {filename}")
    # Edge lists are whitespace-separated triples: node_i, node_j, weight
    i, j, val = np.loadtxt(filename).T
    # np.loadtxt returns floats; sparse-matrix indices must be integers
    i, j = i.astype(int), j.astype(int)
    if 'fypo' in filename:
        A = coo_matrix((val, (i, j)), shape=(args.genes, max(j) + 1))
    else:
        A = coo_matrix((val, (i, j)), shape=(args.genes, args.genes))
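# The `--K` and `--alpha` flags suggest these adjacency matrices feed a
# random-walk-with-restart (RWR) step, as in the deepNF pipeline. A minimal
# sketch of that step under that assumption; `_rwr` is hypothetical, expects
# a dense numpy array, and takes alpha as the walk-continuation probability.
def _rwr(A, K=3, alpha=0.98):
    '''Sum the first K random-walk-with-restart transition matrices of A.'''
    n = A.shape[0]
    degrees = A.sum(axis=1, keepdims=True)
    degrees[degrees == 0] = 1      # avoid division by zero for isolated nodes
    P = A / degrees                # row-stochastic transition matrix
    M = np.zeros((n, n))
    P_k = np.eye(n)
    for _ in range(K):
        P_k = alpha * (P_k @ P) + (1 - alpha) * np.eye(n)
        M += P_k
    return M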
##########################

parser = argparse.ArgumentParser()
parser.add_argument('-o', '--output-path',
                    default='$CEREVISIAEDATA/deepNF',
                    type=str)
args = parser.parse_args()

######
# io #
######

data = os.environ["CEREVISIAEDATA"]

if directory_exists(args.output_path):
    output_path = os.path.expandvars(args.output_path)

########
# defs #
########

class STRING:
    """Load the S. cerevisiae STRING database."""

    def __init__(self):
        # f = "4932.protein.links.detailed.v10.5.txt"
        f = "4932.protein.links.detailed.v9.1.txt"
        self.df = pd.read_csv(os.path.join(data, f), sep=" ")
        self.interaction_types = ('neighborhood', 'fusion', 'cooccurence',
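# A hedged usage sketch of how per-channel networks could be pulled out of the
# STRING dataframe loaded above: each evidence channel is a column of the
# detailed-links file. `string_network` is hypothetical, not a function from
# the original code; the protein1/protein2 column names follow the STRING
# file format.
def string_network(df, interaction_type):
    '''Return (protein1, protein2, score) edges for one evidence channel.'''
    edges = df[['protein1', 'protein2', interaction_type]]
    return edges[edges[interaction_type] > 0]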
def test_directory_exists(self):
    with TemporaryDirectory() as d:
        assert directory_exists(d) == d
def test_raises_FileNotFoundError(self):
    with TemporaryDirectory() as d:
        with raises(FileNotFoundError):
            directory_exists(d + "NOTAPATH")
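# The two tests above pin down the contract of `directory_exists`: return the
# path unchanged when it exists, raise FileNotFoundError otherwise. A minimal
# sketch satisfying both; the internal expandvars call is an assumption,
# suggested by the '$VAR'-style paths passed to it elsewhere in the code.
def directory_exists(path):
    '''Return `path` if it is an existing directory, else raise.'''
    if not os.path.isdir(os.path.expandvars(path)):
        raise FileNotFoundError(path)
    return path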