import sys

import predict.preprocess as prep

if __name__ == "__main__":
    # Five arguments are required (argv[5], the symbol type, is read below).
    if len(sys.argv) < 6:
        print("Usage: python <script> <Input predictions file> "
              "<Output results file> <Protein/Gene list> "
              "<Probability threshold> <Symbol type>")
        sys.exit(1)

    interactome_file = sys.argv[1]
    out_file = sys.argv[2]
    protein_list = sys.argv[3]
    threshold = float(sys.argv[4])
    symbol_type = sys.argv[5]

    labels = sorted(prep.get_labels_from_file('data/labels.tsv'))
    train, _ = prep.prep_data_frames(selection=[])
    uniprots = list(train['uniprot'].values)
    use_protein = True

    try:
        interactome_fp = open(interactome_file, 'r')
    except IOError:
        print("Could not open supplied file {}.".format(interactome_file))
        sys.exit(1)

    try:
        protein_fp = open(protein_list, 'r')
        proteins = set()
        for line in protein_fp:
            xs = line.strip().upper()
            proteins.add(xs)
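    # ----------------------------------------------------------------------- #
    # Illustrative sketch, not part of the original script: one way the
    # filtering step could proceed once `proteins` is loaded. The predictions
    # file layout (tab-separated, two interactor columns followed by one
    # probability column per label) is an assumption for illustration only.
    def filter_predictions(interactome_fp, proteins, labels, threshold):
        """Yield rows where either interactor is in `proteins` and at least
        one label probability meets `threshold`."""
        header = interactome_fp.readline().strip().split('\t')
        label_idx = [header.index(l) for l in labels]  # assumed header layout
        for line in interactome_fp:
            xs = line.strip().split('\t')
            p1, p2 = xs[0].upper(), xs[1].upper()
            if p1 in proteins or p2 in proteins:
                if any(float(xs[i]) >= threshold for i in label_idx):
                    yield xs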
        'binary': binary,
        'balanced': balanced,
        'induce': induce,
        'iteration': iterations,
        'cv_folds': cv_folds,
        'selection': selection,
        'ontologies': ontologies,
        'vectorizer_method': vectorizer_method,
        'permuted': permute,
        'scale': scale
    }
    pretty_print_dict(config)

    # ----------------------------- LOAD DATA ----------------------------------- #
    np.random.seed(42)
    developement_df, testing_df = prep.prep_data_frames(selection, load_interactome=False)
    labels = get_labels_from_file('data/labels.tsv')
    n = len(labels)

    # Positive example counts per label in each split.
    split_train = {l: sum(developement_df[l].values) for l in labels}
    split_test = {l: sum(testing_df[l].values) for l in labels}

    n_samples_train = len(developement_df)
    n_samples_test = len(testing_df)

    # Create the appropriate statistics container for the whole experiment.
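    # ----------------------------------------------------------------------- #
    # Illustrative sketches (assumptions, not the project's definitions):
    # pretty_print_dict is called above but defined elsewhere; a minimal
    # version, plus a quick report of the label splits computed above.
    def pretty_print_dict(d):
        for k in sorted(d):
            print("{}: {}".format(k, d[k]))

    def print_label_splits(labels, split_train, split_test):
        print("{:<24}{:>8}{:>8}".format("label", "train", "test"))
        for l in labels:
            print("{:<24}{:>8}{:>8}".format(l, split_train[l], split_test[l]))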
print("Loading datasets...") df_kegg = pickle.load(open("tmp/train_df.pkl", 'r')) df_hprd = pickle.load(open("tmp/test_df.pkl", 'r')) df_test = pickle.load(open("tmp/interactome_df.pkl", 'r')) selection = ['ipr', 'pfam', 'induced_go_cc', 'induced_go_mf', 'induced_go_bp'] labels = prep.get_labels_from_file("data/labels.tsv") def get_selection(row): terms = [] for col in selection: terms += [row[col]] terms = [t for t in ','.join(terms).split(',') if t.strip() != ''] return ','.join(terms) df_test['terms'] = df_test.apply(get_selection, axis=1) df_train, df_hprd = prep.prep_data_frames(selection) vectorizer = CountVectorizer(stop_words=['go', '', ' '], binary=False, lowercase=True) vectorizer.fit(df_train['terms'].values) print("Transforming features...") x_train, y_train, feature_names, selector = prep.select_features( df = df_train, vectorizer=vectorizer, feature_col='terms', label_col='label', continuous_col=['sim'], alpha=None, percentile=100 ) y_train = np.asarray([prep.binarise_labels(y, labels) for y in y_train])
    'vectorizer_method': 'NA'
}
pretty_print_dict(config)

# Remove models and results left over from any previous run.
if os.path.exists('llda/models/'):
    print("Deleting previous models...")
    shutil.rmtree('llda/models/')
su_make_dir('llda/models/')

if os.path.exists('llda/results/'):
    print("Deleting previous results...")
    shutil.rmtree('llda/results/')
su_make_dir('llda/results/')

# ----------------------------- LOAD DATA ----------------------------------- #
train, test = prep.prep_data_frames(selection, load_interactome=False)
labels = get_labels_from_file('data/labels.tsv')
n = len(labels)

# Positive example counts per label in each split.
split_train = {l: sum(train[l].values) for l in labels}
split_test = {l: sum(test[l].values) for l in labels}

# Create the appropriate statistics container for the whole experiment.
validation_stats = Statistics()
testing_stats = Statistics()
seeds = create_seeds(iterations)
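# --------------------------------------------------------------------------- #
# Illustrative sketch (an assumption, not the project's definition):
# su_make_dir is called above but defined elsewhere; a minimal version
# that creates the directory only if it is missing would be:
import os

def su_make_dir(path, mode=0o777):
    """Create `path` (and parents) if it does not already exist."""
    if not os.path.exists(path):
        os.makedirs(path, mode)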
    ontologies.append('cc')
if bp:
    selection.append(go_sel + '_bp')
    ontologies.append('bp')
if mf:
    selection.append(go_sel + '_mf')
    ontologies.append('mf')

if len(selection) == 0:
    print("Please select some features using the command line args. "
          "Use --help or -h for help.")
    sys.exit(1)
print(selection)

# ---------------------- THRESHOLD TESTING ---------------------------- #
developement_df, _ = prep_data_frames(selection, load_interactome=False)
thresholds = np.arange(0, 1.1, step=0.1)
folds = list(DataFrameStratifiedKFold(
    n_splits=cv_folds, shuffle=True, random_state=None
).split(developement_df, y=developement_df['label'].values))

statistics = Statistics()
params = sk_generate_params('lr', columns=None)
labels = get_labels_from_file('data/labels.tsv')
seeds = create_seeds(len(labels))
things = {}

def pr_curve(i):
    label = labels[i]
    statistics_l = Statistics()
    print('Doing label {}'.format(label))
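# --------------------------------------------------------------------------- #
# Illustrative sketch (assumption): the kind of per-threshold scoring
# pr_curve presumably performs for each label -- binarise the predicted
# probabilities at every cut-off in `thresholds` and record precision and
# recall. The helper below is hypothetical, not the project's code.
from sklearn.metrics import precision_score, recall_score

def threshold_scores(y_true, y_proba, thresholds):
    """Return (threshold, precision, recall) triples for one label."""
    scores = []
    for t in thresholds:
        y_pred = (y_proba >= t).astype(int)
        p = precision_score(y_true, y_pred, zero_division=0)
        r = recall_score(y_true, y_pred, zero_division=0)
        scores.append((t, p, r))
    return scores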
        label_col=y,
        continuous_cols=None
    )
    return estimator.predict_proba(x_numpy)


if __name__ == "__main__":
    method = sys.argv[1]
    outfile = sys.argv[2]

    columns = ['go_cc', 'go_mf', 'go_bp']
    labels = get_labels_from_file('data/labels.tsv')
    uniprot = UniProt(sprot_cache=SPROT_FILE, trembl_cache=TREMBL_FILE)
    dag = load_go_dag('data/gene_ontology.1_2.obo')

    # Load the training data.
    train, test, interactome_df = prep_data_frames(selection=columns, load_interactome=True)
    training_df = pd.concat([train, test], ignore_index=True)

    # Per-feature-type corpora for the training data.
    training_corpus_pf = compute_corpus(training_df, ['pfam'])
    training_corpus_ipr = compute_corpus(training_df, ['ipr'])
    training_corpus_bp = compute_corpus(training_df, ['induced_go_bp'])
    training_corpus_cc = compute_corpus(training_df, ['induced_go_cc'])
    training_corpus_mf = compute_corpus(training_df, ['induced_go_mf'])

    # Per-feature-type corpora for the PINA interactome.
    pina_corpus_pf = compute_corpus(interactome_df, ['pfam'])
    pina_corpus_ipr = compute_corpus(interactome_df, ['ipr'])
    pina_corpus_bp = compute_corpus(interactome_df, ['induced_go_bp'])
    pina_corpus_cc = compute_corpus(interactome_df, ['induced_go_cc'])
    pina_corpus_mf = compute_corpus(interactome_df, ['induced_go_mf'])

    mean, std = depths(interactome_df, 'terms')
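    # ----------------------------------------------------------------------- #
    # Illustrative sketch (an assumption, not the project's definition):
    # compute_corpus is called above but defined elsewhere; a minimal version
    # collecting each row's comma-separated annotations from the given
    # columns into a list of term lists might look like this.
    def compute_corpus(df, columns):
        corpus = []
        for _, row in df.iterrows():
            terms = []
            for col in columns:
                terms += [t for t in str(row[col]).split(',') if t.strip()]
            corpus.append(terms)
        return corpus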