Пример #1
0
import sys
import predict.preprocess as prep


if __name__ == "__main__":
    # Command-line entry point: reads five positional arguments, loads the
    # label list and the training frame, then opens the predictions file.
    #
    # sys.argv[5] (the symbol type) is read below, so argv must hold at least
    # six entries (script name + five arguments). A `< 5` guard would let a
    # four-argument call through and fail later with IndexError.
    if len(sys.argv) < 6:
        print("Usage: python <Input predictions file> <Output results file> <Protein/Gene list> <Probability threshold> <Symbol type>")
        sys.exit(0)

    interactome_file = sys.argv[1]
    out_file = sys.argv[2]
    protein_list = sys.argv[3]
    # float() raises ValueError when the threshold argument is not numeric.
    threshold = float(sys.argv[4])
    symbol_type = sys.argv[5]
    labels = sorted(prep.get_labels_from_file('data/labels.tsv'))
    # Only the first frame returned by prep_data_frames is used here.
    train, _ = prep.prep_data_frames(selection=[])
    uniprots = list(train['uniprot'].values)
    use_protein = True

    # The handle is deliberately not wrapped in `with`: it stays open after
    # this block, presumably for the code that follows to read from it.
    try:
        interactome_fp = open(interactome_file, 'r')
    except IOError:
        print("Could not open supplied file {}.".format(interactome_file))
        sys.exit(0)

    try:
        protein_fp = open(protein_list, 'r')
        proteins = set()
        for line in protein_fp:
            xs = line.strip().upper()
            proteins.add(xs)
Пример #2
0
        'binary': binary,
        'balanced': balanced,
        'induce': induce,
        'iteration': iterations,
        'cv_folds': cv_folds,
        'selection': selection,
        'ontologies': ontologies,
        'vectorizer_method': vectorizer_method,
        'permuted': permute,
        'scale': scale
    }
    pretty_print_dict(config)

    # ----------------------------- LOAD DATA ----------------------------------- #
    # Seed NumPy's global RNG before the data frames are prepared.
    np.random.seed(42)
    developement_df, testing_df = prep.prep_data_frames(
        selection, load_interactome=False
    )
    labels = get_labels_from_file('data/labels.tsv')
    n = len(labels)

    # Per-label sum of that label's column in the development / testing frame.
    split_train = {name: sum(developement_df[name].values) for name in labels}
    split_test = {name: sum(testing_df[name].values) for name in labels}

    # Row counts of the two frames.
    n_samples_train, n_samples_test = len(developement_df), len(testing_df)

    # Create the appropriate statistics container for the whole experiment.
Пример #3
0
    print("Loading datasets...")
    # Pickle data is a byte stream, so the files are opened in binary mode
    # ('rb'); text mode ('r') makes pickle.load fail under Python 3. The
    # `with` blocks close each handle as soon as its frame has been loaded.
    # NOTE(review): unpickling can execute arbitrary code — only load files
    # produced by this project.
    with open("tmp/train_df.pkl", 'rb') as fp:
        df_kegg = pickle.load(fp)
    with open("tmp/test_df.pkl", 'rb') as fp:
        df_hprd = pickle.load(fp)
    with open("tmp/interactome_df.pkl", 'rb') as fp:
        df_test = pickle.load(fp)
    selection = ['ipr', 'pfam', 'induced_go_cc', 'induced_go_mf', 'induced_go_bp']
    labels = prep.get_labels_from_file("data/labels.tsv")

    def get_selection(row):
        """Merge the selected feature columns of *row* into one term string.

        The value of every column named in the enclosing ``selection`` list
        is joined with commas, the result is split on commas again, and
        pieces that are empty or whitespace-only are dropped. Surviving
        pieces are kept exactly as they are (not stripped) and returned as a
        single comma-separated string.
        """
        combined = ','.join(row[col] for col in selection)
        kept = [piece for piece in combined.split(',') if piece.strip()]
        return ','.join(kept)

    # Add a 'terms' column to the interactome frame: one comma-separated
    # string per row, built by get_selection from the columns in `selection`.
    df_test['terms'] = df_test.apply(get_selection, axis=1)
    # NOTE(review): this rebinds df_hprd, discarding the frame unpickled from
    # tmp/test_df.pkl above; df_kegg is likewise not used in this section —
    # confirm the pickle loads are still needed.
    df_train, df_hprd = prep.prep_data_frames(selection)

    # Fit a non-binary, lower-casing vectorizer on the training 'terms'
    # strings; the tokens 'go', '' and ' ' are passed as stop words.
    vectorizer = CountVectorizer(stop_words=['go', '', ' '], binary=False, lowercase=True)
    vectorizer.fit(df_train['terms'].values)

    print("Transforming features...")
    # Build the training matrix from the 'terms' column plus the 'sim'
    # column, reading targets from 'label'.
    # NOTE(review): alpha=None / percentile=100 presumably disables feature
    # filtering — verify against prep.select_features.
    x_train, y_train, feature_names, selector = prep.select_features(
        df = df_train,
        vectorizer=vectorizer,
        feature_col='terms',
        label_col='label',
        continuous_col=['sim'],
        alpha=None,
        percentile=100
    )
    # Run every training target through prep.binarise_labels (given the full
    # label list) and collect the results in a NumPy array.
    y_train = np.asarray([prep.binarise_labels(y, labels) for y in y_train])
Пример #4
0
        'vectorizer_method': 'NA'
    }
    pretty_print_dict(config)

    # Start from clean model and result directories: remove whatever a
    # previous run left behind, then create each directory again.
    for stale_dir, notice in (
        ('llda/models/', "Deleting previous models..."),
        ('llda/results/', "Deleting previous results..."),
    ):
        if os.path.exists(stale_dir):
            print(notice)
            shutil.rmtree(stale_dir)
        su_make_dir(stale_dir)

    # ----------------------------- LOAD DATA ----------------------------------- #
    train, test = prep.prep_data_frames(selection, load_interactome=False)
    labels = get_labels_from_file('data/labels.tsv')
    n = len(labels)

    # Per-label sum of that label's column in the train / test frame.
    split_train = {name: sum(train[name].values) for name in labels}
    split_test = {name: sum(test[name].values) for name in labels}

    # Create the appropriate statistics container for the whole experiment.
    validation_stats = Statistics()
    testing_stats = Statistics()
    seeds = create_seeds(iterations)
Пример #5
0
            ontologies.append('cc')
        if bp:
            selection.append(go_sel + '_bp')
            ontologies.append('bp')
        if mf:
            selection.append(go_sel + '_mf')
            ontologies.append('mf')

    # Nothing to train on without at least one selected feature column.
    if not selection:
        print("Please select some features using the command line args. Use --help or -h for help.")
        sys.exit(1)
    print(selection)


    # ---------------------- THRESHOLD TESTING ---------------------------- #
    developement_df, _ = prep_data_frames(selection, load_interactome=False)
    thresholds = np.arange(0, 1.1, step=0.1)
    # Materialise the cross-validation splits once so they can be reused.
    splitter = DataFrameStratifiedKFold(
        n_splits=cv_folds, shuffle=True, random_state=None
    )
    folds = list(
        splitter.split(developement_df, y=developement_df['label'].values)
    )
    statistics = Statistics()
    params = sk_generate_params('lr', columns=None)
    labels = get_labels_from_file('data/labels.tsv')
    # One seed per label.
    seeds = create_seeds(len(labels))

    things = {}
    def pr_curve(i):
        label = labels[i]
        statistics_l = Statistics()
        print('Doing label {}'.format(label))
Пример #6
0
        label_col=y,
        continuous_cols=None
    )
    return estimator.predict_proba(x_numpy)


if __name__ == "__main__":
    method = sys.argv[1]
    outfile = sys.argv[2]
    columns = ['go_cc', 'go_mf', 'go_bp']
    labels = get_labels_from_file('data/labels.tsv')
    uniprot = UniProt(sprot_cache=SPROT_FILE, trembl_cache=TREMBL_FILE)
    dag = load_go_dag('data/gene_ontology.1_2.obo')

    # Load the training data.
    train, test, interactome_df = prep_data_frames(selection=columns, load_interactome=True)
    training_df = pd.concat([train, test], ignore_index=True)

    training_corpus_pf = compute_corpus(training_df, ['pfam'])
    training_corpus_ipr = compute_corpus(training_df, ['ipr'])
    training_corpus_bp = compute_corpus(training_df, ['induced_go_bp'])
    training_corpus_cc = compute_corpus(training_df, ['induced_go_cc'])
    training_corpus_mf = compute_corpus(training_df, ['induced_go_mf'])

    pina_corpus_pf = compute_corpus(interactome_df, ['pfam'])
    pina_corpus_ipr = compute_corpus(interactome_df, ['ipr'])
    pina_corpus_bp = compute_corpus(interactome_df, ['induced_go_bp'])
    pina_corpus_cc = compute_corpus(interactome_df, ['induced_go_cc'])
    pina_corpus_mf = compute_corpus(interactome_df, ['induced_go_mf'])

    mean, std = depths(interactome_df, 'terms')