示例#1
0
def group_go_by_ontology(data, dag, fill_na, accession_to_feature_file):
    """Group GO terms by ontology namespace.

    Args:
        data: Either a single accession string, in which case its GO terms
            are fetched with ``get_feature_for_accession(feature_df, data,
            'uniprot', 'go')`` from the data frame loaded from
            ``accession_to_feature_file``; or an iterable of term strings,
            in which case only entries containing ``'go'``
            (case-insensitive) are kept, upper-cased.
        dag: GO DAG passed to ``ontology.id_to_node`` to resolve each term.
        fill_na: Forwarded to ``load_data_frame``; only used when ``data``
            is a string.
        accession_to_feature_file: Forwarded to ``load_data_frame``; only
            used when ``data`` is a string.

    Returns:
        A dict with keys ``'cc'``, ``'bp'`` and ``'mf'``, each mapping to a
        list of the terms whose node namespace is ``cellular_component``,
        ``biological_process`` or ``molecular_function`` respectively, in
        input order. Terms in any other namespace are dropped.
    """
    if isinstance(data, str):
        feature_df = load_data_frame(accession_to_feature_file, fill_na=fill_na)
        p_go = get_feature_for_accession(feature_df, data, 'uniprot', 'go')
    else:
        p_go = [t.upper() for t in data if 'go' in t.lower()]

    key_for_namespace = {
        'cellular_component': 'cc',
        'biological_process': 'bp',
        'molecular_function': 'mf',
    }

    # Build concrete lists in a single pass. Lazy filter() objects would be
    # exhausted by any sanity check before the caller could read them, and
    # assigning each term to exactly one bucket makes the groups disjoint
    # by construction, so no separate overlap assertion is needed.
    grouped = {'cc': [], 'bp': [], 'mf': []}
    for term in p_go:
        key = key_for_namespace.get(ontology.id_to_node(term, dag).namespace)
        if key is not None:
            grouped[key].append(term)
    return grouped
示例#2
0
def _split_go_by_namespace(go_terms, dag):
    """Return ``(cc, bp, mf)``: sorted unique GO terms for each namespace."""
    buckets = {
        'cellular_component': set(),
        'biological_process': set(),
        'molecular_function': set(),
    }
    for term in go_terms:
        bucket = buckets.get(ontology.id_to_node(term, dag).namespace)
        if bucket is not None:
            bucket.add(term)
    # Sorting makes the content of the file handed to R reproducible
    # between runs (set iteration order is not).
    return (
        sorted(buckets['cellular_component']),
        sorted(buckets['biological_process']),
        sorted(buckets['molecular_function']),
    )


def compute_ss(ppi_tuples):
    """Run ``semantic_sim.r`` over protein pairs and collect its output.

    For every ``(p1, p2)`` pair the GO terms of both accessions are looked
    up in the feature data frame, split by namespace, and written as one
    tab-separated row to a temporary input file. ``Rscript semantic_sim.r``
    is then run with ``--file``/``--out`` pointing at temporary files in
    the ``tmp`` directory, and the output file is parsed.

    Args:
        ppi_tuples: Iterable of ``(p1, p2)`` accession pairs.

    Returns:
        A list of ``(p1, p2, cc_ss, bp_ss, mf_ss)`` tuples, one per line of
        the R output; every field is the raw string read from that file.

    Raises:
        subprocess.CalledProcessError: If the R script exits non-zero.
        ValueError: If an output line does not have exactly five
            tab-separated fields.
    """
    dag = ontology.load_go_dag(OBO_FILE)
    # np.nan is available in every NumPy release; the np.NaN alias was
    # removed in NumPy 2.0.
    feature_df = load_data_frame(ACCESSION_FEATURES_FILE, fill_na=np.nan)

    temp_paths = []
    try:
        # mkstemp creates the files atomically, avoiding the name race of
        # the deprecated mktemp. Only the paths are needed here, so the
        # descriptors are closed straight away.
        for prefix in ('r_in_', 'r_out_'):
            fd, path = tempfile.mkstemp(suffix='.tsv', prefix=prefix, dir='tmp')
            os.close(fd)
            temp_paths.append(path)
        r_file_in, r_file_out = temp_paths

        # Write the three separate GO columns for each protein to the R input.
        with open(r_file_in, 'w') as fp:
            fp.write("p1\tp2\tp1_go_cc\tp2_go_cc\tp1_go_bp\tp2_go_bp\tp1_go_mf\tp2_go_mf\n")
            for p1, p2 in ppi_tuples:
                p1_go = get_feature_for_accession(feature_df, p1, 'uniprot', 'go')
                p2_go = get_feature_for_accession(feature_df, p2, 'uniprot', 'go')
                p1_go_cc, p1_go_bp, p1_go_mf = _split_go_by_namespace(p1_go, dag)
                p2_go_cc, p2_go_bp, p2_go_mf = _split_go_by_namespace(p2_go, dag)

                fp.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\n".format(
                    p1, p2,
                    ','.join(p1_go_cc), ','.join(p2_go_cc),
                    ','.join(p1_go_bp), ','.join(p2_go_bp),
                    ','.join(p1_go_mf), ','.join(p2_go_mf)
                ))

        # Run the R script. The output file already exists because of
        # mkstemp, so a failed run must be detected through the exit
        # status rather than a missing file.
        args = [
            'Rscript',
            'semantic_sim.r',
            '--file={}'.format(r_file_in),
            '--out={}'.format(r_file_out)
        ]
        subprocess.check_call(args)

        # Parse the R output into a list of 5-tuples.
        sims_tuple = []
        with open(r_file_out, 'r') as fp:
            for line in fp:
                p1, p2, cc_ss, bp_ss, mf_ss = line.strip().split('\t')
                sims_tuple.append((p1, p2, cc_ss, bp_ss, mf_ss))
    finally:
        # Remove the temporary files on failure as well as on success.
        for path in temp_paths:
            if os.path.exists(path):
                os.remove(path)

    return sims_tuple