def make_kg2_graph(input_file_name: str, test_mode: bool = False):
    """Build a graph of gene nodes and their edges from an Ensembl JSON file.

    The input is loaded with ``kg2_util.load_json`` and must provide a
    ``'genebuild'`` string containing a ``'/'`` and a ``'genes'`` list of
    per-gene dicts. The returned graph starts with one node describing the
    Ensembl source itself, followed by one node per gene. For every gene an
    in-taxon edge to its NCBI taxon is emitted, plus one same-as edge for
    each entry of the gene's optional ``'HGNC'`` list.

    Args:
        input_file_name: Path of the Ensembl JSON file to load.
        test_mode: When true, stop after the first 10000 genes.

    Returns:
        A dict with two keys, ``'nodes'`` and ``'edges'``, each holding a
        list.

    Raises:
        AssertionError: If a gene's ``'taxon_id'`` is missing or is not 9606.
        KeyError: If a required key (``'genebuild'``, ``'genes'``, a gene's
            ``'id'``, or an xref's ``'primary_id'``) is absent.
    """
    ensembl_data = kg2_util.load_json(input_file_name)
    # The second '/'-separated field of the genebuild string is used as the
    # update date on every node and edge created below.
    update_date = ensembl_data['genebuild'].split('/')[1]

    # The graph begins with a node that represents the Ensembl source.
    nodes = [
        kg2_util.make_node(ENSEMBL_KB_CURIE_ID, ENSEMBL_KB_URI,
                           'Ensembl Genes',
                           kg2_util.BIOLINK_CATEGORY_DATA_FILE,
                           update_date, ENSEMBL_KB_CURIE_ID)
    ]
    edges = []

    for gene_ctr, gene_dict in enumerate(ensembl_data['genes'], start=1):
        if test_mode and gene_ctr > 10000:
            break
        ensembl_gene_id = gene_dict['id']
        description = gene_dict.get('description')
        gene_symbol = gene_dict.get('name')

        # Synonyms are the distinct xref primary IDs, excluding the gene's
        # own Ensembl ID.
        xrefs = gene_dict.get('xrefs')
        other_synonyms = []
        if xrefs is not None:
            other_synonyms = list({
                xref['primary_id'] for xref in xrefs
                if xref['primary_id'] != ensembl_gene_id
            })

        node_dict = make_node(ensembl_gene_id, description, gene_symbol,
                              update_date, other_synonyms)
        nodes.append(node_dict)
        ensembl_gene_curie_id = node_dict['id']

        taxon_id_int = gene_dict.get('taxon_id')
        # Raised explicitly (rather than via an `assert` statement) so the
        # check still runs under `python -O`; otherwise a missing taxon ID
        # would end up in an edge as 'None'.
        if taxon_id_int != 9606:
            raise AssertionError("unexpected taxon ID")
        edges.append(
            kg2_util.make_edge_biolink(
                ensembl_gene_curie_id,
                kg2_util.CURIE_PREFIX_NCBI_TAXON + ':' + str(taxon_id_int),
                kg2_util.EDGE_LABEL_BIOLINK_IN_TAXON, ENSEMBL_KB_CURIE_ID,
                update_date))

        hgnc_list = gene_dict.get('HGNC')
        if hgnc_list is not None:
            for hgnc_curie in hgnc_list:
                edges.append(
                    kg2_util.make_edge(ensembl_gene_curie_id, hgnc_curie,
                                       kg2_util.CURIE_ID_OWL_SAME_AS,
                                       kg2_util.EDGE_LABEL_OWL_SAME_AS,
                                       ENSEMBL_KB_CURIE_ID, update_date))
    return {'nodes': nodes, 'edges': edges}
# Example #2
# 0
def make_kg2_graph(input_file_name: str, test_mode: bool = False):
    """Build a graph of gene nodes and their edges from an Ensembl JSON file.

    The input is loaded with ``kg2_util.load_json`` and must provide a
    ``'genebuild'`` string containing a ``'/'`` and a ``'genes'`` list of
    per-gene dicts. One node is created per gene. For every gene a
    ``'gene_found_in_organism'`` edge to its NCBI taxon is emitted, plus one
    ``'xref'`` edge for each entry of the gene's optional ``'HGNC'`` list.

    Args:
        input_file_name: Path of the Ensembl JSON file to load.
        test_mode: When true, stop after the first 10000 genes.

    Returns:
        A dict with two keys, ``'nodes'`` and ``'edges'``, each holding a
        list.

    Raises:
        AssertionError: If a gene's ``'taxon_id'`` is missing or is not 9606.
        KeyError: If a required key (``'genebuild'``, ``'genes'``, a gene's
            ``'id'``, or an xref's ``'primary_id'``) is absent.
    """
    ensembl_data = kg2_util.load_json(input_file_name)
    nodes = []
    edges = []
    # The second '/'-separated field of the genebuild string is used as the
    # update date on every node and edge created below.
    update_date = ensembl_data['genebuild'].split('/')[1]

    for gene_ctr, gene_dict in enumerate(ensembl_data['genes'], start=1):
        if test_mode and gene_ctr > 10000:
            break
        ensembl_gene_id = gene_dict['id']
        description = gene_dict.get('description')
        gene_symbol = gene_dict.get('name')

        # Synonyms are the distinct xref primary IDs, excluding the gene's
        # own Ensembl ID.
        xrefs = gene_dict.get('xrefs')
        other_synonyms = []
        if xrefs is not None:
            other_synonyms = list({
                xref['primary_id'] for xref in xrefs
                if xref['primary_id'] != ensembl_gene_id
            })

        node_dict = make_node(ensembl_gene_id,
                              description,
                              gene_symbol,
                              update_date,
                              other_synonyms)
        nodes.append(node_dict)
        ensembl_gene_curie_id = node_dict['id']

        taxon_id_int = gene_dict.get('taxon_id')
        # Raised explicitly (rather than via an `assert` statement) so the
        # check still runs under `python -O`; otherwise a missing taxon ID
        # would end up in an edge as 'NCBITaxon:None'.
        if taxon_id_int != 9606:
            raise AssertionError("unexpected taxon ID")
        edges.append(kg2_util.make_edge(ensembl_gene_curie_id,
                                        'NCBITaxon:' + str(taxon_id_int),
                                        'gene_found_in_organism',
                                        ENSEMBL_KB_IRI,
                                        update_date))

        hgnc_list = gene_dict.get('HGNC')
        if hgnc_list is not None:
            for hgnc_curie in hgnc_list:
                edges.append(kg2_util.make_edge(ensembl_gene_curie_id,
                                                hgnc_curie,
                                                'xref',
                                                ENSEMBL_KB_IRI,
                                                update_date))
    return {'nodes': nodes,
            'edges': edges}
# Example #3
# 0
import argparse
import kg2_util


def make_arg_parser():
    """Create the command-line parser for sample_subgraph.py.

    The parser accepts an optional ``--test`` switch (stored as ``test``,
    false unless given) and two required positionals, ``inputFile`` and
    ``outputFile``. Both positionals use ``nargs=1``, so each is stored on
    the parsed namespace as a one-element list of strings.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    summary = 'sample_subgraph.py: sample a smaller subgraph of a KG in JSON format'
    parser = argparse.ArgumentParser(description=summary)
    parser.add_argument('--test', dest='test', action="store_true",
                        default=False)
    # Both file positionals share the same declaration.
    for positional_name in ('inputFile', 'outputFile'):
        parser.add_argument(positional_name, type=str, nargs=1)
    return parser


if __name__ == '__main__':
    # Script entry point: load a graph from the input JSON file, keep every
    # fifth node (indices 0, 5, 10, ...) together with the edges whose
    # subject and object both survive, and save the result to the output
    # file.
    args = make_arg_parser().parse_args()
    # The positionals are declared with nargs=1, so argparse stores each as
    # a one-element list; unwrap them to obtain the actual file names.
    input_file_name = args.inputFile[0]
    output_file_name = args.outputFile[0]
    graph = kg2_util.load_json(input_file_name)
    nodes = graph['nodes'][::5]
    nodes_id_set = {node['id'] for node in nodes}
    edges = [
        edge for edge in graph['edges']
        if edge['subject'] in nodes_id_set and edge['object'] in nodes_id_set
    ]
    kg2_util.save_json({'nodes': nodes, 'edges': edges}, output_file_name)