Exemplo n.º 1
0
    def run(self, data_file: Optional[str] = None):
        """Ingest the PharmGKB relationship table into node and edge TSV files.

        Unzips ``relationships.zip``, ``pharmgkb_genes.zip`` and
        ``pharmgkb_drugs.zip`` from ``self.input_base_dir`` into temporary
        directories, builds ``self.gene_id_map`` and ``self.drug_id_map`` from
        the extracted mapping files, then reads ``relationships.tsv`` line by
        line.  For every row whose two entity types together equal
        ``self.edge_of_interest`` a node is written for each entity and one
        edge is written for the row.

        Args:
            data_file: Currently unused by this method.

        Raises:
            PharmGKBFileError: If an expected TSV is missing after unzipping.
            PharmKGBInvalidNodeType: If a selected row contains an entity whose
                type is neither 'Gene' nor 'Chemical'.
        """
        rel_zip_file_name = os.path.join(self.input_base_dir,
                                         "relationships.zip")
        relationship_file_name = "relationships.tsv"
        gene_mapping_zip_file = os.path.join(self.input_base_dir,
                                             "pharmgkb_genes.zip")
        gene_mapping_file_name = "genes.tsv"
        drug_mapping_zip_file = os.path.join(self.input_base_dir,
                                             "pharmgkb_drugs.zip")
        drug_mapping_file_name = "drugs.tsv"

        # TemporaryDirectory deletes the extracted archives when the block
        # exits (normally or via an exception), so nothing is left behind in
        # the system temp location.
        # NOTE(review): assumes make_id_mapping_file() reads its file eagerly
        # and the returned maps do not need the file afterwards - confirm.
        with tempfile.TemporaryDirectory() as relationship_tempdir, \
                tempfile.TemporaryDirectory() as gene_id_tempdir, \
                tempfile.TemporaryDirectory() as drug_id_tempdir:

            # get relationship file (what we are ingesting here)
            relationship_file_path = os.path.join(relationship_tempdir,
                                                  relationship_file_name)
            unzip_to_tempdir(rel_zip_file_name, relationship_tempdir)
            if not os.path.exists(relationship_file_path):
                raise PharmGKBFileError(
                    "Can't find relationship file needed for ingest")

            # get mapping file for gene ids
            gene_mapping_file_path = os.path.join(gene_id_tempdir,
                                                  gene_mapping_file_name)
            unzip_to_tempdir(gene_mapping_zip_file, gene_id_tempdir)
            if not os.path.exists(gene_mapping_file_path):
                raise PharmGKBFileError(
                    "Can't find gene map file needed for ingest")
            self.gene_id_map = self.make_id_mapping_file(
                gene_mapping_file_path)

            # get mapping file for drug ids
            drug_mapping_file_path = os.path.join(drug_id_tempdir,
                                                  drug_mapping_file_name)
            unzip_to_tempdir(drug_mapping_zip_file, drug_id_tempdir)
            if not os.path.exists(drug_mapping_file_path):
                raise PharmGKBFileError(
                    "Can't find drug map file needed for ingest")
            self.drug_id_map = self.make_id_mapping_file(
                drug_mapping_file_path)

            #
            # read in and transform relationships.tsv
            #
            with open(relationship_file_path) as relationships, \
                    open(self.output_node_file, 'w') as node, \
                    open(self.output_edge_file, 'w') as edge:
                # write headers
                node.write("\t".join(self.node_header) + "\n")
                edge.write("\t".join(self.edge_header) + "\n")

                rel_header = parse_header(relationships.readline())
                for line in relationships:
                    line_data = self.parse_pharmgkb_line(line, rel_header)

                    # Order-insensitive match: a row is kept whichever side
                    # each of the wanted entity types appears on.
                    row_types = {
                        line_data['Entity1_type'], line_data['Entity2_type']
                    }
                    if set(self.edge_of_interest) != row_types:
                        continue

                    #
                    # Make a node for each of the two entities in the row
                    #
                    entities = [
                        (line_data['Entity1_id'], line_data['Entity1_name'],
                         line_data['Entity1_type']),
                        (line_data['Entity2_id'], line_data['Entity2_name'],
                         line_data['Entity2_type']),
                    ]
                    for entity_id, entity_name, entity_type in entities:
                        if entity_type == 'Gene':
                            self.make_pharmgkb_gene_node(
                                fh=node,
                                this_id=entity_id,
                                name=entity_name,
                                biolink_type=self.gene_node_type)
                        elif entity_type == 'Chemical':
                            self.make_pharmgkb_chemical_node(
                                fh=node,
                                chem_id=entity_id,
                                name=entity_name,
                                biolink_type=self.drug_node_type)
                        else:
                            raise PharmKGBInvalidNodeType(
                                "Node type isn't gene or chemical!")

                    #
                    # Make edge
                    #
                    self.make_pharmgkb_edge(fh=edge, line_data=line_data)
 def test_parse_drug_central_line(self, key, value):
     """Check one parsed field of the first data line read from ``self.dti_fh``.

     Reads the header line and then the next line from ``self.dti_fh``,
     parses that line with ``parse_drug_central_line`` and verifies that
     ``key`` is present in the result and maps to ``value``.
     """
     header = parse_header(self.dti_fh.readline())
     line = self.dti_fh.readline()
     parsed = parse_drug_central_line(line, header)
     # assertIn reports the missing key and the parsed dict on failure,
     # unlike assertTrue(key in parsed) which only says "False is not true".
     self.assertIn(key, parsed)
     self.assertEqual(value, parsed[key])
Exemplo n.º 3
0
    def run(self,
            data_file: Optional[str] = None,
            species: str = "Homo sapiens") -> None:
        """Transform the Drug Central interaction table into node/edge TSVs.

        Reads the gzipped, tab-separated interactions file from
        ``self.input_base_dir`` and writes ``self.output_node_file`` and
        ``self.output_edge_file``.  Each drug and each protein gets one node
        row the first time it is seen; every drug/protein pair on a retained
        line gets an edge row.

        Args:
            data_file: Name of the interactions file inside
                ``self.input_base_dir``; defaults to
                ``drug.target.interaction.tsv.gz``.
            species: Value the ``ORGANISM`` column must equal for a line to be
                ingested; lines with another or no ``ORGANISM`` are skipped.
        """
        if data_file is None:
            data_file = "drug.target.interaction.tsv.gz"
        interactions_file = os.path.join(self.input_base_dir, data_file)
        os.makedirs(self.output_dir, exist_ok=True)
        drug_node_type = "biolink:Drug"
        uniprot_curie_prefix = "UniProtKB:"
        drug_curie_prefix = "DrugCentral:"
        protein_node_type = "biolink:Protein"
        drug_protein_edge_label = "biolink:molecularly_interacts_with"
        drug_protein_edge_relation = "RO:0002436"  # molecularly interacts with
        self.edge_header = [
            'subject', 'edge_label', 'object', 'relation', 'provided_by',
            'comment', 'type'
        ]

        with open(self.output_node_file, 'w') as node, \
                open(self.output_edge_file, 'w') as edge, \
                gzip.open(interactions_file, 'rt') as interactions:

            node.write("\t".join(self.node_header) + "\n")
            edge.write("\t".join(self.edge_header) + "\n")

            header_items = parse_header(interactions.readline())

            # IDs already written to the node file, so that each node is
            # emitted only once however many lines mention it.
            seen_proteins: set = set()
            seen_drugs: set = set()

            for line in interactions:
                items_dict = parse_drug_central_line(line, header_items)

                if items_dict.get('ORGANISM') != species \
                        or 'ORGANISM' not in items_dict:
                    continue

                # get protein ID
                try:
                    protein_dict = items_dict_to_protein_data_dict(items_dict)
                except ItemInDictNotFound:
                    # lines with no ACCESSION entry only contain drug info,
                    # no target info - not ingesting these
                    continue
                except ValueError:
                    logging.error("Value error while parsing line")
                    continue

                # get drug ID
                drug_id = drug_curie_prefix + get_item_by_priority(
                    items_dict, ['STRUCT_ID'])

                # Write drug node
                if drug_id not in seen_drugs:
                    write_node_edge_item(
                        fh=node,
                        header=self.node_header,
                        data=[
                            drug_id,
                            items_dict['DRUG_NAME'],
                            drug_node_type,
                            '',  # TDL (not applicable for drugs)
                            self.source_name
                        ])
                    seen_drugs.add(drug_id)

                # Only the values are needed; each is unpacked as
                # (uniprot id, name, TDL).
                for uniprot_id, name, tdl in protein_dict.values():
                    protein_id = uniprot_curie_prefix + uniprot_id

                    if protein_id not in seen_proteins:
                        write_node_edge_item(fh=node,
                                             header=self.node_header,
                                             data=[
                                                 protein_id, name,
                                                 protein_node_type, tdl,
                                                 self.source_name
                                             ])
                        seen_proteins.add(protein_id)

                    # WRITE EDGES
                    write_node_edge_item(
                        fh=edge,
                        header=self.edge_header,
                        data=[
                            drug_id, drug_protein_edge_label, protein_id,
                            drug_protein_edge_relation, self.source_name,
                            items_dict['ACT_COMMENT'], 'biolink:Association'
                        ])
Exemplo n.º 4
0
    def run(self,
            data_file: Optional[str] = None,
            species: str = "Homo sapiens") -> None:
        """Transform the Drug Central interaction table into node/edge TSVs.

        Reads the gzipped, tab-separated interactions file from
        ``self.input_base_dir`` and writes ``self.output_node_file`` and
        ``self.output_edge_file``.  ``tcrd.zip`` (same directory) is unpacked
        to obtain the Tclin and Tchem tables, which are keyed on their
        ``uniprot`` column and used to flag each gene node.

        Node rows are ``[id, name, category, is_tclin, is_tchem]`` with the two
        flags written as the strings ``'True'``/``'False'``; drug nodes always
        carry ``'False'`` for both.

        Args:
            data_file: Name of the interactions file inside
                ``self.input_base_dir``; defaults to
                ``drug.target.interaction.tsv.gz``.
            species: Value the ``ORGANISM`` column must equal for a line to be
                ingested; lines with another or no ``ORGANISM`` are skipped.
        """
        if data_file is None:
            data_file = "drug.target.interaction.tsv.gz"
        interactions_file = os.path.join(self.input_base_dir, data_file)
        tclin_chem_zip_file = os.path.join(self.input_base_dir, "tcrd.zip")
        os.makedirs(self.output_dir, exist_ok=True)
        drug_node_type = "biolink:Drug"
        gene_curie_prefix = "UniProtKB:"
        drug_curie_prefix = "DrugCentral:"
        gene_node_type = "biolink:Gene"
        drug_gene_edge_label = "biolink:interacts_with"
        drug_gene_edge_relation = "RO:0002436"  # molecularly interacts with
        self.edge_header = [
            'subject', 'edge_label', 'object', 'relation', 'provided_by',
            'comment'
        ]

        # unzip tcrd.zip and load the tchem and tclin tables; the temporary
        # directory is removed as soon as both tables are in memory.
        # NOTE(review): assumes tsv_to_dict() reads the whole file before
        # returning - confirm.
        with tempfile.TemporaryDirectory() as tempdir:
            (tclin_file,
             tchem_file) = unzip_and_get_tclin_tchem(tclin_chem_zip_file,
                                                     tempdir)
            tclin_dict: dict = tsv_to_dict(tclin_file, 'uniprot')
            tchem_dict: dict = tsv_to_dict(tchem_file, 'uniprot')

        with open(self.output_node_file, 'w') as node, \
                open(self.output_edge_file, 'w') as edge, \
                gzip.open(interactions_file, 'rt') as interactions:

            node.write("\t".join(self.node_header) + "\n")
            edge.write("\t".join(self.edge_header) + "\n")

            header_items = parse_header(interactions.readline())

            for line in interactions:
                items_dict = parse_drug_central_line(line, header_items)

                if 'ORGANISM' not in items_dict or items_dict[
                        'ORGANISM'] != species:
                    continue

                # get gene ID(s); ACCESSION may hold several '|'-separated IDs
                try:
                    gene_id_string = get_item_by_priority(
                        items_dict, ['ACCESSION'])
                except ItemInDictNotFound:
                    # lines with no ACCESSION entry only contain drug info,
                    # no target info - not ingesting these
                    continue
                accessions = gene_id_string.split('|')

                # get drug ID
                drug_id = drug_curie_prefix + get_item_by_priority(
                    items_dict, ['STRUCT_ID'])

                # WRITE NODES
                # drug - ['id', 'name', 'category', is_tclin, is_tchem]
                write_node_edge_item(fh=node,
                                     header=self.node_header,
                                     data=[
                                         drug_id, items_dict['DRUG_NAME'],
                                         drug_node_type,
                                         str(False),
                                         str(False)
                                     ])

                for accession in accessions:
                    gene_id = gene_curie_prefix + accession
                    # Look up the flags with this gene's own (unprefixed)
                    # accession, so that each gene on a multi-accession line
                    # gets its own Tclin/Tchem status rather than the first
                    # accession's.
                    is_tclin = accession in tclin_dict
                    is_tchem = accession in tchem_dict

                    write_node_edge_item(fh=node,
                                         header=self.node_header,
                                         data=[
                                             gene_id, items_dict['GENE'],
                                             gene_node_type,
                                             str(is_tclin),
                                             str(is_tchem)
                                         ])

                    # WRITE EDGES
                    # ['subject', 'edge_label', 'object', 'relation',
                    #  'provided_by', 'comment']
                    write_node_edge_item(fh=edge,
                                         header=self.edge_header,
                                         data=[
                                             drug_id, drug_gene_edge_label,
                                             gene_id, drug_gene_edge_relation,
                                             self.source_name,
                                             items_dict['ACT_COMMENT']
                                         ])
Exemplo n.º 5
0
    def run(self) -> None:
        """Convert the Drug Central interaction table to node and edge TSVs.

        Reads ``drug.target.interaction.tsv.gz`` from ``self.input_base_dir``
        and, for every line that has an ``ACCESSION`` value, writes a drug
        node, a gene node and a drug-gene edge.  Lines without ``ACCESSION``
        are logged at INFO level and skipped.  ``self.edge_header`` is
        replaced with the five-column header used for the edge file.
        """
        source_path = os.path.join(self.input_base_dir,
                                   "drug.target.interaction.tsv.gz")
        os.makedirs(self.output_dir, exist_ok=True)

        drug_category = "biolink:Drug"
        gene_category = "biolink:Gene"
        edge_label = "biolink:interacts_with"
        edge_relation = "RO:0002436"  # molecularly interacts with
        self.edge_header = [
            'subject', 'edge_label', 'object', 'relation', 'comment'
        ]

        with open(self.output_node_file, 'w') as node_fh, \
                open(self.output_edge_file, 'w') as edge_fh, \
                gzip.open(source_path, 'rt') as source:

            # Header rows of both output files come first.
            node_fh.write("\t".join(self.node_header) + "\n")
            edge_fh.write("\t".join(self.edge_header) + "\n")

            columns = parse_header(source.readline())

            for raw_line in source:
                record = parse_drug_central_line(raw_line, columns)

                # The target (gene) identifier comes from ACCESSION; a line
                # without it carries drug information only and is not ingested.
                try:
                    target_id = get_item_by_priority(record, ['ACCESSION'])
                except ItemInDictNotFound:
                    logging.info(
                        "No gene information for this line:\n{}\nskipping".
                        format(raw_line))
                    continue

                # The drug identifier is the first available of these fields.
                drug_id_fields = [
                    'ACT_SOURCE_URL', 'MOA_SOURCE_URL', 'DRUG_NAME'
                ]
                compound_id = get_item_by_priority(record, drug_id_fields)

                # Nodes: ['id', 'name', 'category']
                drug_row = [compound_id, record['DRUG_NAME'], drug_category]
                write_node_edge_item(fh=node_fh,
                                     header=self.node_header,
                                     data=drug_row)

                gene_row = [target_id, record['GENE'], gene_category]
                write_node_edge_item(fh=node_fh,
                                     header=self.node_header,
                                     data=gene_row)

                # Edge: ['subject', 'edge_label', 'object', 'relation',
                #        'comment']
                edge_row = [
                    compound_id, edge_label, target_id, edge_relation,
                    record['ACT_COMMENT']
                ]
                write_node_edge_item(fh=edge_fh,
                                     header=self.edge_header,
                                     data=edge_row)