def gpi_to_gene_node_data(self, rec: dict) -> list:
        """Build the node row for one parsed GPI record.

        The row is laid out as ['id', 'name', 'category', 'synonym', 'taxon']
        and is meant to be passed to write_node_edge_item().

        :param rec: record from gpi iterator
        :return: list of node items, one for each thing in self.node_header
        """
        node_id: str = self._rec_to_id(rec)

        # first DB_Object_Name entry; stays '' when the lookup fails or the
        # returned value is None/empty
        label = ''
        try:
            names = get_item_by_priority(rec, ['DB_Object_Name'])
            if names is not None and len(names) > 0:
                label = names[0]
        except (IndexError, ItemInDictNotFound):
            label = ''

        node_category = self.protein_node_type

        # first DB_Object_Synonym entry, '' when there is none
        try:
            first_synonym = get_item_by_priority(rec, ['DB_Object_Synonym'])[0]
        except (IndexError, ItemInDictNotFound):
            first_synonym = ''

        # no fallback here: a failed Taxon lookup propagates to the caller
        taxon_value = get_item_by_priority(rec, ['Taxon'])

        return [node_id, label, node_category, first_synonym, taxon_value]
    def gpa_to_edge_data(self, rec: dict) -> list:
        """Turn one parsed GPA record into an edge row.

        The row starts with subject id, prefixed edge label, object, relation
        and self.source_name, followed by one value per annotation column
        listed below ('' whenever that column's lookup fails).

        :param rec: record from gpa iterator
        :return: list of edge items
        """
        annotation_columns = (
            'DB:Reference', 'ECO_Evidence_code', 'With',
            'Interacting_taxon_ID', 'Date', 'Assigned_by',
            'Annotation_Extension', 'Annotation_Properties',
        )

        def annotation_value(column):
            # list-valued columns contribute only their first element
            try:
                value = get_item_by_priority(rec, [column])
                if type(value) is list:
                    value = value[0]
            except (ItemInDictNotFound, IndexError):
                value = ''
            return value

        subject_id: str = self._rec_to_id(rec)
        qualifier: str = get_item_by_priority(rec, ['Qualifier'])[0]
        go_term: str = get_item_by_priority(rec, ['GO_ID'])

        # qualifiers without an entry in the RO lookup get an empty relation
        try:
            ro_term: str = self.edge_label_to_RO_term[qualifier]
        except KeyError:
            ro_term = ''

        row = [
            subject_id,
            self.edge_label_prefix + qualifier,
            go_term,
            ro_term,
            self.source_name,
        ]
        for column in annotation_columns:
            row.append(annotation_value(column))
        return row
Exemplo n.º 3
0
def items_dict_to_protein_data_dict(items_dict: dict) -> dict:
    """Split the pipe-separated protein columns of one line into per-protein rows.

    The ACCESSION, GENE and TDL values are each split on '|'. The result maps
    every accession to ``[accession, gene name, TDL value]``.

    :param items_dict: dictionary of data from a line, output by parse_drug_central_line
    :return: a dict with information about each protein
    """
    accessions, gene_names, tdl_values = (
        get_item_by_priority(items_dict, [column]).split('|')
        for column in ('ACCESSION', 'GENE', 'TDL')
    )
    n_proteins = len(accessions)

    if len(gene_names) != n_proteins:
        logging.warning(
            "Didn't get the same number of entries for protein_ids and gene_ids"
        )
        # names cannot be paired with accessions, so blank them all
        gene_names = [''] * n_proteins

    if len(tdl_values) != n_proteins:
        # this happens - repeat TDL designation for all protein IDs
        tdl_values = tdl_values * n_proteins

    protein_dict = defaultdict(list)
    # both companion lists are at least n_proteins long here, so zip stops
    # after exactly one row per accession
    for accession, gene, tdl in zip(accessions, gene_names, tdl_values):
        protein_dict[accession] = [accession, gene, tdl]
    return protein_dict
 def _rec_to_id(self, rec: dict) -> str:
     """Compose the "<DB>:<DB_Object_ID>" identifier for a record.

     :param rec: record to read the 'DB' and 'DB_Object_ID' items from
     :return: the composed id, or '' (after logging an error) when either
         item cannot be found
     """
     try:
         prefixed = get_item_by_priority(rec, ['DB']) + ":"
         return prefixed + get_item_by_priority(rec, ['DB_Object_ID'])
     except ItemInDictNotFound:
         logging.error("Can't make ID for record: %s", "\t".join(rec))
         return ''
Exemplo n.º 5
0
    def gpi_to_gene_node_data(self, rec: dict) -> list:
        """given a parsed gpi entry, return a node that can be passed to
        write_node_edge_item()

        The returned row is laid out as
        ['id', 'name', 'category', 'full_name', 'synonym', 'in_taxon',
        'xrefs', 'provided_by'].

        :param rec: record from gpi iterator
        :return: list of node items, one for each thing in self.node_header
        """

        def first_value(field: str) -> str:
            """Return the first entry of `field`, or '' if missing or empty."""
            try:
                values = get_item_by_priority(rec, [field])
                if values is None or len(values) == 0:
                    return ''
                if len(values) > 1:
                    logging.warning(
                        "Found >1 %s in rec, using the first one", field)
                return values[0]
            except (IndexError, ItemInDictNotFound):
                return ''

        node_id: str = self._rec_to_id(rec)

        # 'name' comes from the symbol and 'full_name' from the object name.
        # Each has its own fallback, so a missing symbol can neither leave
        # `name` unbound nor wipe out a full name that was found.
        full_name = first_value('DB_Object_Name')
        name = first_value('DB_Object_Symbol')

        category = self.protein_node_type
        try:
            synonym = get_item_by_priority(rec, ['DB_Object_Synonym'])
        except (IndexError, ItemInDictNotFound):
            synonym = ''

        # keep the second ':'-separated field of the Taxon value and re-prefix
        # it; a missing Taxon or a value without ':' propagates to the caller
        taxon = get_item_by_priority(rec, ['Taxon'])
        taxon = ":".join([self.ncbi_taxon_prefix, taxon.split(":")[1]])

        # cross references are optional; a list is flattened to 'a|b|c'
        xrefs = ''
        try:
            xrefs = get_item_by_priority(rec, ['DB_Xref'])
            if isinstance(xrefs, list):
                xrefs = "|".join(xrefs)
        except ItemInDictNotFound:
            pass

        return [
            node_id, name, category, full_name, synonym, taxon, xrefs,
            self.source_name
        ]
Exemplo n.º 6
0
 def get_gene_name(self, data: dict) -> str:
     """Return the first 'GENENAME' entry of `data`.

     :param data: dict to look the 'GENENAME' item up in
     :return: the first gene name, or '' (after logging a warning) when the
         item cannot be found
     """
     try:
         return get_item_by_priority(data, ['GENENAME'])[0]
     except ItemInDictNotFound:
         # the warning names the field that was actually looked up
         logging.warning("Problem with GENENAME for this target id %s", data)
         return ""
Exemplo n.º 7
0
 def get_targ_type(self, data: dict) -> str:
     """Return the first 'TARGTYPE' entry of `data`.

     :param data: dict to look the 'TARGTYPE' item up in
     :return: the first target type, or '' when the item cannot be found
     """
     try:
         return get_item_by_priority(data, ['TARGTYPE'])[0]
     except ItemInDictNotFound:
         # a missing target type is not reported, it is simply left blank
         return ""
Exemplo n.º 8
0
    def make_preferred_drug_id(self, pharmgkb_id: str,
                               drug_id_map: dict,
                               preferred_ids: dict = None,
                               pharmgkb_prefix: str = 'PHARMGKB') \
            -> str:
        """Given a drug id, convert it to a cross-referenced ID, in this order of
        preference:
         CHEBI > CHEMBL > DRUGBANK > PUBCHEM

        Falls back to "<pharmgkb_prefix>:<pharmgkb_id>" when the drug is not in
        drug_id_map, has no usable 'Cross-references' entry, or none of the
        preferred prefixes occurs among its cross references.

        :param pharmgkb_id: PharmGKB id of the drug
        :param drug_id_map: map of pharmgkb ids to cross-referenced IDs
        :param preferred_ids: dict of preferred ids in desc order of preference,
                'their string' -> 'canonical CURIE prefix'; None selects the
                built-in CHEBI/CHEMBL/DRUGBANK/PUBCHEM table
        :param pharmgkb_prefix: thing to prepend to pharmgkb id ('PHARMGKB')
        :return: preferred_id: preferred cross-referenced ID
        """
        if preferred_ids is None:
            # built per call so no dict object is shared between calls
            # NOTE(review): 'PubChem Compound:' ends in ':' while keys parsed
            # below never do - confirm against real 'Cross-references' data
            preferred_ids = {'ChEBI:CHEBI': 'CHEBI',
                             'CHEMBL': 'CHEMBL',
                             'DrugBank': 'DRUGBANK',
                             'PubChem Compound:': 'PUBCHEM'}

        fallback_id = pharmgkb_prefix + ":" + pharmgkb_id
        if pharmgkb_id not in drug_id_map:
            return fallback_id

        drug_entry = drug_id_map[pharmgkb_id]
        if 'Cross-references' not in drug_entry:
            logging.warning(
                "Can't find 'Cross-references' item in drug_id_map! "
                "Was it renamed?")
            return fallback_id

        map_string = drug_entry['Cross-references']
        if not map_string:
            # 'Cross-references' is empty
            return fallback_id

        # the following makes an atrocious string like
        # '"PREFIX1:1234', 'PREFIX2:3456'
        # into a dict I can pass to get_item_by_priority to look for preferred
        # ID
        xref_by_prefix: dict = {}
        for raw_xref in map_string.split(","):
            xref = re.sub(r'^"|"$', '', raw_xref)  # strip quotes
            source_prefix, separator, local_id = xref.rpartition(':')
            # rpartition always yields three parts; only an entry that really
            # contains ':' has a prefix worth recording
            if separator:
                xref_by_prefix[source_prefix] = local_id

        for source_prefix, curie_prefix in preferred_ids.items():
            try:
                return curie_prefix + ":" + get_item_by_priority(
                    xref_by_prefix, [source_prefix])
            except ItemInDictNotFound:
                continue

        return fallback_id
Exemplo n.º 9
0
 def get_uniproid(self, data: dict, name_2_id_map: dict,
                  uniprot_curie_prefix: str) -> str:
     """Return the first 'UNIPROID' entry of `data`, mapped when possible.

     :param data: dict to look the 'UNIPROID' item up in
     :param name_2_id_map: lookup from the raw entry to its replacement id
     :param uniprot_curie_prefix: prepended to a replacement id taken from
         name_2_id_map
     :return: prefix + mapped id when the entry is in name_2_id_map, the raw
         entry otherwise, or '' (after a warning) when the item is missing
     """
     resolved = ""
     try:
         resolved = get_item_by_priority(data, ['UNIPROID'])[0]
         # use uniprotkb accession if we can find it
         if resolved in name_2_id_map:
             mapped = name_2_id_map[resolved]
             resolved = uniprot_curie_prefix + mapped
     except ItemInDictNotFound:
         logging.warning(
             "Problem with UNIPROID for this target id {}".format(data))
     return resolved
Exemplo n.º 10
0
    def write_hpo_node(self, fh: TextIO, id: str, data: dict,
                       node_type: str) -> None:
        """Write one node row built from `data` to `fh`.

        The row is [id, name, node_type, comment, def]; each of 'name',
        'comment' and 'def' falls back to "" when it cannot be found in `data`.

        :param fh: file handle passed through to write_node_edge_item
        :param id: identifier placed in the first column
        :param data: dict holding the optional 'comment', 'def' and 'name' items
        :param node_type: value placed in the third column
        """
        # Try to get comments/def in case this is useful for ML
        optional = {}
        for key in ('comment', 'def', 'name'):
            try:
                optional[key] = get_item_by_priority(data, [key])
            except ItemInDictNotFound:
                optional[key] = ""

        row = [id, optional['name'], node_type,
               optional['comment'], optional['def']]
        write_node_edge_item(fh=fh, header=self.node_header, data=row)
Exemplo n.º 11
0
    def get_uniproids(self, data: dict, name_2_id_map: dict,
                      uniprot_curie_prefix: str) -> List[str]:
        """Collect prefixed ids for the names in the first 'UNIPROID' entry.

        :param data: dict to look the 'UNIPROID' item up in
        :param name_2_id_map: lookup from a name to its id; names without an
            entry are dropped
        :param uniprot_curie_prefix: prepended to every id that is returned
        :return: list of prefixed ids, empty (after a warning) when the
            'UNIPROID' item is missing
        """
        ids = []
        try:
            candidate_names = get_item_by_priority(data, ['UNIPROID'])[0]
            # use uniprotkb accession if we can find it
            ids.extend(
                uniprot_curie_prefix + name_2_id_map[candidate]
                for candidate in candidate_names
                if candidate in name_2_id_map
            )
        except ItemInDictNotFound:
            logging.warning("Problem with UNIPROID for this target id {}".format(data))
        return ids
Exemplo n.º 12
0
    def run(self) -> None:
        """Transform the Drug Central interaction file into node and edge TSVs.

        Reads "drug.target.interaction.tsv.gz" from self.input_base_dir and,
        for every line that has an ACCESSION entry, writes a drug node, a gene
        node and a drug -> gene edge. Lines without ACCESSION are logged and
        skipped.
        """
        source_path = os.path.join(self.input_base_dir,
                                   "drug.target.interaction.tsv.gz")
        os.makedirs(self.output_dir, exist_ok=True)

        self.edge_header = [
            'subject', 'edge_label', 'object', 'relation', 'comment'
        ]
        drug_category = "biolink:Drug"
        gene_category = "biolink:Gene"
        predicate = "biolink:interacts_with"
        ro_relation = "RO:0002436"  # molecularly interacts with

        with open(self.output_node_file, 'w') as node_out, \
                open(self.output_edge_file, 'w') as edge_out, \
                gzip.open(source_path, 'rt') as tsv_in:

            node_out.write("\t".join(self.node_header) + "\n")
            edge_out.write("\t".join(self.edge_header) + "\n")

            columns = parse_header(tsv_in.readline())

            for raw_line in tsv_in:
                record = parse_drug_central_line(raw_line, columns)

                try:
                    gene_id = get_item_by_priority(record, ['ACCESSION'])
                except ItemInDictNotFound:
                    # lines with no ACCESSION entry only contain drug info, no
                    # target info - not ingesting these
                    logging.info(
                        "No gene information for this line:\n{}\nskipping".
                        format(raw_line))
                    continue

                # first available of these columns becomes the drug id
                drug_id = get_item_by_priority(
                    record,
                    ['ACT_SOURCE_URL', 'MOA_SOURCE_URL', 'DRUG_NAME'])

                # nodes - ['id', 'name', 'category']
                drug_row = [drug_id, record['DRUG_NAME'], drug_category]
                write_node_edge_item(fh=node_out,
                                     header=self.node_header,
                                     data=drug_row)

                gene_row = [gene_id, record['GENE'], gene_category]
                write_node_edge_item(fh=node_out,
                                     header=self.node_header,
                                     data=gene_row)

                # edge - ['subject', 'edge_label', 'object', 'relation', 'comment']
                edge_row = [
                    drug_id, predicate, gene_id, ro_relation,
                    record['ACT_COMMENT']
                ]
                write_node_edge_item(fh=edge_out,
                                     header=self.edge_header,
                                     data=edge_row)
Exemplo n.º 13
0
    def run(self, data_file: str = None) -> None:
        """Method is called and performs needed transformations to process
        protein-protein interactions from the STRING DB data.

        For every interaction line this writes, once each, a node for both
        proteins, a gene node plus a gene -> protein edge for each protein
        found in self.protein_gene_map, and then the protein -> protein edge.
        All edge endpoints use the same "ENSEMBL:<id>" form as the node ids.

        Args:
            data_file: data file to parse; defaults to
                "9606.protein.links.full.v11.0.txt.gz" in self.input_base_dir

        Returns:
            None.

        """
        if not data_file:
            data_file = os.path.join(self.input_base_dir,
                                     "9606.protein.links.full.v11.0.txt.gz")
        os.makedirs(self.output_dir, exist_ok=True)
        protein_node_type = "biolink:Protein"
        edge_label = "biolink:interacts_with"
        self.node_header = compress_json.local_load("node_header.json")
        edge_core_header = compress_json.local_load("edge_core_header.json")
        edge_additional_headers = compress_json.local_load(
            "edge_additional_headers.json")

        self.edge_header = edge_core_header + edge_additional_headers
        relation = 'RO:0002434'
        seen_proteins: Set = set()
        seen_genes: Set = set()

        # Required to align the node edge header of the gene
        # with the default header
        extra_header = [""] * (len(edge_additional_headers) + 1)

        with open(self.output_node_file, 'w') as node, \
                open(self.output_edge_file, 'w') as edge, \
                gzip.open(data_file, 'rt') as interactions:

            node.write("\t".join(self.node_header) + "\n")
            edge.write("\t".join(self.edge_header) + "\n")

            header_items = parse_header(interactions.readline())
            for line in interactions:
                items_dict = parse_stringdb_interactions(line, header_items)
                proteins = []
                for protein_name in ('protein1', 'protein2'):
                    protein = get_item_by_priority(items_dict, [protein_name])
                    # drop everything up to the first '.' of the raw id
                    protein = '.'.join(protein.split('.')[1:])
                    proteins.append(protein)
                    protein_curie = f"ENSEMBL:{protein}"

                    if protein in self.protein_gene_map:
                        gene = self.protein_gene_map[protein]
                        if gene not in seen_genes:
                            seen_genes.add(gene)
                            ensemble_gene = f"ENSEMBL:{gene}"
                            ncbi_gene = self.ensembl2ncbi_map[gene]
                            gene_informations = self.gene_info_map[ncbi_gene]
                            write_node_edge_item(
                                fh=node,
                                header=self.node_header,
                                data=[
                                    ensemble_gene, gene_informations['symbol'],
                                    'biolink:Gene',
                                    gene_informations['description'],
                                    f"NCBIGene:{ncbi_gene}"
                                ])
                            # the object must equal the protein node id, so it
                            # carries the ENSEMBL: prefix as well
                            write_node_edge_item(
                                fh=edge,
                                header=self.edge_header,
                                data=[
                                    ensemble_gene,
                                    "biolink:has_gene_product",
                                    protein_curie,
                                    "RO:0002205",
                                    "NCBI",
                                ] + extra_header)

                    # write node data - for every protein, with or without a
                    # gene mapping, because the interaction edge below
                    # references it either way
                    if protein not in seen_proteins:
                        seen_proteins.add(protein)
                        write_node_edge_item(fh=node,
                                             header=self.node_header,
                                             data=[
                                                 protein_curie, "",
                                                 protein_node_type, "", ""
                                             ])

                # write edge data, using the same ids as the protein nodes
                write_node_edge_item(
                    fh=edge,
                    header=self.edge_header,
                    data=[
                        f"ENSEMBL:{proteins[0]}", edge_label,
                        f"ENSEMBL:{proteins[1]}", relation,
                        "STRING", items_dict['combined_score']
                    ] + [
                        items_dict.get(header, "")
                        for header in edge_additional_headers
                    ])
Exemplo n.º 14
0
    def run(self, data_file: Optional[str] = None) -> None:
        """Convert STRING protein-protein interactions into node/edge TSVs.

        Per interaction line: each protein gets (once) a protein node carrying
        an optional UniProtKB xref, each protein found in
        self.protein_gene_map gets (once per gene) a gene node and a
        gene -> protein edge, and finally the protein -> protein edge is
        written.

        Args:
            data_file: data file to parse; defaults to
                "9606.protein.links.full.v11.0.txt.gz" in self.input_base_dir

        Returns:
            None.

        """
        if not data_file:
            data_file = os.path.join(self.input_base_dir,
                                     "9606.protein.links.full.v11.0.txt.gz")
        os.makedirs(self.output_dir, exist_ok=True)

        self.node_header = compress_json.local_load("node_header.json")
        core_columns = compress_json.local_load("edge_core_header.json")
        score_columns = compress_json.local_load(
            "edge_additional_headers.json")
        self.edge_header = core_columns + score_columns

        protein_category = "biolink:Protein"
        interaction_predicate = "biolink:interacts_with"
        interaction_relation = 'RO:0002434'
        emitted_proteins: Set = set()
        emitted_genes: Set = set()

        # Required to align the node edge header of the gene
        # with the default header
        self.extra_header = [""] * (len(score_columns) + 1)

        # make string ENSP to Uniprot id mapping dict
        mapping_path = os.path.join(self.input_base_dir, UNIPROT_ID_MAPPING)
        string_to_uniprot = uniprot_make_name_to_id_mapping(mapping_path)

        with open(self.output_node_file, 'w') as node_out, \
                open(self.output_edge_file, 'w') as edge_out, \
                gzip.open(data_file, 'rt') as interactions:

            node_out.write("\t".join(self.node_header) + "\n")
            edge_out.write("\t".join(self.edge_header) + "\n")

            header_items = parse_header(interactions.readline())
            for line in interactions:
                items_dict = parse_stringdb_interactions(line, header_items)
                pair = []
                for column in ('protein1', 'protein2'):
                    string_id = get_item_by_priority(items_dict, [column])
                    # keep what follows the first '.' of the raw id
                    protein = string_id.partition('.')[2]
                    pair.append(protein)
                    protein_curie = f"ENSEMBL:{protein}"

                    if protein in self.protein_gene_map:
                        gene = self.protein_gene_map[protein]
                        if gene not in emitted_genes:
                            emitted_genes.add(gene)
                            gene_curie = f"ENSEMBL:{gene}"
                            ncbi_gene = self.ensembl2ncbi_map[gene]
                            gene_info = self.gene_info_map[ncbi_gene]
                            gene_row = [
                                gene_curie, gene_info['symbol'],
                                'biolink:Gene', gene_info['description'],
                                f"NCBIGene:{ncbi_gene}", self.source_name
                            ]
                            write_node_edge_item(fh=node_out,
                                                 header=self.node_header,
                                                 data=gene_row)
                            gene_edge_row = [
                                gene_curie, "biolink:has_gene_product",
                                protein_curie, "RO:0002205", "NCBI", ""
                            ]
                            write_node_edge_item(
                                fh=edge_out,
                                header=self.edge_header,
                                data=gene_edge_row + self.extra_header)

                    # write node data
                    if protein not in emitted_proteins:
                        emitted_proteins.add(protein)

                        # an equivalent Uniprot ID, when we have one, goes
                        # into the xref column of the protein node
                        xref = ''
                        if protein in string_to_uniprot:
                            xref = collapse_uniprot_curie(
                                f"UniProtKB:{string_to_uniprot[protein]}")

                        protein_row = [
                            protein_curie, "", protein_category, "",
                            xref, self.source_name
                        ]
                        write_node_edge_item(fh=node_out,
                                             header=self.node_header,
                                             data=protein_row)

                # write edge data
                interaction_row = [
                    f"ENSEMBL:{pair[0]}", interaction_predicate,
                    f"ENSEMBL:{pair[1]}", interaction_relation, "STRING",
                    "biolink:Association", items_dict['combined_score']
                ]
                interaction_row += [
                    items_dict.get(column, "") for column in score_columns
                ]
                write_node_edge_item(fh=edge_out,
                                     header=self.edge_header,
                                     data=interaction_row)
Exemplo n.º 15
0
    def run(self, data_file: str = None) -> None:
        """Method is called and performs needed transformations to process
        protein-protein interactions from the STRING DB data.

        For every interaction line this writes, in order: a gene node and a
        gene -> protein edge for each not-yet-seen gene of the two proteins,
        a node for each not-yet-seen protein, and the protein -> protein
        edge. All edge endpoints use the same "ENSEMBL:<id>" form as the
        node ids.

        Args:
            data_file: data file to parse; defaults to
                "9606.protein.links.full.v11.0.txt.gz" in self.input_base_dir

        Returns:
            None.

        """
        if not data_file:
            data_file = os.path.join(self.input_base_dir,
                                     "9606.protein.links.full.v11.0.txt.gz")
        os.makedirs(self.output_dir, exist_ok=True)
        protein_node_type = "biolink:Protein"
        edge_label = "biolink:interacts_with"
        self.node_header = ['id', 'name', 'category', 'description', 'alias']
        edge_core_header = [
            'subject', 'edge_label', 'object', 'relation', 'provided_by',
            'combined_score'
        ]
        edge_additional_headers = [
            'neighborhood', 'neighborhood_transferred', 'fusion',
            'cooccurence', 'homology', 'coexpression',
            'coexpression_transferred', 'experiments',
            'experiments_transferred', 'database', 'database_transferred',
            'textmining', 'textmining_transferred'
        ]
        self.edge_header = edge_core_header + edge_additional_headers
        relation = 'RO:0002434'
        # genes and proteins share one "already written" set; a set keeps the
        # per-line membership tests O(1) instead of scanning a growing list
        seen: set = set()
        blank_scores = [""] * len(edge_additional_headers)

        with open(self.output_node_file, 'w') as node, \
                open(self.output_edge_file, 'w') as edge, \
                gzip.open(data_file, 'rt') as interactions:

            node.write("\t".join(self.node_header) + "\n")
            edge.write("\t".join(self.edge_header) + "\n")

            header_items = parse_header(interactions.readline())
            for line in interactions:
                items_dict = parse_stringdb_interactions(line, header_items)

                proteins = []
                for column in ('protein1', 'protein2'):
                    raw_id = get_item_by_priority(items_dict, [column])
                    # drop everything up to the first '.' of the raw id
                    proteins.append('.'.join(raw_id.split('.')[1:]))

                # gene nodes and gene -> protein edges, once per gene
                for protein in proteins:
                    if protein not in self.protein_gene_map:
                        continue
                    gene = self.protein_gene_map[protein]
                    if not gene or gene in seen:
                        continue
                    ncbi_gene = self.ensembl2ncbi_map[gene]
                    gene_info = self.gene_info_map[ncbi_gene]
                    write_node_edge_item(
                        fh=node,
                        header=self.node_header,
                        data=[
                            f"ENSEMBL:{gene}", gene_info['symbol'],
                            'biolink:Gene', gene_info['description'],
                            f"NCBIGene:{ncbi_gene}"
                        ])
                    # the object must equal the protein node id, so it carries
                    # the ENSEMBL: prefix as well
                    write_node_edge_item(
                        fh=edge,
                        header=self.edge_header,
                        data=[
                            f"ENSEMBL:{gene}", "biolink:has_gene_product",
                            f"ENSEMBL:{protein}", "RO:0002205", "NCBI", ""
                        ] + blank_scores)
                    seen.add(gene)

                # write node data; both proteins are checked before either is
                # marked as seen
                for protein in proteins:
                    if protein not in seen:
                        write_node_edge_item(fh=node,
                                             header=self.node_header,
                                             data=[
                                                 f"ENSEMBL:{protein}", "",
                                                 protein_node_type, "", ""
                                             ])
                seen.update(proteins)

                # write edge data, using the same ids as the protein nodes
                edge_data = [
                    f"ENSEMBL:{proteins[0]}", edge_label,
                    f"ENSEMBL:{proteins[1]}", relation, "STRING",
                    items_dict['combined_score']
                ]
                edge_data.extend(
                    items_dict.get(x, "") for x in edge_additional_headers)

                write_node_edge_item(fh=edge,
                                     header=self.edge_header,
                                     data=edge_data)
Exemplo n.º 16
0
    def run(self,
            data_file: Optional[str] = None,
            species: str = "Homo sapiens") -> None:
        """Method is called and performs needed transformations to process the Drug
        Central data, additional information
        on this data can be found in the comment at the top of this script

        Reads "drug.target.interaction.tsv.gz" and "tcrd.zip" from
        self.input_base_dir. For every line of the requested species that has
        an ACCESSION entry, it writes a drug node plus, for each
        '|'-separated accession, a gene node and a drug -> gene edge.

        Args:
            data_file: not used by this method
            species: only lines whose ORGANISM column equals this value are
                ingested

        Returns:
            None.
        """

        interactions_file = os.path.join(self.input_base_dir,
                                         "drug.target.interaction.tsv.gz")
        tclin_chem_zip_file = os.path.join(self.input_base_dir, "tcrd.zip")
        os.makedirs(self.output_dir, exist_ok=True)
        drug_node_type = "biolink:Drug"
        gene_curie_prefix = "UniProtKB:"
        drug_curie_prefix = "DrugCentral:"
        gene_node_type = "biolink:Gene"
        drug_gene_edge_label = "biolink:interacts_with"
        drug_gene_edge_relation = "RO:0002436"  # molecularly interacts with
        self.edge_header = [
            'subject', 'edge_label', 'object', 'relation', 'provided_by',
            'comment'
        ]

        # unzip tcrd.zip and load the tchem and tclin tables; the scratch
        # directory is removed as soon as both tables are in memory
        # NOTE(review): assumes tsv_to_dict reads its file eagerly - confirm
        with tempfile.TemporaryDirectory() as tempdir:
            (tclin_file,
             tchem_file) = unzip_and_get_tclin_tchem(tclin_chem_zip_file,
                                                     tempdir)
            tclin_dict: dict = tsv_to_dict(tclin_file, 'uniprot')
            tchem_dict: dict = tsv_to_dict(tchem_file, 'uniprot')

        with open(self.output_node_file, 'w') as node, \
                open(self.output_edge_file, 'w') as edge, \
                gzip.open(interactions_file, 'rt') as interactions:

            node.write("\t".join(self.node_header) + "\n")
            edge.write("\t".join(self.edge_header) + "\n")

            header_items = parse_header(interactions.readline())

            for line in interactions:
                items_dict = parse_drug_central_line(line, header_items)

                if 'ORGANISM' not in items_dict or items_dict[
                        'ORGANISM'] != species:
                    continue

                # get gene ID
                try:
                    gene_id_string = get_item_by_priority(
                        items_dict, ['ACCESSION'])
                    gene_ids = gene_id_string.split('|')
                except ItemInDictNotFound:
                    # lines with no ACCESSION entry only contain drug info, no target
                    # info - not ingesting these
                    continue

                # get drug ID
                drug_id = drug_curie_prefix + get_item_by_priority(
                    items_dict, ['STRUCT_ID'])

                # WRITE NODES
                # drug - id, name, category, then the two Tclin/Tchem columns
                # (always False for a drug)
                write_node_edge_item(fh=node,
                                     header=self.node_header,
                                     data=[
                                         drug_id, items_dict['DRUG_NAME'],
                                         drug_node_type,
                                         str(False),
                                         str(False)
                                     ])

                for accession in gene_ids:
                    gene_id = gene_curie_prefix + accession
                    # classify each accession by its own membership, not by
                    # that of the first accession on the line
                    is_tclin = accession in tclin_dict
                    is_tchem = accession in tchem_dict

                    write_node_edge_item(fh=node,
                                         header=self.node_header,
                                         data=[
                                             gene_id, items_dict['GENE'],
                                             gene_node_type,
                                             str(is_tclin),
                                             str(is_tchem)
                                         ])

                    # WRITE EDGES
                    # ['subject', 'edge_label', 'object', 'relation', 'provided_by',
                    # 'comment']
                    write_node_edge_item(fh=edge,
                                         header=self.edge_header,
                                         data=[
                                             drug_id, drug_gene_edge_label,
                                             gene_id, drug_gene_edge_relation,
                                             self.source_name,
                                             items_dict['ACT_COMMENT']
                                         ])

        return None
Exemplo n.º 17
0
    def run(self,
            data_file: Optional[str] = None,
            species: str = "Homo sapiens") -> None:
        """Transform the Drug Central interactions file into node/edge TSVs.

        Additional information on this data can be found in the comment at
        the top of this script.

        The gzipped interactions file is read from ``self.input_base_dir``.
        Only rows whose ``ORGANISM`` column equals `species` are ingested.
        For each ingested row, a node row is written the first time a drug or
        protein is seen, and an edge row is written for every drug-protein
        pair on the row. Output goes to ``self.output_node_file`` and
        ``self.output_edge_file``; ``self.output_dir`` is created if missing.
        As a side effect, ``self.edge_header`` is (re)assigned.

        :param data_file: name of the gzipped TSV inside
            ``self.input_base_dir``; defaults to
            "drug.target.interaction.tsv.gz"
        :param species: value the ORGANISM column must equal for a row to be
            ingested
        :return: None
        """
        if data_file is None:
            data_file = "drug.target.interaction.tsv.gz"
        interactions_file = os.path.join(self.input_base_dir, data_file)
        os.makedirs(self.output_dir, exist_ok=True)

        drug_node_type = "biolink:Drug"
        uniprot_curie_prefix = "UniProtKB:"
        drug_curie_prefix = "DrugCentral:"
        protein_node_type = "biolink:Protein"
        drug_protein_edge_label = "biolink:molecularly_interacts_with"
        drug_protein_edge_relation = "RO:0002436"  # molecularly interacts with
        self.edge_header = [
            'subject', 'edge_label', 'object', 'relation', 'provided_by',
            'comment', 'type'
        ]

        with open(self.output_node_file, 'w') as node, \
                open(self.output_edge_file, 'w') as edge, \
                gzip.open(interactions_file, 'rt') as interactions:

            node.write("\t".join(self.node_header) + "\n")
            edge.write("\t".join(self.edge_header) + "\n")

            # first line of the interactions file is the column header
            header_items = parse_header(interactions.readline())

            # IDs already written to the node file, so that each drug/protein
            # node is emitted only once even if it occurs on many rows
            seen_proteins: set = set()
            seen_drugs: set = set()

            for line in interactions:
                items_dict = parse_drug_central_line(line, header_items)

                if 'ORGANISM' not in items_dict or items_dict[
                        'ORGANISM'] != species:
                    continue

                # get protein ID
                try:
                    protein_dict = items_dict_to_protein_data_dict(items_dict)
                except ItemInDictNotFound:
                    # lines with no ACCESSION entry only contain drug info, no
                    # target info - not ingesting these
                    continue
                except ValueError:
                    logging.error("Value error while parsing line")
                    continue

                # get drug ID
                drug_id = drug_curie_prefix + get_item_by_priority(
                    items_dict, ['STRUCT_ID'])

                # Write drug node
                if drug_id not in seen_drugs:
                    write_node_edge_item(
                        fh=node,
                        header=self.node_header,
                        data=[
                            drug_id,
                            items_dict['DRUG_NAME'],
                            drug_node_type,
                            '',  # TDL (not applicable for drugs)
                            self.source_name
                        ])
                    seen_drugs.add(drug_id)

                # each value unpacks to (uniprot id, name, TDL); the mapping's
                # keys are not needed here
                for uniprot_id, name, tdl in protein_dict.values():
                    protein_id = uniprot_curie_prefix + uniprot_id

                    # Write protein node
                    if protein_id not in seen_proteins:
                        write_node_edge_item(
                            fh=node,
                            header=self.node_header,
                            data=[
                                protein_id, name,
                                protein_node_type, tdl,
                                self.source_name
                            ])
                        seen_proteins.add(protein_id)

                    # Write edge - one per drug/protein pair on every row, not
                    # de-duplicated
                    write_node_edge_item(
                        fh=edge,
                        header=self.edge_header,
                        data=[
                            drug_id, drug_protein_edge_label, protein_id,
                            drug_protein_edge_relation, self.source_name,
                            items_dict['ACT_COMMENT'], 'biolink:Association'
                        ])