Example #1
def _get_term_from_tree(tree: ElementTree.ElementTree) -> Term:
    name = _find_text(tree, "name")
    description = _find_text(tree, "description")
    if description:
        description = description.strip().replace("\n", " ")
    identifier = _find_text(tree, "identifier")
    if identifier is None:
        raise ValueError(f"[{PREFIX}] record is missing an identifier")
    term = Term(
        reference=Reference(PREFIX, identifier, name),
        definition=description,
    )
    for experiment in tree.findall("experiments"):
        experiment_name = _find_text(experiment, "name")
        experiment_identifier = _find_text(experiment, "identifier")
        if experiment_identifier is None:
            continue
        term.append_relationship(
            has_part,
            Reference(
                "gwascentral.experiment",
                identifier=experiment_identifier,
                name=experiment_name,
            ),
        )
    return term
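The `_find_text` helper used throughout this example is not shown on this page. A minimal sketch consistent with how it is called here (returning the stripped text of a named child element, or None when the element or its text is absent) could be:

def _find_text(element, name):
    # Hypothetical helper, assumed rather than taken from the source:
    # wraps ElementTree's findtext() and normalizes empty strings to None.
    value = element.findtext(name)
    if value is None:
        return None
    value = value.strip()
    return value or None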
Example #2
def iter_terms() -> Iterable[Term]:
    """Iterate over terms for KEGG Genome."""
    errors = 0
    for kegg_genome in iter_kegg_genomes():
        xrefs = []
        if kegg_genome.taxonomy_id is not None:
            taxonomy_name = pyobo.get_name('ncbitaxon', kegg_genome.taxonomy_id)
            if taxonomy_name is None:
                errors += 1
                tqdm.write(f'could not find name for taxonomy:{kegg_genome.taxonomy_id}')
            xrefs.append(Reference(
                prefix='ncbitaxon',
                identifier=kegg_genome.taxonomy_id,
                name=taxonomy_name,
            ))

        term = Term(
            reference=Reference(
                prefix='kegg.genome',
                identifier=kegg_genome.identifier,
                name=kegg_genome.name,
            ),
            xrefs=xrefs,
        )
        yield term

    logger.info('[%s] unable to find %d taxonomy names in NCBI', KEGG_GENOME_PREFIX, errors)
Example #3
def _iter_genome_terms(
    *,
    list_pathway_path: str,
    link_pathway_path: str,
    kegg_genome: KEGGGenome,
) -> Iterable[Term]:
    terms = {}
    with open(list_pathway_path) as file:
        list_pathway_lines = [line.strip() for line in file]
    for line in list_pathway_lines:
        pathway_id, name = [part.strip() for part in line.split("\t")]
        pathway_id = pathway_id[len("path:"):]

        terms[pathway_id] = term = Term.from_triple(
            prefix=KEGG_PATHWAY_PREFIX,
            identifier=pathway_id,
            name=name,
        )

        # Annotate species information
        kegg_genome.annotate_term(term)

        # Annotate the non-species specific code
        _start = min(i for i, e in enumerate(pathway_id) if e.isnumeric())
        pathway_code = pathway_id[_start:]
        term.append_relationship(
            species_specific,
            Reference(prefix=KEGG_PATHWAY_PREFIX,
                      identifier=f"map{pathway_code}"),
        )

    for pathway_id, protein_ids in _get_link_pathway_map(
            link_pathway_path).items():
        term = terms.get(pathway_id)
        if term is None:
            tqdm.write(
                f"could not find kegg.pathway:{pathway_id} for {kegg_genome.name}"
            )
            continue
        for protein_id in protein_ids:
            term.append_relationship(
                has_part,
                Reference(
                    prefix=KEGG_GENES_PREFIX,
                    identifier=protein_id,
                ),
            )

    yield from terms.values()
Example #4
def get_terms(force: bool = False) -> Iterable[Term]:
    """Get terms."""
    # DDB ID	DDB_G ID	Name	UniProt ID
    uniprot_mappings = multisetdict(
        ensure_df(PREFIX, url=URL, force=force, name="uniprot_mappings.tsv", usecols=[1, 3]).values
    )

    terms = ensure_df(PREFIX, url=URL, force=force, name="gene_info.tsv")
    # GENE ID (DDB_G ID)	Gene Name	Synonyms	Gene products
    for identifier, name, synonyms, products in tqdm(terms.values):
        term = Term.from_triple(
            prefix=PREFIX,
            identifier=identifier,
            name=name,
        )
        if products and pd.notna(products) and products != "unknown":
            for synonym in products.split(","):
                term.append_synonym(synonym.strip())
        if synonyms and pd.notna(synonyms):
            for synonym in synonyms.split(","):
                term.append_synonym(Synonym(synonym.strip()))
        for uniprot_id in uniprot_mappings.get(identifier, []):
            if not uniprot_id or pd.isna(uniprot_id) or uniprot_id in {"unknown", "pseudogene"}:
                continue
            term.append_relationship(has_gene_product, Reference.auto("uniprot", uniprot_id))

        term.set_species(identifier="44689", name="Dictyostelium discoideum")
        yield term
Example #5
def _get_xref_df(version: str) -> Mapping[str, List[Reference]]:
    base_url = f"https://raw.githubusercontent.com/sorgerlab/famplex/{version}"
    xrefs_url = f"{base_url}/equivalences.csv"
    xrefs_df = ensure_df(PREFIX,
                         url=xrefs_url,
                         version=version,
                         header=None,
                         sep=",",
                         dtype=str)

    # Normalize nextprot families
    ns_remapping = {
        "NXP": "nextprot.family",
    }
    xrefs_df[0] = xrefs_df[0].map(lambda s: ns_remapping.get(s, s))
    xrefs_df[1] = [
        xref_identifier
        if xref_prefix != "nextprot.family" else xref_identifier[len("FA:"):]
        for xref_prefix, xref_identifier in xrefs_df[[0, 1]].values
    ]

    xrefs_df[0] = xrefs_df[0].map(normalize_prefix)
    xrefs_df = xrefs_df[xrefs_df[0].notna()]
    xrefs_df = xrefs_df[xrefs_df[0] != "bel"]
    return multidict(
        (identifier, Reference(xref_prefix, xref_identifier))
        for xref_prefix, xref_identifier, identifier in xrefs_df.values)
Example #6
def iter_terms(version: str) -> Iterable[Term]:
    """Iterate over terms for KEGG Genome."""
    errors = 0
    for kegg_genome in iter_kegg_genomes(version=version, desc="KEGG Genomes"):
        if kegg_genome.identifier in SKIP:
            continue
        term = Term.from_triple(
            prefix=KEGG_GENOME_PREFIX,
            identifier=kegg_genome.identifier,
            name=kegg_genome.name,
        )
        if kegg_genome.taxonomy_id is not None:
            taxonomy_name = pyobo.get_name("ncbitaxon",
                                           kegg_genome.taxonomy_id)
            if taxonomy_name is None:
                errors += 1
                logger.debug(
                    f"[{KEGG_GENOME_PREFIX}] could not find name for taxonomy:{kegg_genome.taxonomy_id}"
                )
            term.append_xref(
                Reference(
                    prefix="ncbitaxon",
                    identifier=kegg_genome.taxonomy_id,
                    name=taxonomy_name,
                ))
        yield term

    logger.info("[%s] unable to find %d taxonomy names in NCBI",
                KEGG_GENOME_PREFIX, errors)
Example #7
def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Iterate over terms from GWAS Central Phenotype."""
    for n in trange(1, 11000, desc=f"{PREFIX} download"):
        try:
            path = ensure_path(
                PREFIX,
                "phenotype",
                version=version,
                url=f"https://www.gwascentral.org/phenotype/HGVPM{n}?format=json",
                name=f"HGVPM{n}.json",
                force=force,
            )
        except OSError as e:
            tqdm.write(f"{n}: {e}")
            continue
        with open(path) as file:
            j = json.load(file)

        description = j.get("description")
        if description is not None:
            description = description.strip().replace("\n", " ")
        term = Term(
            reference=Reference(PREFIX, j["identifier"], j["name"]),
            definition=description,
        )
        yield term
Example #8
def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Iterate over DrugCentral terms."""
    url = f"https://unmtid-shinyapps.net/download/DrugCentral/{version}/structures.smiles.tsv"
    df = ensure_df(PREFIX, url=url, version=version, force=force)
    for smiles, inchi, inchi_key, drugcentral_id, drugcentral_name, cas in df.values:
        if pd.isna(smiles) or pd.isna(inchi) or pd.isna(inchi_key):
            logger.warning("missing data for drugcentral:%s", drugcentral_id)
            continue
        term = Term.from_triple(prefix=PREFIX,
                                identifier=drugcentral_id,
                                name=drugcentral_name)
        term.append_xref(Reference(prefix="inchikey", identifier=inchi_key))
        term.append_property("smiles", smiles)
        term.append_property("inchi", inchi)
        if pd.notna(cas):
            term.append_xref(Reference(prefix="cas", identifier=cas))
        yield term
Example #9
    def test_extract_definition(self):
        """Test extracting a definition."""
        expected_text = "Test Text."

        for s, expected_references in [
            (f'"{expected_text}"', []),
            (f'"{expected_text}" []', []),
            (f'"{expected_text}" [PMID:1234]', [Reference('pubmed', '1234')]),
            (f'"{expected_text}" [PMID:1234, PMID:1235]',
             [Reference('pubmed', '1234'),
              Reference('pubmed', '1235')]),
        ]:
            with self.subTest(s=s):
                actual_text, actual_references = _extract_definition(
                    s, prefix='chebi', identifier='XXX')
                self.assertEqual(expected_text, actual_text)
                self.assertEqual(expected_references, actual_references)
Example #10
    def test_extract_definition_with_escapes(self):
        """Test extracting a definition with escapes in it."""
        expected_text = '''The canonical 3' splice site has the sequence "AG".'''
        s = '''"The canonical 3' splice site has the sequence \\"AG\\"." [PMID:1234]'''
        actual_text, actual_references = _extract_definition(
            s, strict=True, prefix='chebi', identifier='XXX')
        self.assertEqual(expected_text, actual_text)
        self.assertEqual([Reference('pubmed', '1234')], actual_references)
Example #11
    def test_extract_definition_with_escapes(self):
        """Test extracting a definition with escapes in it."""
        expected_text = """The canonical 3' splice site has the sequence "AG"."""
        s = """"The canonical 3' splice site has the sequence \\"AG\\"." [PMID:1234]"""
        actual_text, actual_references = _extract_definition(
            s, strict=True, prefix="chebi", identifier="XXX")
        self.assertEqual(expected_text, actual_text)
        self.assertEqual([Reference("pubmed", "1234")], actual_references)
Example #12
    def test_extract_definition(self):
        """Test extracting a definition."""
        expected_text = "Test Text."

        for s, expected_references in [
            (f'"{expected_text}"', []),
            (f'"{expected_text}" []', []),
            (f'"{expected_text}" [PMID:1234]', [Reference("pubmed", "1234")]),
            (
                f'"{expected_text}" [PMID:1234, PMID:1235]',
                [Reference("pubmed", "1234"),
                 Reference("pubmed", "1235")],
            ),
        ]:
            with self.subTest(s=s):
                actual_text, actual_references = _extract_definition(
                    s, prefix="chebi", identifier="XXX")
                self.assertEqual(expected_text, actual_text)
                self.assertEqual(expected_references, actual_references)
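For context, the strings these tests parse follow the OBO flat-file definition clause: a quoted definition followed by a bracketed, comma-separated list of provenance xrefs, as in a `def:` line such as:

def: "Test Text." [PMID:1234, PMID:1235]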
Example #13
def get_terms() -> Iterable[Term]:
    """Get ComplexPortal terms."""
    df = get_df()

    df['aliases'] = df['aliases'].map(lambda s: s.split('|')
                                      if pd.notna(s) else [])
    df['members'] = df['members'].map(_parse_members)
    df['xrefs'] = df['xrefs'].map(_parse_xrefs)

    taxonomy_id_to_name = get_id_name_mapping('ncbitaxon')
    df['taxonomy_name'] = df['taxonomy_id'].map(taxonomy_id_to_name.get)

    slim_df = df[[
        'complexportal_id',
        'name',
        'definition',
        'aliases',
        'xrefs',
        'taxonomy_id',
        'taxonomy_name',
        'members',
    ]]
    it = tqdm(slim_df.values,
              total=len(slim_df.index),
              desc=f'mapping {PREFIX}')
    unhandled_xref_type = set()
    for complexportal_id, name, definition, aliases, xrefs, taxonomy_id, taxonomy_name, members in it:
        synonyms = [Synonym(name=alias) for alias in aliases]
        _xrefs = []
        provenance = []
        for reference, note in xrefs:
            if note == 'identity':
                _xrefs.append(reference)
            elif note == 'see-also' and reference.prefix == 'pubmed':
                provenance.append(reference)
            elif (note, reference.prefix) not in unhandled_xref_type:
                logger.debug(
                    f'unhandled xref type: {note} / {reference.prefix}')
                unhandled_xref_type.add((note, reference.prefix))

        term = Term(
            reference=Reference(prefix=PREFIX,
                                identifier=complexportal_id,
                                name=name),
            definition=definition.strip(),
            synonyms=synonyms,
            xrefs=_xrefs,
            provenance=provenance,
        )
        term.set_species(identifier=taxonomy_id, name=taxonomy_name)

        for reference, _count in members:
            term.append_relationship(has_part, reference)

        yield term
Example #14
    def test_extract_synonym(self):
        """Test extracting synonym strings."""
        iupac_name = SynonymTypeDef(id="IUPAC_NAME", name="IUPAC NAME")
        synonym_typedefs = {
            "IUPAC_NAME": iupac_name,
        }

        for synonym, s in [
            (
                Synonym(
                    name="LTEC I",
                    specificity="EXACT",
                    type=iupac_name,
                    provenance=[Reference("orphanet", "93938")],
                ),
                '"LTEC I" EXACT IUPAC_NAME [Orphanet:93938]',
            ),
            (
                Synonym(name="LTEC I",
                        specificity="EXACT",
                        provenance=[Reference("orphanet", "93938")]),
                '"LTEC I" EXACT [Orphanet:93938]',
            ),
            (
                Synonym(name="LTEC I",
                        specificity="EXACT",
                        provenance=[Reference("orphanet", "93938")]),
                '"LTEC I" [Orphanet:93938]',
            ),
            (
                Synonym(name="LTEC I", specificity="EXACT"),
                '"LTEC I" []',
            ),
        ]:
            with self.subTest(s=s):
                self.assertEqual(
                    synonym,
                    _extract_synonym(s,
                                     synonym_typedefs,
                                     prefix="chebi",
                                     identifier="XXX"),
                )
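These cases mirror the OBO flat-file synonym clause, where the quoted synonym text is followed by an optional specificity (e.g. EXACT), an optional synonym type, and a bracketed provenance list:

synonym: "LTEC I" EXACT IUPAC_NAME [Orphanet:93938]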
Example #15
def _parse_xrefs(s) -> List[Tuple[Reference, str]]:
    if pd.isna(s):
        return []

    rv = []
    for xref in s.split('|'):
        entity_id, note = xref.split('(')
        note = note.rstrip(')')
        prefix, identifier = entity_id.split(':', 1)
        rv.append((Reference(prefix=prefix, identifier=identifier), note))
    return rv
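A quick usage sketch for this parser; the input string below is illustrative, inferred from the split-on-'|' and trailing-'(note)' logic above rather than taken from real data:

pairs = _parse_xrefs('pubmed:1234(see-also)|go:0005634(identity)')
# pairs == [(Reference(prefix='pubmed', identifier='1234'), 'see-also'),
#           (Reference(prefix='go', identifier='0005634'), 'identity')]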
Example #16
def iter_terms(version: str) -> Iterable[Term]:
    """Iterate over DrugCentral terms."""
    df = ensure_df(PREFIX, url=URL, version=version)
    for smiles, inchi, inchi_key, drugcentral_id, drugcentral_name, cas in df.values:
        if pd.isna(smiles) or pd.isna(inchi) or pd.isna(inchi_key):
            logger.warning("missing data for drugcentral:%s", drugcentral_id)
            continue
        xrefs = [
            Reference(prefix="smiles", identifier=smiles),
            Reference(prefix="inchi", identifier=inchi),
            Reference(prefix="inchikey", identifier=inchi_key),
        ]

        if pd.notna(cas):
            xrefs.append(Reference(prefix="cas", identifier=cas))

        yield Term(
            reference=Reference(prefix=PREFIX, identifier=drugcentral_id, name=drugcentral_name),
            xrefs=xrefs,
        )
Example #17
    def test_extract_synonym(self):
        """Test extracting synonym strings."""
        iupac_name = SynonymTypeDef(id='IUPAC_NAME', name='IUPAC NAME')
        synonym_typedefs = {
            'IUPAC_NAME': iupac_name,
        }

        for synonym, s in [
            (
                Synonym(
                    name='LTEC I',
                    specificity='EXACT',
                    type=iupac_name,
                    provenance=[Reference('orphanet', '93938')],
                ),
                '"LTEC I" EXACT IUPAC_NAME [Orphanet:93938]',
            ),
            (
                Synonym(name='LTEC I',
                        specificity='EXACT',
                        provenance=[Reference('orphanet', '93938')]),
                '"LTEC I" EXACT [Orphanet:93938]',
            ),
            (
                Synonym(name='LTEC I',
                        specificity='EXACT',
                        provenance=[Reference('orphanet', '93938')]),
                '"LTEC I" [Orphanet:93938]',
            ),
            (
                Synonym(name='LTEC I', specificity='EXACT'),
                '"LTEC I" []',
            ),
        ]:
            with self.subTest(s=s):
                self.assertEqual(
                    synonym,
                    _extract_synonym(s,
                                     synonym_typedefs,
                                     prefix='chebi',
                                     identifier='XXX'))
Example #18
def _get_term_from_tree(tree: ElementTree.ElementTree) -> Term:
    name = tree.find('name').text
    description = tree.find('description').text
    if description:
        description = description.strip().replace('\n', ' ')
    identifier = tree.find('identifier').text
    term = Term(
        reference=Reference(PREFIX, identifier, name),
        definition=description,
    )
    for experiment in tree.findall('experiments'):
        experiment_name = experiment.find('name').text
        experiment_id = experiment.find('identifier').text
        term.append_relationship(
            has_part,
            Reference(
                'gwascentral.experiment',
                identifier=experiment_id,
                name=experiment_name,
            ))
    return term
Example #19
def _get_term_from_tree(tree: ElementTree.ElementTree) -> Term:
    name = tree.find("name").text
    description = tree.find("description").text
    if description:
        description = description.strip().replace("\n", " ")
    identifier = tree.find("identifier").text
    term = Term(
        reference=Reference(PREFIX, identifier, name),
        definition=description,
    )
    for experiment in tree.findall("experiments"):
        experiment_name = experiment.find("name").text
        experiment_id = experiment.find("identifier").text
        term.append_relationship(
            has_part,
            Reference(
                "gwascentral.experiment",
                identifier=experiment_id,
                name=experiment_name,
            ),
        )
    return term
Example #20
def _parse_members(s) -> List[Tuple[Reference, str]]:
    if pd.isna(s):
        return []

    rv = []
    for member in s.split('|'):
        entity_id, count = member.split('(')
        count = count.rstrip(')')
        if ':' in entity_id:
            prefix, identifier = entity_id.split(':', 1)
        else:
            prefix, identifier = 'uniprot', entity_id
        rv.append((Reference(prefix=prefix, identifier=identifier), count))
    return rv
Example #21
def _parse_members(s) -> List[Tuple[Reference, str]]:
    if pd.isna(s):
        return []

    rv = []
    for member in s.split("|"):
        entity_id, count = member.split("(")
        count = count.rstrip(")")
        if ":" in entity_id:
            prefix, identifier = entity_id.split(":", 1)
        else:
            prefix, identifier = "uniprot", entity_id
        rv.append((Reference(prefix=prefix, identifier=identifier), count))
    return rv
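A quick usage sketch; the member string below is illustrative, but it exercises both branches above: a bare identifier defaults to the uniprot prefix, while a CURIE is split on its first colon:

members = _parse_members("P12345(2)|chebi:15422(1)")
# members == [(Reference(prefix="uniprot", identifier="P12345"), "2"),
#             (Reference(prefix="chebi", identifier="15422"), "1")]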
Example #22
def get_terms(force: bool = False) -> Iterable[Term]:
    """Get CGNC terms."""
    df = ensure_df(PREFIX,
                   url=URL,
                   name=f"{PREFIX}.tsv",
                   force=force,
                   header=0,
                   names=HEADER)
    for i, (cgnc_id, entrez_id, ensembl_id, name, synonym_1, synonym_2, _,
            _) in enumerate(df.values):
        if pd.isna(cgnc_id):
            logger.warning(f"row {i} CGNC ID is none")
            continue

        try:
            int(cgnc_id)
        except ValueError:
            logger.warning(f"row {i} CGNC ID is not int-like: {cgnc_id}")
            continue

        term = Term.from_triple(
            prefix=PREFIX,
            identifier=cgnc_id,
            name=name,
        )
        term.set_species(identifier="9031", name="Gallus gallus")
        if entrez_id and pd.notna(entrez_id):
            term.append_xref(Reference(prefix="ncbigene",
                                       identifier=entrez_id))
        if pd.notna(ensembl_id):
            term.append_xref(Reference(prefix="ensembl",
                                       identifier=ensembl_id))
        if synonym_1 and pd.notna(synonym_1):
            term.append_synonym(synonym_1)
        if synonym_2 and pd.notna(synonym_2):
            term.append_synonym(synonym_2)
        yield term
Example #23
def iter_terms() -> Iterable[Term]:
    """Get ITIS terms."""
    zip_path = ensure_path(PREFIX, url=URL)
    version = _get_version()
    sqlite_dir = prefix_directory_join(PREFIX, version=version)
    sqlite_path = prefix_directory_join(PREFIX,
                                        name='ITIS.sqlite',
                                        version=version)
    if not os.path.exists(sqlite_path):
        with zipfile.ZipFile(zip_path) as zip_file:
            for x in zip_file.filelist:
                if x.filename.endswith('.sqlite'):
                    zip_file.extract(x, sqlite_dir)
                    shutil.move(
                        os.path.join(sqlite_dir, f'itisSqlite{version}',
                                     'ITIS.sqlite'), sqlite_path)
                    os.rmdir(os.path.join(sqlite_dir, f'itisSqlite{version}'))

    if not os.path.exists(sqlite_path):
        raise FileNotFoundError(f'file missing: {sqlite_path}')

    conn = sqlite3.connect(sqlite_path)

    with closing(conn.cursor()) as cursor:
        cursor.execute(LONGNAMES_QUERY)
        id_to_reference = {
            str(identifier): Reference(prefix=PREFIX,
                                       identifier=str(identifier),
                                       name=name)
            for identifier, name in cursor.fetchall()
        }

    with closing(conn.cursor()) as cursor:
        cursor.execute(HIERARCHY_QUERY)
        id_to_parents = multidict(
            (str(child), str(parent)) for child, parent in cursor.fetchall())

    for identifier, reference in id_to_reference.items():
        parents = []
        for parent_identifier in id_to_parents.get(identifier, []):
            if parent_identifier == '0':  # this means it's a plant
                continue
            parents.append(id_to_reference[parent_identifier])
        term = Term(
            reference=reference,
            parents=parents,
        )
        yield term
Example #24
def iter_terms(version: str) -> Iterable[Term]:
    """Iterate over ChEMBL compound's names."""
    op = get_path(version=version)
    logger.info('opening connection to %s', op)
    with closing(sqlite3.connect(op)) as conn:
        logger.info('using connection %s', conn)
        with closing(conn.cursor()) as cursor:
            logger.info('using cursor %s', cursor)
            cursor.execute(QUERY)
            for chembl_id, name in cursor.fetchall():
                # TODO add xrefs to smiles, inchi, inchikey here
                xrefs = []
                yield Term(
                    reference=Reference(prefix=PREFIX,
                                        identifier=chembl_id,
                                        name=name),
                    xrefs=xrefs,
                )
Example #25
def iter_terms(version: str) -> Iterable[Term]:
    """Iterate over ChEMBL compounds."""
    with chembl_downloader.connect(version=version) as conn:
        logger.info("using connection %s", conn)
        with closing(conn.cursor()) as cursor:
            logger.info("using cursor %s", cursor)
            cursor.execute(QUERY)
            for chembl_id, name, smiles, inchi, inchi_key in cursor.fetchall():
                # TODO add xrefs?
                term = Term.from_triple(prefix=PREFIX,
                                        identifier=chembl_id,
                                        name=name)
                if smiles:
                    term.append_property("smiles", smiles)
                if inchi:
                    term.append_property("inchi", inchi)
                if inchi_key:
                    term.append_xref(Reference("inchikey", inchi_key))
                yield term
Example #26
def iter_terms(version: str) -> Iterable[Term]:
    """Iterate over terms from GWAS Central Phenotype."""
    for n in trange(1, 11000, desc=f'{PREFIX} download'):
        try:
            path = ensure_path(
                PREFIX,
                'phenotype',
                version=version,
                url=f'https://www.gwascentral.org/phenotype/HGVPM{n}?format=json',
                name=f'HGVPM{n}.json',
            )
        except OSError as e:
            tqdm.write(f'{n}: {e}')
            continue
        with open(path) as file:
            j = json.load(file)
        description = j.get('description')
        if description is not None:
            description = description.strip().replace('\n', ' ')
        term = Term(
            reference=Reference(PREFIX, j['identifier'], j['name']),
            definition=description,
        )
        yield term
Example #27
def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Get ITIS terms."""
    zip_path = ensure_path(PREFIX, url=URL, force=force, version=version)
    sqlite_dir = prefix_directory_join(PREFIX, version=version)
    sqlite_path = prefix_directory_join(PREFIX, name="itis.sqlite", version=version)
    if not os.path.exists(sqlite_path):
        with zipfile.ZipFile(zip_path) as zip_file:
            for file in zip_file.filelist:
                if file.filename.endswith(".sqlite") and not file.is_dir():
                    zip_file.extract(file, sqlite_dir)
                    shutil.move(os.path.join(sqlite_dir, file.filename), sqlite_path)
                    os.rmdir(os.path.join(sqlite_dir, os.path.dirname(file.filename)))

    if not os.path.exists(sqlite_path):
        raise FileNotFoundError(f"file missing: {sqlite_path}")

    conn = sqlite3.connect(sqlite_path.as_posix())

    with closing(conn.cursor()) as cursor:
        cursor.execute(LONGNAMES_QUERY)
        id_to_reference = {
            str(identifier): Reference(prefix=PREFIX, identifier=str(identifier), name=name)
            for identifier, name in cursor.fetchall()
        }

    with closing(conn.cursor()) as cursor:
        cursor.execute(HIERARCHY_QUERY)
        id_to_parents = multidict((str(child), str(parent)) for child, parent in cursor.fetchall())

    for identifier, reference in id_to_reference.items():
        parents = []
        for parent_identifier in id_to_parents.get(identifier, []):
            if parent_identifier == "0":  # this means it's a plant
                continue
            parents.append(id_to_reference[parent_identifier])
        term = Term(
            reference=reference,
            parents=parents,
        )
        yield term
Example #28
def iter_terms() -> Iterable[Term]:
    """Get ITIS terms."""
    zip_path = ensure_path(PREFIX, URL)
    sqlite_path = prefix_directory_join(PREFIX, 'itisSqlite043020',
                                        'ITIS.sqlite')
    if not os.path.exists(sqlite_path):
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(get_prefix_directory(PREFIX))

    if not os.path.exists(sqlite_path):
        raise FileNotFoundError(f'file missing: {sqlite_path}')

    conn = sqlite3.connect(sqlite_path)

    with closing(conn.cursor()) as cursor:
        cursor.execute(LONGNAMES_QUERY)
        id_to_reference = {
            str(identifier): Reference(prefix=PREFIX,
                                       identifier=str(identifier),
                                       name=name)
            for identifier, name in cursor.fetchall()
        }

    with closing(conn.cursor()) as cursor:
        cursor.execute(HIERARCHY_QUERY)
        id_to_parents = multidict(
            (str(child), str(parent)) for child, parent in cursor.fetchall())

    for identifier, reference in id_to_reference.items():
        parents = []
        for parent_identifier in id_to_parents.get(identifier, []):
            if parent_identifier == '0':  # this means it's a plant
                continue
            parents.append(id_to_reference[parent_identifier])
        term = Term(
            reference=reference,
            parents=parents,
        )
        yield term
Example #29
def _parse_xrefs(s) -> List[Tuple[Reference, str]]:
    if pd.isna(s):
        return []

    rv = []
    for xref in s.split("|"):
        xref = xref.replace("protein ontology:PR:", "PR:")
        xref = xref.replace("protein ontology:PR_", "PR:")
        try:
            xref_curie, note = xref.split("(")
        except ValueError:
            logger.warning("xref missing (: %s", xref)
            continue
        note = note.rstrip(")")
        try:
            reference = Reference.from_curie(xref_curie)
        except ValueError:
            logger.warning("can not parse CURIE: %s", xref)
            continue
        if reference is None:
            logger.warning("reference is None after parsing: %s", xref)
            continue
        rv.append((reference, note))
    return rv
Example #30
def _parse_xrefs(s) -> List[Tuple[Reference, str]]:
    if pd.isna(s):
        return []

    rv = []
    for xref in s.split('|'):
        xref = xref.replace('protein ontology:PR:', 'PR:')
        xref = xref.replace('protein ontology:PR_', 'PR:')
        try:
            xref_curie, note = xref.split('(')
        except ValueError:
            logger.warning('xref missing (: %s', xref)
            continue
        note = note.rstrip(')')
        try:
            reference = Reference.from_curie(xref_curie)
        except ValueError:
            logger.warning('cannot parse CURIE: %s', xref)
            continue
        if reference is None:
            logger.warning('reference is None after parsing: %s', xref)
            continue
        rv.append((reference, note))
    return rv