Example #1
def main(data_directory):
    input_endpoint_path = data_directory / INPUT_ENDPOINT_FILE
    input_ontology_path = data_directory / INPUT_ONTOLOGY_FILE
    output_path = data_directory / OUTPUT_FILE
    prechecks(input_endpoint_path, input_ontology_path, output_path)

    endpoint_doids, endpoint_mesh = map_endpoint_doids_mesh(
        input_endpoint_path)
    all_endpoints = set(endpoint_doids)
    all_endpoints = all_endpoints.union(endpoint_mesh)

    logger.info("Parsing ontology file to map EFO->[]DOIDS")
    path = str(input_ontology_path)  # 'Ontology' takes only str as input
    ontology = Ontology(path)

    efo_doids = map_efo_doids(ontology)
    endpoint_efos = map_endpoint_efos(endpoint_doids, efo_doids)
    endpoint_refs = get_endpoint_refs(all_endpoints, ontology, endpoint_efos,
                                      endpoint_mesh)

    # Merge all the references into one coherent data structure
    out = merge(endpoint_doids, endpoint_mesh, endpoint_refs)

    logger.info(f"Writing endpoint refs to file {output_path}")
    with open(output_path, "x") as f:
        json.dump(out, f)

    logger.info("Done.")
Example #2
File: common.py Project: lifeomic/termlink
def _get_relationships(uri, system):
    """Parses a list of `Relationship` objects

    Args:
        uri:    a URI for the ontology file on the local filesystem
        system: the target system

    Returns:
        yields relationships
    """
    ontology = Ontology(uri.path)

    # child to parent relationships
    for term in ontology:
        for child in term.children:
            yield _to_relationship(child, "subsumes", term, system)

    # parent to child relationships
    for term in ontology:
        for parent in term.parents:
            yield _to_relationship(parent, "specializes", term, system)

    for term in ontology:
        for scope, references in term.other.items():
            if scope in _SCOPE_TO_EQUIVALENCE:
                for reference in references:
                    relationship = _to_equivalence_from_scope(scope)
                    yield _to_relationship(term, relationship,
                                           ontology[reference], system)
            if scope in _SCOPE_TO_INVERSE_SCOPE:
                inverse = _to_inverse_scope(scope)
                for reference in references:
                    relationship = _to_equivalence_from_scope(inverse)
                    yield _to_relationship(ontology[reference], relationship,
                                           term, system)
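A minimal way to drive this generator, assuming the module above is importable and the ontology file sits on the local filesystem (the path and system label below are placeholders):

from urllib.parse import urlparse

# Hypothetical invocation; the .obo path and "example-system" are placeholder values.
uri = urlparse("file:///tmp/hp.obo")
for relationship in _get_relationships(uri, "example-system"):
    print(relationship)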
Example #3
def convert_to_json(input="GFOP.owl", output="GFOP.json"):
    # create a root to bundle everything
    root: Node = Node("GFOP")
    nodes = {}

    # read owl file and cache all nodes in a dict{name, node}
    obo = Ontology(input)
    for term in obo.terms():
        id = term.id
        name = term.name
        # find parents in distance 1 (exclude self)
        parent_terms = term.superclasses(with_self=False, distance=1).to_set()
        if parent_terms is None or len(parent_terms) == 0:
            # create root node
            nodes[name] = Node(name, id=id)
        else:
            # currently only uses one parent
            parent = parent_terms.pop()
            nodes[name] = Node(name,
                               id=id,
                               parent_id=parent.id,
                               parent_name=parent.name)

    # link all nodes to their parents
    for key, node in nodes.items():
        if key != root.name:
            try:
                # find parent in cached nodes and set to node
                node.parent = nodes[node.parent_name]
            except AttributeError:
                # no parent - add to root
                node.parent = root

    # generate json string
    exporter = JsonExporter(indent=2, sort_keys=True)
    json = exporter.export(root)

    # print json and tree for debugging
    print(json)

    for pre, _, node in RenderTree(root):
        print("%s%s" % (pre, node.name))

    # export to json file
    print("Writing to {}".format(output))
    with open(output, "w") as file:
        print(json, file=file)
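A usage sketch, assuming GFOP.owl is present in the working directory; anytree's JsonImporter (the counterpart of the JsonExporter used above) can read the result back:

from anytree.importer import JsonImporter

# Hypothetical round trip; the file names are the defaults from the signature above.
convert_to_json(input="GFOP.owl", output="GFOP.json")
with open("GFOP.json") as f:
    tree_root = JsonImporter().import_(f.read())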
Example #4
    def validate(self, mzqc: MzQcFile):
        # Semantic validation of the JSON file.
        # Load the mzqc file specific ontologies
        cvs: Dict[str, TermList] = dict()
        for cv in mzqc.controlled_vocabularies:
            try:
                cvs[cv.ref] = Ontology(cv.uri, False) 
            except Exception:
                raise SemanticError(f'Failed to load cv {cv.name} from {cv.uri}. Does {cv.ref} exist?')
        
        # For all cv terms involved:
        for cv_parameter in self._get_cv_parameters(mzqc):
            # Verify that cvRefs are valid.
            if cv_parameter.cvRef not in cvs:
                raise SemanticError(f'Unknown CV reference <{cv_parameter.cvRef}> in '
                                    f'element `{str(type(cv_parameter))}`')

            # Verify that the term exists in the CV.
            cv_term = cvs[cv_parameter.cvRef].get(cv_parameter.accession)
            if cv_term is None:
                raise SemanticError(f'Term {cv_parameter.name} not found in CV <{cv_parameter.cvRef}>')

            # Verify that the term name is correct.
            elif cv_parameter.name != cv_term.name:
                raise SemanticError(
                    f'Incorrect name for CV term {cv_parameter.accession}: '
                    f'"{cv_parameter.name}" != "{cv_term.name}"')

        # Regarding metadata, verify that input files are consistent and unique.
        self._inputFileConsistency(mzqc.run_qualities)
        self._inputFileConsistency(mzqc.set_qualities)

        # For all metrics (which are basing on cv type)
        #run_and_set_quality_collection: List[BaseQuality] = list()
        #for run_or_set_quality in run_and_set_quality_collection:
        if "Proteomics Standards Initiative Quality Control Ontology" not in [cv.name for cv in cvs.values()]:
            raise SemanticError(f'Quality Control Ontology missing!')
        else:
            keys = [filter( lambda x: cvs[x].name == "Proteomics Standards Initiative Quality Control Ontology", cvs )]
            if len(keys) != 1:
                SemanticError('More than one QC CV.')
            else:    
                qc_ref = keys[0]
            metric_cvs: List[Term] = cvs[qc_ref]["QC:4000001"].rchildren()
            
        for run_or_set_quality in chain(mzqc.run_qualities,mzqc.set_qualities):
            # Verify that quality metrics are unique within a run/setQuality.
            accessions: Set[str] = set()
            for quality_metric in run_or_set_quality.quality_metrics:
                if quality_metric.accession not in accessions:
                    accessions.add(quality_metric.accession)
                else:
                    raise ValidationError(f'Duplicate quality metric: '
                                          f'accession = {quality_metric.accession}')

                # Verify that quality_metric actually is of metric type/relationship?
                cv_term = cvs[quality_metric.cvRef].get(quality_metric.accession)
                if cv_term is None or cv_term not in metric_cvs:
                    raise SemanticError(f'Non-metric CV used in metric context.')
Example #5
def _multiparse(multi_in, return_list):
    print('Parsing file: {}'.format(multi_in[0]))
    parser = multi_in[1][multi_in[0].split(os.path.extsep)[-1]]
    ont = multi_in[2][multi_in[0].split(os.path.extsep)[-1]]
    ont = Ontology("C:/Users/tnl495/appveyor-py-test/mzml2isa/psi-ms.obo", False)
    p = parser(multi_in[0], ont).meta_isa
    #p = parser(multi_in[0]).meta_isa
    #print p
    return_list.append(p)
    return p
Example #6
def extract_concepts_from_ontologies(ontologies):
    # Extracts concepts from ontologies, given a list of ontologies
    all_concepts = []
    for ontology in ontologies:
        concepts = [
            split_by_camel_case(term)
            for term in Ontology(ontology).terms.keys()
        ]
        all_concepts.extend(concepts)
    return all_concepts
Example #7
def get_term2id(
    ontology: Ontology
) -> typing.Tuple[typing.Dict[str, str], typing.List[str]]:
    terms = []
    term2id = {}
    for term in ontology.terms():
        if term.name:
            terms.append(term.name)
            term2id[term.name] = term.id
    return term2id, terms
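A brief usage sketch, mirroring how this helper is called in Example #24 below with a local copy of the Cell Ontology:

from pronto import Ontology

# Assumes a local cl-basic.obo file, as in the later example.
cl = Ontology('ontologies/cl-basic.obo')
cl_term2id, cl_terms = get_term2id(cl)
print(len(cl_terms), 'named terms')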
Example #8
def load_mondo_graph(lan='en', use_cache=True):
    fn_cache = os.path.join(PATH_CACHE, F'mondo-{lan}.gfx')
    if use_cache:
        if os.path.isfile(fn_cache):
            return MondoGraph.load(fn_cache)
    fn_obo = os.path.join(PATH_FILES, F'mondo-{lan}.obo')
    mondo = Ontology(fn_obo)
    gfx = MondoGraph(mondo)
    gfx.save(fn_cache)
    return gfx
Example #9
def load_hpo_graph(lan='en', use_cache=True):
    fn_cache = os.path.join(PATH_CACHE, F'hp-{lan}.gfx')
    if use_cache:
        if os.path.isfile(fn_cache):
            return HPOGraph.load(fn_cache)
    fn_obo = os.path.join(PATH_FILES, F'hp-{lan}.obo')
    hpo = Ontology(fn_obo)
    gfx = HPOGraph(hpo)
    gfx.save(fn_cache)
    return gfx
Example #10
def ncbi(args: argparse.Namespace) -> None:
    gff = GFF.parse(args.infile).break_bubbles()
    so = Ontology.from_obo_library(args.so)

    name_to_so = {term.name: term for term in so.values()}

    add_so_as_ontologies(gff, name_to_so)
    add_ncrna_types(gff, name_to_so, so, NCRNA_TYPES)
    add_pseudogene_types(gff, name_to_so, so, PSEUDOGENE_TYPES)

    return
Example #11
def _multiparse(filepath, metalist, win):
    dirname = os.path.dirname(os.path.realpath(__file__))
    if not any(x in sys.argv for x in ('-h', '--help', '--version')):
        _ms = Ontology(os.path.join(dirname, "psi-ms.obo"), False)
        _ims = Ontology(os.path.join(dirname, "imagingMS.obo"), False)
        _ims.terms.update(_ms.terms)
    else:
        _ms, _ims = None, None
    # _ims.merge(_ms)  # would fail on None; merging is handled by the terms.update() call above

    PARSERS = {'mzML': mzml.mzMLmeta, 'imzML': mzml.imzMLmeta}

    ONTOLOGIES = {'mzML': _ms, 'imzML': _ims}

    print('Parsing file: {}'.format(filepath))
    parser = PARSERS[filepath.split(os.path.extsep)[-1]]
    ont = ONTOLOGIES[filepath.split(os.path.extsep)[-1]]

    meta = parser(filepath, ont).meta

    metalist.append(meta)
Example #12
    def expand_ontologies(self, to_strings=False):
        self = copy(self)
        for name, (ontology_path, id_mapper) in self.ontologies_data.items():
            ontology = Ontology(ontology_path)
            converter = (lambda entry: entry.name) if to_strings else identity
            self[name] = Series(
                tuple({
                    converter(ontology[id_mapper(ontology_id)])
                    for ontology_id in row.mapped_ontology_terms
                    if id_mapper(ontology_id) in ontology
                }) for row in self.itertuples()).values
        return MetaSRA(self)
Example #13
def _multiparse(multi_in):
    #from mzml2isa import mzml
    #from pronto import Ontology

    print('Parsing file: {}'.format(multi_in[0]))
    parser = multi_in[1][multi_in[0].split(os.path.extsep)[-1]]
    #ont = multi_in[2][multi_in[0].split(os.path.extsep)[-1]]
    ont = Ontology("C:/Users/tnl495/appveyor-py-test/mzml2isa/psi-ms.obo",
                   False)
    p = parser(multi_in[0], ont).meta_isa
    #p = parser(multi_in[0]).meta_isa
    #print p
    return p
Example #14
    def createSubsetFor(self, classes_in):
        self.classes_in = set(classes_in)
        print(len(classes_in))

        with open("classes_in.txt", 'w') as outfile:
            for c in self.classes_in:
                outfile.writelines(c + "\n")

        rw = robot_wrapper.RobotWrapper(
            robotcmd='/Users/hastingj/Work/Onto/robot/robot')

        #get_ontology_cmd = 'curl -L http://purl.obolibrary.org/obo/chebi.obo > chebi.obo'
        #rw.__executeCommand__(get_ontology_cmd)

        extract_cmd = [
            rw.robotcmd, "extract --method MIREOT ", "--input chebi.obo",
            "--lower-terms classes_in.txt", "--intermediates minimal",
            "--output chebi-slim.obo"
        ]

        rw.__executeCommand__(" ".join(extract_cmd))

        self.chebislim = Ontology("chebi-slim.obo")
Example #15
    def get_ont(self):
        """
        Return an Ontology object that is ready to use.
        """
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")

            try:
                return Ontology(self.ontology_link)
            except ValueError:
                # Fail to parse
                raise exceptions.FailToParseOntologyException()
            except Exception:
                # from None suppresses all previous exceptions.
                raise exceptions.NoInternetConnectionException() from None
Example #16
def ontologyExtractor(ontName):
    '''
        Extracts raw (property, entity) RDF triples from ontology given entity name
    '''

    ont = Ontology(ontName)
    ontology = []
    for term in list(ont.terms()):
        subclasses = list(term.subclasses())
        ontology.extend([(subclass.name, "subclass_of", term.name)
                         for subclass in subclasses])
    ontology = [
        tuple(el.split("\t")) for el in list(
            set([
                "\t".join(relation) for relation in ontology
                if relation[0] != relation[-1]
            ]))
    ]
    ontology = [
        ont for ont in ontology if ont[0] == "Pizza" or ont[-1] == "Pizza"
    ]
    ontology = [("hypernym", el[-1]) if el[0] == "Pizza" else
                ("hyponym", el[0] + " Pizza") for el in ontology]
    return ontology
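A usage sketch, assuming a local copy of the Pizza ontology (the hard-coded "Pizza" filter above implies that input):

# Hypothetical invocation; pizza.owl is a placeholder path.
relations = ontologyExtractor("pizza.owl")
for kind, concept in relations[:5]:
    print(kind, concept)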
Example #17
def ontology_from_obo_library(ontology_short_name: str) -> Ontology:
    """parse an ontology

    This is taken directly from pronto Ontology right 
    now. Parse an OBO, JSON-graph, or OWL format ontology.
    
    Parameters
    ==========
    ontology_short_name: str 
        The short name (cl.obo for cell line, ncit.obo for NCIT, etc.)
        
    Return
    ======
    A pronto Ontology object
    """
    ont = Ontology.from_obo_library(ontology_short_name)
    return ont
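A usage sketch for the wrapper above; from_obo_library fetches the file from the OBO Library, so network access is assumed:

# Fetch the Cell Ontology by its short name and print a few terms.
cl = ontology_from_obo_library("cl.obo")
for term in list(cl.terms())[:5]:
    print(term.id, term.name)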
Example #18
def extract_therapeutic_areas_from_owl() -> pd.DataFrame:
    """
    Parse the EFO OTAR SLIM OWL file into a dataframe of all EFO IDs and their therapeutic areas.
    """

    owl_url = fetch_otar_owl_from_github('latest')
    efo_terms = Ontology(owl_url, timeout=10).terms()
    owl_parsed = []

    for term in efo_terms:
        # The TAs are extracted by iterating through the ancestors of a term and looking up if it's in THERAPEUTIC_AREAS
        therapeutic_areas = []
        for ancestor in term.superclasses():
            ancestor_id = normalise_ontology_identifier(ancestor.id)
            if ancestor_id in THERAPEUTIC_AREAS['id']:
                therapeutic_areas.append(ancestor_id)

        efo_id = normalise_ontology_identifier(term.id)
        owl_parsed.append((efo_id, therapeutic_areas))

    return pd.DataFrame(owl_parsed, columns=['efo_id', 'therapeutic_areas'])
Example #19
def extractHypernymsFromOntology(ontology):
    ont = Ontology(ontology)
    allConcepts = []
    listid = []
    dictelem = {}
    for term in ont:
        allConcepts.append(term)
        if term.children:
            a = str(term).split(":")
            b = a[0]
            listid.append(b[1:])
    for x in range(0, len(listid)):
        key = listid[x]
        if key in dictelem:
            # Each term is visited only once above, so a repeated id carries no new children.
            continue
        childs = ont[listid[x]].children
        all_childs = ""
        for y in childs:
            z = str(y).split(":")
            f = z[0]
            all_childs += f[1:] + ","
        dictelem[key] = all_childs

    
    finalDict = {}

    for elem in dictelem:
        newelem = camel_case_split(elem)
        ls = dictelem[elem].split(",")[:-1]
        newval = ",".join([camel_case_split(el) for el in ls])
        finalDict[newelem] = newval

    hypernymsList = []
    for elem in finalDict:
        hypernymsList.extend([(elem, val) for val in finalDict[elem].split(",")])
    
    return (hypernymsList, allConcepts)
Example #20
    def get_relationships(self):
        """Parses a list of `Relationship` objects

        Returns:
            yields `Relationship`s in JSON form
        """
        ontology = Ontology(self.uri.path)

        # child to parent relationships
        for term in ontology:
            for child in term.children:
                yield _to_relationship(child, "subsumes", term)

        # parent to child relationships
        for term in ontology:
            for parent in term.parents:
                yield _to_relationship(parent, "specializes", term)

        # alt_id relationships:
        if not self.skip_alt_ids:
            for term in ontology:
                for other, values in term.other.items():
                    if other == 'alt_id':
                        for value in values:
                            target = Term(id=value,
                                          name=term.name,
                                          desc=term.desc)
                            yield _to_relationship(term, "equal", target)

        # synonym relationships
        if not self.skip_synonyms:
            for term in ontology:
                for synonym in term.synonyms:
                    target = Term(id=term.id,
                                  name=synonym.desc,
                                  desc=term.desc)
                    equivalence = _to_equivalence_from_scope(synonym.scope)
                    yield _to_relationship(term, equivalence, target)
Example #21
import numpy as np
import os
from pronto import Ontology
from collections import Counter
from transformers import AutoTokenizer, AutoModel
import torch
from scipy import spatial
import json

# File paths need to be set here.
# They are not passed via argument to the program.

_dev_path = "BioNLP-OST-2019_BB-norm_dev/"
_train_path = "BioNLP-OST-2019_BB-norm_train/"
_test_path = "BioNLP-OST-2019_BB-norm_test/"
_obo_base = Ontology("OntoBiotope_BioNLP-OST-2019.obo")

_train_set_output_folder = "train_results/"
_dev_set_output_folder = "dev_results/"
_test_set_output_folder = "test_results/"

# -2 at the end excludes LICENCE and README files.
_dev_files = sorted(list(os.walk(_dev_path))[0][2])[:-2]
_train_files = sorted(list(os.walk(_train_path))[0][2])[:-2]
_test_files = sorted(list(os.walk(_test_path))[0][2])[:-2]

###########################################################################
#                           DATA STRUCTURES                               #
###########################################################################

Example #22
import mzml2isa
import mzml2isa.isa as isa
import mzml2isa.mzml as mzml
from mzml2isa.versionutils import longest_substring


_PARSERS = {'mzML': mzml.mzMLmeta,
           'imzML': mzml.imzMLmeta}

# change the ontology and start extracting imaging specific metadata
warnings.simplefilter('ignore')
dirname = os.path.dirname(os.path.realpath(__file__))

if not any(x in sys.argv for x in ('-h', '--help', '--version')):
    _ms = Ontology(os.path.join(dirname, "psi-ms.obo"), False)
    _ims = Ontology(os.path.join(dirname, "imagingMS.obo"), False)
    _ims.terms.update(_ms.terms)
else:
    _ms, _ims = None, None
#_ims.merge(_ms)


_ONTOLOGIES = {'mzML': _ms,
               'imzML': _ims }
del dirname


def _multiparse(filepath):
    print('Parsing file: {}'.format(filepath))
    parser = _PARSERS[filepath.split(os.path.extsep)[-1]]
Example #23
import spacy
import string
import csv
from pronto import Ontology
onto_path = '/content/drive/My Drive/bio_files/OntoBiotope_BioNLP-OST-2019 (1).obo'
onto = Ontology(onto_path)


def remove_stopwords(sentence):
    non_stop_words = [word.text for word in sentence if not word.is_stop]
    return nlp(' '.join(non_stop_words))


def remove_non_ascii(text):
    return nlp(' '.join([
        token.text for token in text
        if all([letter in string.ascii_letters for letter in token.text])
    ]))


def load_entities(entities_loc):
    names = dict()
    descriptions = dict()
    test = []
    entities = []
    with open(entities_loc, 'r') as csvfile:
        csvreader = csv.reader(csvfile, delimiter="\t")

        for row in csvreader:
            qid = row[0]
            name = row[1]
Example #24
efo = {}
efo_terms = []
efo_term2id = {}
with open('ontologies/efo.obo') as efo_file:
    efo_file = efo_file.read()
line_iterator = iter(efo_file.splitlines())
line = next(line_iterator, None)
while line is not None:
    if line == '[Term]':
        curr_id = next(line_iterator)[4:]
        if curr_id.startswith('EFO'):
            curr_name = next(line_iterator)[6:]
            efo_terms.append(curr_name)
            efo[curr_id] = curr_name
            efo_term2id[curr_name] = curr_id
    line = next(line_iterator, None)

cl = Ontology('ontologies/cl-basic.obo')
cl_term2id, cl_terms = get_term2id(cl)
cl_dict = {
    'ontology': cl,
    'terms': cl_terms,
    'terms_lower': [t.lower() for t in cl_terms],
    'term2id': cl_term2id,
    'column': 'cell_type'
}
uberon = Ontology('ontologies/uberon-basic.obo')
uberon_term2id, uberon_terms = get_term2id(uberon)
biomaterial2onto = {
    'primary cell': cl_dict,
    'primary cell culture': cl_dict,
    'primary tissue': {
        'ontology': uberon,
Example #25
def to_atomese(owlfile):
    onto = Ontology(owlfile)
    with open(owlfile.split('.')[0] + '.scm', 'w') as output:
        import_terms(onto, output)
        output.write(import_meta(onto, owlfile.split('.')[0]))
Example #26
def main():
    mondo = Ontology('mondo.obo')
    dic = extract_omim(mondo)
    with open('omim_index.json', 'w') as fp:
        json.dump(dic, fp, indent=2)
Example #27
def full_parse(in_dir,
               out_dir,
               study_identifier,
               usermeta=None,
               split=True,
               merge=False,
               verbose=False,
               multip=False):
    """ Parses every study from *in_dir* and then creates ISA files.

    A new folder is created in the out directory bearing the name of
    the study identifier.

    :param str in_dir:           path to directory containing studies
    :param str out_dir:          path to out directory
    :param str study_identifier: name of the study (directory to create)
    """
    dirname = os.path.dirname(os.path.realpath(__file__))
    if not any(x in sys.argv for x in ('-h', '--help', '--version')):
        ms = Ontology(os.path.join(dirname, "psi-ms.obo"), False)
        ims = Ontology(os.path.join(dirname, "imagingMS.obo"), False)
        ims.terms.update(ms.terms)
    else:
        ms, ims = None, None
    # ims.merge(ms)  # would fail on None; merging is handled by the terms.update() call above

    PARSERS = {'mzML': mzml.mzMLmeta, 'imzML': mzml.imzMLmeta}

    ONTOLOGIES = {'mzML': ms, 'imzML': ims}

    # get mzML file in the example_files folder
    if os.path.isfile(in_dir) and tarfile.is_tarfile(in_dir):
        compr = True
        mzml_files = compr_extract(in_dir, "tar")
    elif os.path.isfile(in_dir) and zipfile.is_zipfile(in_dir):
        compr = True
        mzml_files = compr_extract(in_dir, "zip")
    else:
        compr = False
        mzml_path = os.path.join(in_dir, "*mzML")

        if verbose:
            print(mzml_path)

        mzml_files = [mzML for mzML in glob.glob(mzml_path)]
        #mzml_files.sort()

    # if multip:
    #     pool = Pool(multip)

    manager = Manager()
    metalist = manager.list()

    if mzml_files:
        # store the first mzml_files extension
        if compr:
            ext1 = mzml_files[0].name.split(os.path.extsep)[-1]
        else:
            ext1 = mzml_files[0].split(os.path.extsep)[-1]

        if multip:
            jobs = []

            for i in mzml_files:
                p = Process(target=_multiparse, args=(i, metalist))
                jobs.append(p)
                p.start()

            for proc in jobs:
                proc.join()

        # get meta information for all files
        elif not verbose:
            pbar = pb.ProgressBar(widgets=[
                'Parsing {:8}: '.format(study_identifier),
                pb.FormatLabel('%(value)4d'), '/',
                '%4d' % len(mzml_files),
                pb.Bar(marker=MARKER, left=" |", right="| "),
                pb.ETA()
            ])

            for i in pbar(mzml_files):

                if compr:
                    ext = i.name.split(os.path.extsep)[-1]
                else:
                    ext = i.split(os.path.extsep)[-1]
                parser = PARSERS[ext]
                ont = ONTOLOGIES[ext]

                metalist.append(parser(i, ont).meta)

        else:
            for i in mzml_files:
                print("Parsing file: {}".format(i))

                if compr:
                    ext = i.name.split(os.path.extsep)[-1]
                else:
                    ext = i.split(os.path.extsep)[-1]

                parser = PARSERS[ext]
                ont = ONTOLOGIES[ext]

                metalist.append(parser(i, ont).meta)

        # update isa-tab file

        if merge and ext1 == 'imzML':
            if verbose:
                print('Attempting to merge profile and centroid scans')
            metalist = merge_spectra(metalist)

        if metalist:
            if verbose:
                print("Parsing mzML meta information into ISA-Tab structure")
            isa_tab_create = isa.ISA_Tab(out_dir, study_identifier, usermeta
                                         or {}).write(metalist, ext1, split)

    else:
        warnings.warn("No files were found in {}.".format(in_dir), UserWarning)
Example #28
from pronto import Ontology

cl = Ontology("http://purl.obolibrary.org/obo/cl.obo")
#for ARGs (ARO for antibiotic resistance ontology--aro.owl from https://card.mcmaster.ca/download)

aro = Ontology.from_obo_library("aro.owl")
#exploring ontology
cf = aro['confers_resistance_to_antibiotic']
t = aro['ARO:1000001']
list(t.objects(cf))
list(t.superclasses())
list(t.subclasses())

#change format to obo
with open('aro.obo', 'wb') as f:
    aro.dump(f, format='obo')

#find terms
aro = Ontology("aro.obo")
for term in aro.terms():
    if term.is_leaf():
        print(term.id)

#load resfinder sequences for matching
import pandas as pd
resfinder_seq = pd.read_csv("resfinder.csv", sep=" ", header=None)
aro2seq = {}
for a in set(resfinder_seq['#Aminoglycoside']):
    if a not in aro:
        continue
    t = aro[a]
Example #29
def build_vocabularies(dirname='datapackage'):
    edam_onto = Ontology('inputdata/edam.obo')
    # Hacked up Uberon so it will load
    uberon_onto = Ontology('inputdata/human-view.obo')
    obi_onto = Ontology('inputdata/obi.obo')

    fieldnames = ['id', 'name', 'description', 'synonyms']

    with open(dirname + '/data_type.tsv', 'w') as data_type_file:
        data_type_writer = csv.DictWriter(data_type_file,
                                          fieldnames=fieldnames,
                                          delimiter='\t')
        data_type_writer.writeheader()
        for v in set(i for i in edam_types['data_types'].values()
                     if i is not None):
            dt_id = 'http://edamontology.org/' + v.replace(':', '_')
            dt_term = edam_onto.get(dt_id)
            dt_name = dt_term.name
            dt_def = dt_term.definition
            syns = ''
            for s in dt_term.synonyms:
                syns += s.description + '|'
            if not syns:
                syns = None
            else:
                syns = syns[:-1]

            data_type = {
                'id': v,
                'name': dt_name,
                'description': dt_def,
                # Synonyms in EDAM don't provide references
                #                 'synonyms': syns}
                'synonyms': None
            }
            data_type_writer.writerow(data_type)

    with open(dirname + '/file_format.tsv', 'w') as file_format_file:
        file_format_writer = csv.DictWriter(file_format_file,
                                            fieldnames=fieldnames,
                                            delimiter='\t')
        file_format_writer.writeheader()
        for v in set(i for i in edam_types['file_formats'].values()
                     if i is not None):
            ff_id = 'http://edamontology.org/' + v.replace(':', '_')
            ff_term = edam_onto.get(ff_id)
            ff_name = ff_term.name
            ff_def = ff_term.definition
            syns = ''
            for s in ff_term.synonyms:
                syns += s.description + '|'
            if not syns:
                syns = None
            else:
                syns = syns[:-1]
            file_format = {
                'id': v,
                'name': ff_name,
                'description': ff_def,
                # Synonyms in EDAM don't provide references
                #                 'synonyms': syns}
                'synonyms': None
            }
            file_format_writer.writerow(file_format)

    with open(dirname + '/assay_type.tsv', 'w') as assay_type_file:
        assay_type_writer = csv.DictWriter(assay_type_file,
                                           fieldnames=fieldnames,
                                           delimiter='\t')
        assay_type_writer.writeheader()
        for at_id in set(i for i in assay_types.values() if i is not None):
            at_term = obi_onto.get(at_id)
            at_name = at_term.name
            at_def = at_term.definition
            syns = ''
            for s in at_term.synonyms:
                syns += s.description + '|'
            if not syns:
                syns = None
            else:
                syns = syns[:-1]
            assay_type = {
                'id': at_id,
                'name': at_name,
                'description': at_def,
                # Synonyms semi-broken
                #                 'synonyms': syns}
                'synonyms': None
            }
            assay_type_writer.writerow(assay_type)

    with open(dirname + '/anatomy.tsv', 'w') as anatomy_file:
        anatomy_writer = csv.DictWriter(anatomy_file,
                                        fieldnames=fieldnames,
                                        delimiter='\t')
        anatomy_writer.writeheader()
        for an_id in set(i for i in anatomy_dict.values() if i is not None):
            an_term = uberon_onto.get(an_id)
            an_name = an_term.name
            an_def = an_term.definition
            syns = ''
            for s in an_term.synonyms:
                syns += s.description + '|'
            if not syns:
                syns = None
            else:
                syns = syns[:-1]
            anatomy = {
                'id': an_id,
                'name': an_name,
                'description': an_def,
                # Synonyms semi-broken
                #                 'synonyms': syns}
                'synonyms': None
            }
            anatomy_writer.writerow(anatomy)
Example #30
import sqlite3
from pronto import Ontology
from Bio.SeqUtils import CheckSum
from Bio import SeqIO

faa_path = snakemake.input["faa_file"]
goa_path = snakemake.input["goa"]
go = Ontology(snakemake.input["go_obo"])
go_annotations = open(snakemake.output["go_annotations"], 'w')
uniparcdb = snakemake.input["uniparcdb"]

conn = sqlite3.connect(goa_path)
cursor = conn.cursor()

sqlatt = f'attach database "{uniparcdb}" as uniparc;'
cursor.execute(sqlatt,)

# 1. retrieve uniprot accession from exact match (hash)
# 2. retrieve GO annotations

for record in SeqIO.parse(faa_path, "fasta"):
    checksum = CheckSum.seguid(record.seq)

    sqlq = 'select * from uniparc.uniparc_accession where sequence_hash="%s"' % checksum
    uniparc_id = cursor.execute(sqlq,).fetchall()[0][0]
    print("uid", uniparc_id)