Example no. 1
    def __init__(self):
        Source.__init__(self, 'mpd')
        # @N, not sure if this step is required
        self.namespaces.update(curie_map.get())
        self.stdevthreshold = 2

        self.nobnodes = True  # FIXME

        # update the dataset object with details about this resource
        # @N: Note that there is no license as far as I can tell
        self.dataset = Dataset(
            'mpd', 'MPD', 'http://phenome.jax.org', None, None)

        # TODO add a citation for mpd dataset as a whole
        self.dataset.set_citation('PMID:15619963')

        self.assayhash = {}
        self.idlabel_hash = {}
        # to store the mean/zscore of each measure by strain+sex
        self.score_means_by_measure = {}
        # to store the mean value for each measure by strain+sex
        self.strain_scores_by_measure = {}

        self.geno = Genotype(self.graph)
        self.gu = GraphUtils(curie_map.get())

        return
Example no. 2
    def load_bindings(self):
        self.load_core_bindings()
        for g in [self.graph, self.testgraph]:

            for k in self.namespaces.keys():
                v = self.namespaces[k]
                g.bind(k, Namespace(v))

            for k in curie_map.get().keys():
                v = curie_map.get()[k]
                g.bind(k, Namespace(v))
        return
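A minimal, self-contained sketch of what load_bindings() does above: each prefix in a prefix map becomes an RDF @prefix binding on an rdflib graph. The prefix/IRI pairs below are illustrative stand-ins, not the real contents of curie_map.get().

from rdflib import Graph, Namespace

# illustrative prefix map; the real one comes from curie_map.get()
prefix_map = {
    'OBO': 'http://purl.obolibrary.org/obo/',
    'PMID': 'http://www.ncbi.nlm.nih.gov/pubmed/',
}

g = Graph()
for prefix, iri in prefix_map.items():
    g.bind(prefix, Namespace(iri))

# the serialization now starts with the @prefix directives
# (serialize() returns bytes in rdflib < 6 and str in rdflib >= 6)
print(g.serialize(format='turtle'))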
Example no. 3
    def __init__(self):
        Source.__init__(self, 'ctd')
        self.dataset = Dataset(
            'ctd', 'CTD', 'http://ctdbase.org', None,
            'http://ctdbase.org/about/legal.jsp')

        if 'test_ids' not in config.get_config() \
                or 'gene' not in config.get_config()['test_ids']:
            logger.warning("not configured with gene test ids.")
            self.test_geneids = []
        else:
            self.test_geneids = config.get_config()['test_ids']['gene']

        if 'test_ids' not in config.get_config() \
                or 'disease' not in config.get_config()['test_ids']:
            logger.warning("not configured with disease test ids.")
            self.test_diseaseids = []
        else:
            self.test_diseaseids = config.get_config()['test_ids']['disease']

        self.gu = GraphUtils(curie_map.get())
        self.g = self.graph
        self.geno = Genotype(self.g)

        return
Example no. 4
    def _parse_curated_chem_disease(self, limit):
        line_counter = 0
        file_path = '/'.join((self.rawdir, self.static_files['publications']['file']))
        gu = GraphUtils(curie_map.get())
        with open(file_path, 'r') as tsvfile:
            reader = csv.reader(tsvfile, delimiter="\t")
            for row in reader:
                # catch comment lines
                if re.match('^#', ' '.join(row)):
                    continue
                line_counter += 1
                self._check_list_len(row, 10)
                (pub_id, disease_label, disease_id, disease_cat, evidence,
                 chem_label, chem_id, cas_rn, gene_symbol, gene_acc) = row

                rel_id = self._get_relationship_id(evidence)
                chem_id = 'MESH:'+chem_id
                gu.addClassToGraph(self.g, chem_id, chem_label)
                gu.addClassToGraph(self.g, disease_id, None)
                if pub_id != '':
                    pub_id = 'PMID:'+pub_id
                    r = Reference(pub_id, Reference.ref_types['journal_article'])
                    r.addRefToGraph(self.g)
                else:
                    pub_id = None
                # chem_id and pub_id already carry their MESH:/PMID: prefixes here
                self._make_association(
                    chem_id, disease_id, rel_id,
                    [pub_id] if pub_id is not None else [])

                if not self.testMode and limit is not None and line_counter >= limit:
                    break
        return
Example no. 5
    def _process_collection(self, collection_id, label, page):
        """
        This function will process the data supplied internally
        about the repository from Coriell.

        Triples:
            Repository a ERO:collection
            rdf:label Literal(label)
            foaf:page Literal(page)

        :param collection_id:
        :param label:
        :param page:
        :return:
        """
        # #############    BUILD THE CELL LINE REPOSITORY    #############
        for g in [self.graph, self.testgraph]:
            # FIXME: How to devise a label for each repository?
            gu = GraphUtils(curie_map.get())
            repo_id = 'CoriellCollection:'+collection_id
            repo_label = label
            repo_page = page

            gu.addIndividualToGraph(
                g, repo_id, repo_label, self.terms['collection'])
            gu.addPage(g, repo_id, repo_page)

        return
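The triples listed in the docstring above can be sketched with plain rdflib; every IRI and value below is a placeholder rather than the term the ingest actually uses for self.terms['collection'].

from rdflib import Graph, Literal, Namespace, RDF, RDFS, URIRef

FOAF = Namespace('http://xmlns.com/foaf/0.1/')

g = Graph()
repo = URIRef('http://example.org/CoriellCollection/EXAMPLE')      # placeholder repository id
collection_class = URIRef('http://example.org/terms/collection')   # stand-in for self.terms['collection']

g.add((repo, RDF.type, collection_class))                 # Repository a <collection>
g.add((repo, RDFS.label, Literal('Example Repository')))  # rdf:label
g.add((repo, FOAF.page, Literal('http://example.org/repository-page')))  # foaf:page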
Example no. 6
    def declareAsOntology(self, graph):
        """
        The file we output needs to be declared as an ontology,
        including its version information.
        Further information will be augmented in the dataset object.
        :param graph:
        :return:
        """
        # <http://data.monarchinitiative.org/ttl/biogrid.ttl> a owl:Ontology ;
        # owl:versionInfo <http://archive.monarchinitiative.org/ttl/biogrid-YYYY-MM-DD.ttl>

        gu = GraphUtils(curie_map.get())

        ontology_file_id = 'MonarchData:'+self.name+".ttl"
        gu.addOntologyDeclaration(graph, ontology_file_id)

        # add timestamp as version info

        t = datetime.now()
        t_string = t.strftime("%Y-%m-%d-%H-%M")
        ontology_version = self.name+'-'+t_string
        archive_url = 'MonarchArchive:'+ontology_version+'.ttl'
        gu.addOWLVersionIRI(graph, ontology_file_id, archive_url)
        gu.addOWLVersionInfo(graph, ontology_file_id, ontology_version)

        # TODO make sure this is synced with the Dataset class

        return
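Sketching the ontology declaration described in the comment above with plain rdflib; the full IRIs stand in for the MonarchData:/MonarchArchive: curies that the curie map would resolve.

from datetime import datetime
from rdflib import Graph, Literal, URIRef
from rdflib.namespace import OWL, RDF

name = 'biogrid'
ontology_iri = URIRef('http://data.monarchinitiative.org/ttl/' + name + '.ttl')
version = name + '-' + datetime.now().strftime('%Y-%m-%d-%H-%M')
archive_iri = URIRef('http://archive.monarchinitiative.org/ttl/' + version + '.ttl')

g = Graph()
g.add((ontology_iri, RDF.type, OWL.Ontology))
g.add((ontology_iri, OWL.versionIRI, archive_iri))
g.add((ontology_iri, OWL.versionInfo, Literal(version)))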
Example no. 7
    def _map_rel_id(orphanet_rel_id):
        # TODO check if these ids are stable for mapping
        rel_id = None
        gu = GraphUtils(curie_map.get())
        id_map = {
            "17949": gu.object_properties["has_phenotype"],  # Disease-causing germline mutation(s) in
            "17955": gu.object_properties["has_phenotype"],  # Disease-causing somatic mutation(s) in
            "17961": gu.object_properties["contributes_to"],  # Major susceptibility factor in
            "17967": gu.object_properties["contributes_to"],  # Modifying germline mutation in
            "17973": gu.object_properties["contributes_to"],  # Modifying somatic mutation in
            "17979": gu.object_properties["contributes_to"],  # Part of a fusion gene in
            "17985": gu.object_properties["contributes_to"],  # Role in the phenotype of
            "18273": None,  # Candidate gene tested in  FIXME?
            "25972": gu.object_properties[
                "has_phenotype"
            ],  # Disease-causing germline mutation(s) (loss of function) in
            "25979": gu.object_properties[
                "has_phenotype"
            ],  # Disease-causing germline mutation(s) (gain of function) in
        }

        if orphanet_rel_id in id_map:
            rel_id = id_map[orphanet_rel_id]
        else:
            logger.error("Disease-gene association type (%s) not mapped.", orphanet_rel_id)

        return rel_id
Example no. 8
    def _getNode(self, curie):
        """
        This is a wrapper for creating a URIRef or BNode object
        with a given curie or iri as a string.

        If an id starts with an underscore, it assigns it to a BNode; otherwise
        it creates a standard URIRef.
        Alternatively, if self.are_bnodes_skized is True,
        it will skolemize the blank node.

        :param curie: str identifier formatted as curie or iri
        :return: node: RDFLib URIRef or BNode object
        """
        node = None
        if re.match(r'^_', curie):
            if self.are_bnodes_skized is True:
                node = self.skolemizeBlankNode(curie)
            else:  # replace the leading underscore to make it cleaner
                node = BNode(re.sub(r'^_:|^_', '', curie, 1))
        # Check if curie actually an IRI
        elif re.match(r'^http|^ftp', curie):
            node = URIRef(curie)
        else:
            iri = RDFGraph.curie_util.get_uri(curie)
            if iri is not None:
                node = URIRef(RDFGraph.curie_util.get_uri(curie))
                # Bind prefix map to graph
                prefix = curie.split(':')[0]
                if prefix not in dict(self.namespace_manager.namespaces()):
                    mapped_iri = curie_map.get()[prefix]
                    self.bind(prefix, Namespace(mapped_iri))
            else:
                logger.error("couldn't make URI for %s", curie)
        return node
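A stripped-down sketch of the three branches in _getNode(), with a toy prefix map in place of curie_map.get() and the skolemization branch left out.

import re
from rdflib import BNode, URIRef

toy_map = {'HP': 'http://purl.obolibrary.org/obo/HP_'}  # illustrative, not the real map

def to_node(curie):
    if re.match(r'^_', curie):              # blank node, e.g. '_:study'
        return BNode(re.sub(r'^_:|^_', '', curie, 1))
    if re.match(r'^http|^ftp', curie):      # already a full IRI
        return URIRef(curie)
    prefix, local = curie.split(':', 1)     # a curie, e.g. 'HP:0000118'
    return URIRef(toy_map[prefix] + local)

print(to_node('HP:0000118'))   # http://purl.obolibrary.org/obo/HP_0000118
print(to_node('_:study'))      # a BNode with id 'study'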
Example no. 9
    def _map_eom_terms(self, raw, limit=None):
        """
        This processes the table of HP ID mappings from the local tsv file.
        Triples:
            <eom id> owl:equivalentClass <hp id>
        :param raw:
        :param limit:
        :return:
        """

        gu = GraphUtils(curie_map.get())

        line_counter = 0
        with open(raw, 'r') as f1:
            f1.readline()  # read the header row; skip
            for line in f1:
                line_counter += 1

                (morphology_term_id, morphology_term_label, hp_id, hp_label, notes) = line.split('\t')

                # Sub out the underscores for colons.
                hp_id = re.sub('_', ':', hp_id)
                if re.match(".*HP:.*", hp_id):
                    # add the HP term as a class
                    gu.addClassToGraph(self.graph, hp_id, None)
                    # Add the HP ID as an equivalent class
                    gu.addEquivalentClass(self.graph, morphology_term_id, hp_id)
                else:
                    logger.warning('No matching HP term for %s', morphology_term_label)

                if limit is not None and line_counter > limit:
                    break

        return
Example no. 10
    def _map_rel_id(orphanet_rel_id):
        # TODO check if these ids are stable for mapping
        rel_id = None
        gu = GraphUtils(curie_map.get())
        id_map = {
            # Disease-causing germline mutation(s) in
            '17949': gu.object_properties['has_phenotype'],
            # Disease-causing somatic mutation(s) in
            '17955': gu.object_properties['has_phenotype'],
            # Major susceptibility factor in
            '17961': gu.object_properties['contributes_to'],
            # Modifying germline mutation in
            '17967': gu.object_properties['contributes_to'],
            # Modifying somatic mutation in
            '17973': gu.object_properties['contributes_to'],
            # Part of a fusion gene in
            '17979': gu.object_properties['contributes_to'],
            # Role in the phenotype of
            '17985': gu.object_properties['contributes_to'],
            '18273': None,  # Candidate gene tested in  FIXME?
            # Disease-causing germline mutation(s) (loss of function) in
            '25979': gu.object_properties['has_phenotype'],
            # Disease-causing germline mutation(s) (gain of function) in
            '25972': gu.object_properties['has_phenotype'],
        }

        if orphanet_rel_id in id_map:
            rel_id = id_map[orphanet_rel_id]
        else:
            logger.error(
                'Disease-gene association type (%s) not mapped.',
                orphanet_rel_id)

        return rel_id
Example no. 11
    def setUp(self):
        self.assoc_curie = 'MONARCH:test_association'
        self.eco_id = 'ECO:0000015'

        self.test_set_1 = ('MGI:1920145', 'Setd5', 'WTSI', 'MEFW', 'male',
                           'heterozygote', 'MGI:4432631', 'Setd5<tm1a(EUCOMM)Wtsi>',
                           'targeted mutation 1a, Wellcome Trust Sanger Institute',
                           'MGI:2159965', 'C57BL/6N', 'MGP',
                           'Wellcome Trust Sanger Institute Mouse Genetics Project',
                           'MGP Select Pipeline', 'MGP_001', 'MGP_XRY_001', 'X-ray',
                           'IMPC_XRY_008_001', 'Number of ribs right', 'MP:0005390',
                           'skeleton phenotype', 'MP:0000480', 'increased rib number',
                           '1.637023E-010', '', '8.885439E-007',
                           'Wilcoxon rank sum test with continuity correction', 'IMPC')

        # Generate test curies, these are otherwise generated
        # within _add_evidence() and _add_study_provenance()
        self.study_curie = "_:study"
        self.evidence_curie = "_:evidence"

        # IRIs for testing sparql output
        curie_dict = curie_map.get()
        curie_util = CurieUtil(curie_dict)
        self.assoc_iri = URIRef(curie_util.get_uri(self.assoc_curie))

        return
Example no. 12
    def parse(self, limit=None):
        if limit is not None:
            logger.info("Only parsing first %s rows of each file", limit)

        if self.version_num is None:
            import os
            logger.info("Figuring out version num for files")
            # probe the raw directory for the WSnumber on
            # the "letter.WS###" file.
            # this is the only one that we keep the version number on
            files = os.listdir(self.rawdir)
            letter_file = next(f for f in files if re.match(r'letter', f))
            vernum = re.search(r'(WS\d+)', letter_file)
            self.update_wsnum_in_files(vernum.group(1))

        logger.info("Parsing files...")

        if self.testOnly:
            self.testMode = True

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        self.nobnodes = True  # FIXME
        # to hold any label for a given id
        self.id_label_map = {}
        # to hold the mappings between genotype and background
        self.genotype_backgrounds = {}
        self.extrinsic_id_to_enviro_id_hash = {}
        # to hold the genes variant due to a seq alt
        self.variant_loci_genes = {}
        # to hold the parts of an environment
        self.environment_hash = {}
        self.wildtype_genotypes = []
        # stores the rnai_reagent to gene targets
        self.rnai_gene_map = {}

        self.process_gene_ids(limit)
        # self.process_gene_desc(limit)   # TEC: input file is missing as of 2016-Mar-03
        self.process_allele_phenotype(limit)
        self.process_rnai_phenotypes(limit)
        self.process_pub_xrefs(limit)
        self.process_feature_loc(limit)
        self.process_disease_association(limit)
        # TODO add this when when complete
        # self.process_gene_interaction(limit)

        logger.info("Finished parsing.")

        self.load_bindings()
        gu = GraphUtils(curie_map.get())
        gu.loadAllProperties(g)
        gu.loadObjectProperties(g, Genotype.object_properties)

        logger.info("Found %d nodes in graph", len(self.graph))
        logger.info("Found %d nodes in testgraph", len(self.testgraph))

        return
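The version probe at the top of parse() can be tried in isolation; the directory listing below is made up, but the pattern follows the 'letter.WS###' convention described in the comments.

import re

files = ['geneIDs.WS255.gz', 'letter.WS255', 'rnai.WS255.gz']  # illustrative listing
letter_file = next(f for f in files if re.match(r'letter', f))
vernum = re.search(r'(WS\d+)', letter_file)
print(vernum.group(1))  # WS255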
Example no. 13
    def _get_phenotypicseries_parents(entry, g):
        """
        Extract the phenotypic series parent relationship out of the entry
        :param entry:
        :param g:
        :return:
        """
        gu = GraphUtils(curie_map.get())
        omimid = 'OMIM:'+str(entry['mimNumber'])
        # the phenotypic series mappings
        serieslist = []
        if 'phenotypicSeriesExists' in entry:
            if entry['phenotypicSeriesExists'] is True:
                if 'phenotypeMapList' in entry:
                    phenolist = entry['phenotypeMapList']
                    for p in phenolist:
                        serieslist.append(p['phenotypeMap']['phenotypicSeriesNumber'])
                if 'geneMap' in entry and 'phenotypeMapList' in entry['geneMap']:
                    phenolist = entry['geneMap']['phenotypeMapList']
                    for p in phenolist:
                        if 'phenotypicSeriesNumber' in p['phenotypeMap']:
                            serieslist.append(p['phenotypeMap']['phenotypicSeriesNumber'])
        # add this entry as a subclass of the series entry
        for ser in serieslist:
            series_id = 'OMIM:'+ser
            gu.addClassToGraph(g, series_id, None)
            gu.addSubclass(g, series_id, omimid)

        return
Example no. 14
    def _process_phenotypicseries(self, limit):
        """
        Creates classes from the OMIM phenotypic series list.
        These are grouping classes onto which the more granular OMIM diseases are attached.
        :param limit:
        :return:
        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        logger.info("getting phenotypic series titles")
        gu = GraphUtils(curie_map.get())
        line_counter = 0
        start = False
        with open('/'.join((self.rawdir, self.files['phenotypicSeries']['file']))) as f:
            for line in f:
                # there are several header lines in the file, so skip until the data starts:
                if not start:
                    if re.match('Phenotypic Series', line):
                        start = True
                    continue
                if re.match(r'\w*$', line):
                    # skip blank lines
                    continue
                line = line.strip()
                line_counter += 1
                (ps_label, ps_num) = line.split('\t')
                omim_id = 'OMIM:'+ps_num
                gu.addClassToGraph(g, omim_id, ps_label)

        return
Example no. 15
    def _get_gene_history(self, limit):
        """
        Loops through the gene_history file and adds the old gene ids as deprecated classes,
        where the new gene id is the replacement for it.
        The old gene symbol is added as a synonym to the gene.
        :param limit:
        :return:
        """
        gu = GraphUtils(curie_map.get())
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        logger.info("Processing Gene records")
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files['gene_history']['file']))
        logger.info("FILE: %s", myfile)
        with gzip.open(myfile, 'rb') as f:
            for line in f:
                # skip comments
                line = line.decode().strip()
                if re.match('^#', line):
                    continue
                (tax_num, gene_num, discontinued_num, discontinued_symbol, discontinued_date) = line.split('\t')

                ##### set filter=None in init if you don't want to have a filter
                #if self.filter is not None:
                #    if ((self.filter == 'taxids' and (int(tax_num) not in self.tax_ids))
                #            or (self.filter == 'geneids' and (int(gene_num) not in self.gene_ids))):
                #        continue
                ##### end filter

                if gene_num == '-' or discontinued_num == '-':
                    continue

                if self.testMode and int(gene_num) not in self.gene_ids:
                    continue

                if int(tax_num) not in self.tax_ids:
                    continue

                line_counter += 1
                gene_id = ':'.join(('NCBIGene', gene_num))
                discontinued_gene_id = ':'.join(('NCBIGene', discontinued_num))
                tax_id = ':'.join(('NCBITaxon', tax_num))

                # add the two genes
                gu.addClassToGraph(g, gene_id, None)
                gu.addClassToGraph(g, discontinued_gene_id, discontinued_symbol)

                # add the new gene id to replace the old gene id
                gu.addDeprecatedClass(g, discontinued_gene_id, [gene_id])

                # also add the old symbol as a synonym of the new gene
                gu.addSynonym(g, gene_id, discontinued_symbol)

                if (not self.testMode) and (limit is not None and line_counter > limit):
                    break

        return
Example no. 16
    def addRefToGraph(self, g):

        gu = GraphUtils(curie_map.get())

        n = self.short_citation
        if n is None:
            n = self.title

        if self.ref_url is not None:
            ref_uri = URIRef(self.ref_url)
            g.add((ref_uri, DC['title'], Literal(self.title)))
            g.add((ref_uri, RDF['type'], gu.getNode(self.ref_type)))
            g.add((ref_uri, RDFS['label'], Literal(n)))
        elif self.ref_id is not None:
            gu.addIndividualToGraph(g, self.ref_id, n, self.ref_type)
            if self.title is not None:
                gu.addTitle(g, self.ref_id, self.title)
        else:
            # should never be true
            logger.error("You are missing an identifier for a reference.")

        # TODO what is the property here to add the date?
        # if self.year is not None:
        #    gu.addTriple()

        # if self.author_list is not None:
        #    for a in self.author_list:
        #        gu.addTriple(
        #           g, self.ref_id, self.props['has_author'], a, True)
        return
Example no. 17
    def _get_process_allelic_variants(self, entry, g):
        gu = GraphUtils(curie_map.get())
        geno = Genotype(g)
        du = DipperUtil()
        if entry is not None:
            publist = {}  # to hold the entry-specific publication mentions for the allelic variants
            entry_num = entry['mimNumber']

            # process the ref list just to get the pmids
            ref_to_pmid = self._get_pubs(entry, g)

            if 'allelicVariantList' in entry:
                allelicVariantList = entry['allelicVariantList']
                for al in allelicVariantList:
                    al_num = al['allelicVariant']['number']
                    al_id = 'OMIM:'+str(entry_num)+'.'+str(al_num).zfill(4)
                    al_label = None
                    al_description = None
                    if al['allelicVariant']['status'] == 'live':
                        publist[al_id] = set()
                        if 'mutations' in al['allelicVariant']:
                            al_label = al['allelicVariant']['mutations']
                        if 'text' in al['allelicVariant']:
                            al_description = al['allelicVariant']['text']
                            m = re.findall(r'\{(\d+):', al_description)
                            publist[al_id] = set(m)
                        geno.addAllele(al_id, al_label, geno.genoparts['variant_locus'], al_description)
                        geno.addAlleleOfGene(al_id, 'OMIM:'+str(entry_num),
                                             geno.object_properties['is_sequence_variant_instance_of'])
                        for r in publist[al_id]:
                            pmid = ref_to_pmid[int(r)]
                            gu.addTriple(g, pmid, gu.object_properties['is_about'], al_id)
                        # look up the pubmed id in the list of references
                        if 'dbSnps' in al['allelicVariant']:
                            dbsnp_ids = re.split(',', al['allelicVariant']['dbSnps'])
                            for dnum in dbsnp_ids:
                                did = 'dbSNP:'+dnum.strip()
                                gu.addIndividualToGraph(g, did, None)
                                gu.addEquivalentClass(g, al_id, did)
                        if 'clinvarAccessions' in al['allelicVariant']:
                            # clinvarAccessions are triple-semicolon delimited, each like RCV000020059;;1
                            rcv_ids = re.split(';;;', al['allelicVariant']['clinvarAccessions'])
                            rcv_ids = [(re.match(r'(RCV\d+);;', r)).group(1) for r in rcv_ids]
                            for rnum in rcv_ids:
                                rid = 'ClinVar:'+rnum
                                gu.addXref(g, al_id, rid)
                        gu.addPage(g, al_id, "http://omim.org/entry/"+str(entry_num)+"#"+str(al_num).zfill(4))
                    elif re.search('moved', al['allelicVariant']['status']):
                        # for both 'moved' and 'removed'
                        moved_ids = None
                        if 'movedTo' in al['allelicVariant']:
                            moved_id = 'OMIM:'+al['allelicVariant']['movedTo']
                            moved_ids = [moved_id]
                        gu.addDeprecatedIndividual(g, al_id, moved_ids)
                    else:
                        logger.error('Uncaught allelic variant status %s', al['allelicVariant']['status'])
                # end loop allelicVariantList

        return
Example no. 18
    def process_disease_association(self, limit):

        raw = '/'.join((self.rawdir, self.files['disease_assoc']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        gu = GraphUtils(curie_map.get())

        logger.info("Processing disease models")
        geno = Genotype(g, self.nobnodes)
        line_counter = 0
        worm_taxon = 'NCBITaxon:6239'
        with open(raw, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                if re.match(r'!', ''.join(row)):  # header
                    continue
                line_counter += 1
                (db, gene_num, gene_symbol, is_not, disease_id, ref,
                 eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
                 gene_class, taxon, date, assigned_by, blank, blank2) = row

                if self.testMode and gene_num not in self.test_ids['gene']:
                    continue

                # TODO add NOT phenotypes
                if is_not == 'NOT':
                    continue

                # WB	WBGene00000001	aap-1		DOID:2583	PMID:19029536	IEA	ENSEMBL:ENSG00000145675|OMIM:615214	D		Y110A7A.10	gene	taxon:6239	20150612	WB
                gene_id = 'WormBase:'+gene_num

                # make a variant of the gene
                vl = '_'+'-'.join((gene_num, 'unspecified'))
                if self.nobnodes:
                    vl = ':'+vl
                vl_label = 'some variant of '+gene_symbol
                geno.addAlleleOfGene(vl, gene_id)
                animal_id = geno.make_experimental_model_with_genotype(
                    g, vl, vl_label, worm_taxon, 'worm')

                assoc = G2PAssoc(
                    self.name, animal_id,
                    disease_id, gu.object_properties['model_of'])
                ref = re.sub(r'WB_REF:', 'WormBase:', ref)
                if ref != '':
                    assoc.add_source(ref)
                eco_id = None
                if eco_symbol == 'IEA':
                    eco_id = 'ECO:0000501'  # IEA is this now
                if eco_id is not None:
                    assoc.add_evidence(eco_id)

                assoc.add_association_to_graph(g)

        return
Example no. 19
    def __init__(self, are_bnodes_skized=True):
        super().__init__()
        self.are_bnodes_skized = are_bnodes_skized

        # Can be removed when this is resolved
        # https://github.com/RDFLib/rdflib/issues/632
        obo_map = curie_map.get()['OBO']
        self.bind('OBO', Namespace(obo_map))
Example no. 20
    def _process_genes(self, taxid, limit=None):
        gu = GraphUtils(curie_map.get())

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        geno = Genotype(g)

        raw = '/'.join((self.rawdir, self.files[taxid]['file']))
        line_counter = 0
        logger.info("Processing Ensembl genes for tax %s", taxid)
        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t')
            for row in filereader:
                if len(row) < 5:
                    logger.error("Data error for file %s", raw)
                    return
                (ensembl_gene_id, external_gene_name, description,
                 gene_biotype, entrezgene) = row[0:5]

                # in the case of human genes, we also get the hgnc id,
                # which is the last column
                if taxid == '9606':
                    hgnc_id = row[5]
                else:
                    hgnc_id = None

                if self.testMode and entrezgene != '' \
                        and int(entrezgene) not in self.gene_ids:
                    continue

                line_counter += 1
                gene_id = 'ENSEMBL:'+ensembl_gene_id
                if description == '':
                    description = None
                gene_type_id = self._get_gene_type(gene_biotype)
                gene_type_id = None
                gu.addClassToGraph(
                    g, gene_id, external_gene_name, gene_type_id, description)

                if entrezgene != '':
                    gu.addEquivalentClass(g, gene_id, 'NCBIGene:'+entrezgene)
                if hgnc_id is not None and hgnc_id != '':
                    gu.addEquivalentClass(g, gene_id, hgnc_id)
                geno.addTaxon('NCBITaxon:'+taxid, gene_id)

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        gu.loadProperties(g, Feature.object_properties, gu.OBJPROP)
        gu.loadProperties(g, Feature.data_properties, gu.DATAPROP)
        gu.loadProperties(g, Genotype.object_properties, gu.OBJPROP)
        gu.loadAllProperties(g)

        return
Example no. 21
    def __init__(self, graph):

        self.gu = GraphUtils(curie_map.get())

        self.graph = graph

        self.gu.loadProperties(self.graph, self.object_properties, self.gu.OBJPROP)

        return
Example no. 22
    def __init__(self, are_bnodes_skized=True, identifier=None):
        # print("in RDFGraph  with id: ", identifier)
        super().__init__('IOMemory', identifier)
        self.are_bnodes_skized = are_bnodes_skized

        # Can be removed when this is resolved
        # https://github.com/RDFLib/rdflib/issues/632
        obo_map = curie_map.get()['OBO']
        self.bind('OBO', Namespace(obo_map))
Example no. 23
    def __init__(self, definedby):
        self.cu = CurieUtil(curie_map.get())
        self.gu = GraphUtils(curie_map.get())

        # core parts of the association
        self.definedby = definedby
        self.sub = self.obj = self.rel = None
        self.assoc_id = None

        self.description = None
        self.source = []
        self.evidence = []

        self.score = None
        self.score_type = None
        self.score_unit = None

        return
Example no. 24
    def process_gene_desc(self, limit):
        raw = '/'.join((self.rawdir, self.files['gene_desc']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        gu = GraphUtils(curie_map.get())

        logger.info("Processing Gene descriptions")
        line_counter = 0
        # geno = Genotype(g)  # TODO unused
        with gzip.open(raw, 'rb') as csvfile:
            filereader = csv.reader(
                io.TextIOWrapper(csvfile, newline=""), delimiter='\t',
                quotechar='\"')
            for row in filereader:
                if re.match(r'\#', ''.join(row)):
                    continue
                line_counter += 1
                if line_counter == 1:
                    continue
                (gene_num, public_name, molecular_name, concise_description,
                 provisional_description, detailed_description,
                 automated_description, gene_class_description) = row

                if self.testMode and gene_num not in self.test_ids['gene']:
                    continue

                gene_id = 'WormBase:'+gene_num

                if concise_description != 'none available':
                    gu.addDefinition(g, gene_id, concise_description)

                # skip the description if it's identical to the concise description
                descs = {
                    'provisional': provisional_description,
                    'automated': automated_description,
                    'detailed': detailed_description,
                    'gene class': gene_class_description
                }
                for d in descs:
                    text = descs.get(d)
                    if text == concise_description \
                            or re.match(r'none', text) or text == '':
                        pass  # don't use it
                    else:
                        text = ' '.join((text, '['+d+']'))
                        descs[d] = text
                        gu.addDescription(g, gene_id, text)

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        return
Example no. 25
    def _get_gene2pubmed(self, limit):
        """
        Loops through the gene2pubmed file and adds a simple triple to say
        that a given publication is_about a gene.
        Publications are added as NamedIndividuals.
        :param limit:
        :return:
        """

        gu = GraphUtils(curie_map.get())
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        is_about = gu.getNode(gu.object_properties['is_about'])
        logger.info("Processing Gene records")
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files['gene2pubmed']['file']))
        logger.info("FILE: %s", myfile)
        with gzip.open(myfile, 'rb') as f:
            for line in f:
                # skip comments
                line = line.decode().strip()
                if re.match('^#', line):
                    continue
                (tax_num, gene_num, pubmed_num) = line.split('\t')

                ##### set filter=None in init if you don't want to have a filter
                #if self.filter is not None:
                #    if ((self.filter == 'taxids' and (int(tax_num) not in self.tax_ids))
                #       or (self.filter == 'geneids' and (int(gene_num) not in self.gene_ids))):
                #        continue
                ##### end filter

                if self.testMode and int(gene_num) not in self.gene_ids:
                    continue

                if int(tax_num) not in self.tax_ids:
                    continue

                if gene_num == '-' or pubmed_num == '-':
                    continue

                line_counter += 1
                gene_id = ':'.join(('NCBIGene', gene_num))
                pubmed_id = ':'.join(('PMID', pubmed_num))

                # add the gene, in case it hasn't before
                gu.addClassToGraph(g, gene_id, None)
                # add the publication as a NamedIndividual
                gu.addIndividualToGraph(g, pubmed_id, None, None)  # add type publication
                g.add((gu.getNode(pubmed_id), is_about, gu.getNode(gene_id)))

                if not self.testMode and limit is not None and line_counter > limit:
                    break

        return
Example no. 26
    def __init__(self, id, label, type, description=None):
        self.id = id
        self.label = label
        self.type = type
        self.description = description
        self.gu = GraphUtils(curie_map.get())
        self.start = None
        self.stop = None
        self.nobnodes = True  # TODO remove this before official release
        return
Example no. 27
    def _process_straininfo(self, limit):
        # line_counter = 0  # TODO unused
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        logger.info("Processing measurements ...")
        raw = '/'.join((self.rawdir, self.files['straininfo']['file']))

        tax_id = 'NCBITaxon:10090'

        gu = GraphUtils(curie_map.get())

        with open(raw, 'r') as f:
            reader = csv.reader(f, delimiter=',', quotechar='\"')
            f.readline()  # read the header row; skip
            for row in reader:
                (strain_name, vendor, stocknum, panel, mpd_strainid,
                 straintype, n_proj, n_snp_datasets, mpdshortname, url) = row
                # C57BL/6J,J,000664,,7,IN,225,17,,http://jaxmice.jax.org/strain/000664.html
                # create the strain as an instance of the taxon
                if self.testMode and \
                        'MPD:'+str(mpd_strainid) not in self.test_ids:
                    continue
                strain_id = 'MPD-strain:'+str(mpd_strainid)
                gu.addIndividualToGraph(g, strain_id, strain_name, tax_id)
                if mpdshortname.strip() != '':
                    gu.addSynonym(g, strain_id, mpdshortname.strip())
                self.idlabel_hash[strain_id] = strain_name
                # make it equivalent to the vendor+stock
                if stocknum != '':
                    if vendor == 'J':
                        jax_id = 'JAX:'+stocknum
                        gu.addSameIndividual(g, strain_id, jax_id)
                    elif vendor == 'Rbrc':
                        # RIKEN BioResource Center (RBRC)
                        reiken_id = 'RBRC:'+re.sub(r'RBRC', '', stocknum)
                        gu.addSameIndividual(g, strain_id, reiken_id)
                    else:
                        if url != '':
                            gu.addXref(g, strain_id, url, True)
                        if vendor != '':
                            gu.addXref(
                                g, strain_id, ':'.join((vendor, stocknum)),
                                True)

                # add the panel information
                if panel != '':
                    desc = panel+' [panel]'
                    gu.addDescription(g, strain_id, desc)

                # TODO make the panels as a resource collection

        return
Example no. 28
    def __init__(self, definedby):
        self.cu = CurieUtil(curie_map.get())
        self.gu = GraphUtils(curie_map.get())

        # core parts of the association
        self.definedby = definedby
        self.sub = self.obj = self.rel = None
        self.assoc_id = None

        self.description = None
        self.source = []
        self.evidence = []
        # this is going to be used for the refactored evidence/provenance
        self.provenance = []

        self.score = None
        self.score_type = None
        self.score_unit = None

        return
Example no. 29
    def process_pub_xrefs(self, limit=None):

        raw = '/'.join((self.rawdir, self.files['pub_xrefs']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        gu = GraphUtils(curie_map.get())

        logger.info("Processing publication xrefs")
        line_counter = 0
        with open(raw, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (wb_ref, xref) = row
                # WBPaper00000009 pmid8805<BR>
                # WBPaper00000011 doi10.1139/z78-244<BR>
                # WBPaper00000012 cgc12<BR>

                if self.testMode and wb_ref not in self.test_ids['pub']:
                    continue

                ref_id = 'WormBase:'+wb_ref
                xref_id = r = None
                xref = re.sub(r'<BR>', '', xref)
                xref = xref.strip()
                if re.match(r'pmid', xref):
                    xref_id = 'PMID:'+re.sub(r'pmid\s*', '', xref)
                    r = Reference(
                        xref_id, Reference.ref_types['journal_article'])
                elif re.search(r'[\(\)\<\>\[\]\s]', xref):
                    continue
                elif re.match(r'doi', xref):
                    xref_id = 'DOI:'+re.sub(r'doi', '', xref.strip())
                    r = Reference(xref_id)
                elif re.match(r'cgc', xref):
                    # TODO not sure what to do here with cgc xrefs
                    continue
                else:
                    # logger.debug("Other xrefs like %s", xref)
                    continue

                if xref_id is not None:
                    r.addRefToGraph(g)
                    gu.addSameIndividual(g, ref_id, xref_id)

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        return
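The xref normalization above can be exercised on the sample values quoted in the inline comments; this standalone sketch mirrors only the pmid and doi branches.

import re

for xref in ('pmid8805<BR>', 'doi10.1139/z78-244<BR>', 'cgc12<BR>'):
    xref = re.sub(r'<BR>', '', xref).strip()
    if re.match(r'pmid', xref):
        print('PMID:' + re.sub(r'pmid\s*', '', xref))
    elif re.match(r'doi', xref):
        print('DOI:' + re.sub(r'doi', '', xref))
    else:
        print('skipped', xref)   # cgc and other xref types are not mapped

# prints PMID:8805, DOI:10.1139/z78-244, then 'skipped cgc12'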
Example no. 30
    def _process_ortholog_classes(self, limit=None):
        """
        This method adds the KEGG orthology classes to the graph.

        Triples created:
        <orthology_class_id> is a class
        <orthology_class_id> has label <orthology_symbols>
        <orthology_class_id> has description <orthology_description>
        :param limit:
        :return:
        """

        logger.info("Processing ortholog classes")
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        line_counter = 0
        gu = GraphUtils(curie_map.get())
        raw = '/'.join((self.rawdir, self.files['ortholog_classes']['file']))
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (orthology_class_id, orthology_class_name) = row

                if self.testMode and orthology_class_id not in self.test_ids['ortholog_classes']:
                    continue

                # FIXME: What's the proper route for this?
                # The orthology class is essentially a KEGG gene ID that is species agnostic.
                # Add the ID and label as a class. Would it be considered a gene as well?

                other_labels = re.split(';', orthology_class_name)
                orthology_label = other_labels[0]  # the first one is the label we'll use

                orthology_class_id = 'KEGG-'+orthology_class_id.strip()

                orthology_type = OrthologyAssoc.terms['gene_family']
                gu.addClassToGraph(g, orthology_class_id, orthology_label, orthology_type)
                if len(other_labels) > 1:
                    # add the rest as synonyms (the first one is already the label)
                    for s in other_labels[1:]:
                        gu.addSynonym(g, orthology_class_id, s)

                    # add the last one as the description
                    gu.addDescription(g, orthology_class_id, other_labels[-1])

                if (not self.testMode) and (limit is not None and line_counter > limit):
                    break

        logger.info("Done with ortholog classes")
        return
Example no. 31
class RDFGraph(DipperGraph, ConjunctiveGraph):
    """
    Extends RDFLib's ConjunctiveGraph.
    The goal of this class is to wrap the creation
    of triples and manage creation of URIRefs,
    BNodes, and Literals from an input curie.
    """

    curie_map = curie_map_class.get()
    curie_util = CurieUtil(curie_map)

    # make global translation table available outside the ingest
    with open(
            os.path.join(
                os.path.dirname(__file__),
                '../../translationtable/GLOBAL_TERMS.yaml')) as fhandle:
        globaltt = yaml.safe_load(fhandle)
        globaltcid = {v: k for k, v in globaltt.items()}

    def __init__(self, are_bnodes_skized=True, identifier=None):
        # print("in RDFGraph  with id: ", identifier)
        super().__init__('IOMemory', identifier)
        self.are_bnodes_skized = are_bnodes_skized
        self.prefixes = set()

        # Can be removed when this is resolved
        # https://github.com/RDFLib/rdflib/issues/632
        # 2020 Oct: possibly fixed
        # for pfx in ('OBO',):  # , 'ORPHA'):
        #     self.bind(pfx, Namespace(self.curie_map[pfx]))

    def _make_category_triple(self,
                              subject,
                              category,
                              predicate=blv.terms['category']):
        """
        add a triple to capture subject or object category (in CURIE form) that was
        passed to addTriple()
        """
        try:
            self.add((self._getnode(subject), self._getnode(predicate),
                      self._getnode(category)))
        except Exception:
            LOG.warning(
                "Problem adding triple in _make_category_triple for "
                "subj: %s pred: %s obj(category): %s",
                subject, predicate, category)

    def _is_literal(self, thing):
        """
        make inference on type (literal or CURIE)

        return: logical
        """
        if self.curie_regexp.match(thing) is not None or\
           thing.split(':')[0].lower() in ('http', 'https', 'ftp'):
            object_is_literal = False
        else:
            object_is_literal = True

        return object_is_literal

    def addTriple(self,
                  subject_id,
                  predicate_id,
                  obj,
                  object_is_literal=None,
                  literal_type=None,
                  subject_category=None,
                  object_category=None):

        if object_is_literal is None:
            object_is_literal = self._is_literal(obj)

        # add triples for subject category info
        if subject_category is not None:
            self._make_category_triple(subject_id, subject_category)

        # add triples for obj category info, if obj is not a literal
        if not object_is_literal:
            if object_category is not None:
                self._make_category_triple(obj, object_category)
        else:  # emit warning if object category is given for a literal
            if object_category is not None:
                LOG.warning(
                    "I was given a category %s for obj: %s, " +
                    "which seems to be a literal!", object_category, obj)

        if object_is_literal is True:
            if isinstance(obj, str):
                obj = re.sub(r'[\t\n\r\f\v]+', ' ', obj)  # reduce any ws to a space
            if literal_type is not None and obj is not None and obj not in (
                    "", " "):
                literal_type_iri = self._getnode(literal_type)

                self.add(
                    (self._getnode(subject_id), self._getnode(predicate_id),
                     Literal(obj, datatype=literal_type_iri)))
            elif obj is not None:
                # could attempt to infer a type here but there is no use case
                self.add(
                    (self._getnode(subject_id), self._getnode(predicate_id),
                     Literal(obj)))
            else:
                LOG.warning("None as literal object for subj: %s and pred: %s",
                            subject_id, predicate_id)
                # get a sense of where the None is coming from
                # magic number here is "steps up the call stack"
                # TODO there may be easier/more idiomatic ways to do this now
                for call in range(2, 0, -1):
                    LOG.warning('\t%sfrom: %s', '\t' * call,
                                sys._getframe(call).f_code.co_name)

        elif obj is not None and obj != '':  # object is a resource
            self.add((self._getnode(subject_id), self._getnode(predicate_id),
                      self._getnode(obj)))
        else:
            LOG.warning("None/empty object IRI for subj: %s and pred: %s",
                        subject_id, predicate_id)

    def skolemizeBlankNode(self, curie):
        stripped_id = re.sub(r'^_:|^_', '', curie, 1)
        return URIRef(self.curie_map['BNODE'] + stripped_id)

    def _getnode(self, curie):
        """
        This is a wrapper for creating a URIRef or BNode object
        with a given curie or iri as a string.

        If an id starts with an underscore, it assigns it to a BNode; otherwise
        it creates a standard URIRef.
        Alternatively, if self.are_bnodes_skized is True,
        it will skolemize the blank node.

        :param curie: str identifier formatted as curie or iri
        :return: node: RDFLib URIRef or BNode object
        """
        node = None
        if curie[0] == '_':
            if self.are_bnodes_skized:
                node = self.skolemizeBlankNode(curie)
            else:  # delete the leading underscore to make it cleaner
                node = BNode(re.sub(r'^_:|^_', '', curie, 1))

        # Check if curie string is actually an IRI
        elif curie[:4] == 'http' or curie[:3] == 'ftp' or curie[:4] == 'jdbc':
            node = URIRef(curie)
        else:
            iri = RDFGraph.curie_util.get_uri(curie)
            if iri is not None:
                node = URIRef(iri)
                # Bind prefix map to graph
                prefix = curie.split(':')[0]
                self.prefixes.add(prefix)
            else:
                LOG.error("couldn't make URI for %s", curie)
                # get a sense of where the CURIE-ish? thing is coming from
                # magic number here is "steps up the call stack"
                for call in range(3, 0, -1):
                    LOG.warning('\t%sfrom: %s', '\t' * call,
                                sys._getframe(call).f_code.co_name)
        return node

    def bind_all_namespaces(self):
        """
            Results in the RDF @prefix directives for every ingest
            being added to this ingest.
        """
        for prefix in self.curie_map.keys():
            iri = self.curie_map[prefix]
            self.bind(prefix, Namespace(iri))

    # serialize() conflicts between rdflib & Graph.serialize abstractmethod
    # GraphUtils expects the former.  (too bad there is no multiple dispatch)
    # rdflib version
    def serialize(self,
                  destination=None,
                  format='turtle',
                  base=None,
                  encoding=None):
        for prefix in self.prefixes:
            mapped_iri = self.curie_map[prefix]
            self.bind(prefix, Namespace(mapped_iri))
        return ConjunctiveGraph.serialize(self, destination, format)
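Hypothetical usage of the RDFGraph class above; the import path and the presence of the 'rdfs', 'OMIM', and 'DOID' prefixes in the curie map are assumptions.

from dipper.graph.RDFGraph import RDFGraph  # assumed module path

g = RDFGraph()
# literal object: addTriple can infer this, or it can be stated explicitly
g.addTriple('OMIM:154700', 'rdfs:label', 'Marfan syndrome', object_is_literal=True)
# resource object
g.addTriple('OMIM:154700', 'rdfs:subClassOf', 'DOID:4', object_is_literal=False)
print(g.serialize(format='turtle'))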
Example no. 32
    def _add_variant_trait_association(self,
                                       variant_id,
                                       mapped_trait_uri,
                                       efo_ontology,
                                       pubmed_id,
                                       description=None):
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)
        # make associations to the EFO terms; there can be >1
        if mapped_trait_uri.strip() != '':
            for trait in re.split(r',', mapped_trait_uri):
                trait = trait.strip()

                cu = CurieUtil(curie_map.get())
                trait_id = cu.get_curie(trait)

                dis_query = """
                    SELECT ?trait
                    WHERE {{
                        {0} rdfs:subClassOf+ EFO:0000408 .
                        {0} rdfs:label ?trait .
                    }}
                """.format(trait_id)

                query_result = efo_ontology.query(dis_query)
                if len(list(query_result)) > 0:
                    if re.match(r'^EFO', trait_id):
                        model.addClassToGraph(trait_id,
                                              list(query_result)[0][0],
                                              'DOID:4')

                phenotype_query = """
                    SELECT ?trait
                    WHERE {{
                        {0} rdfs:subClassOf+ EFO:0000651 .
                        {0} rdfs:label ?trait .
                    }}
                """.format(trait_id)

                query_result = efo_ontology.query(phenotype_query)
                if len(list(query_result)) > 0:
                    if re.match(r'^EFO', trait_id):
                        model.addClassToGraph(trait_id,
                                              list(query_result)[0][0],
                                              'UPHENO:0001001')

                pubmed_curie = 'PMID:' + pubmed_id

                ref = Reference(g, pubmed_curie,
                                Reference.ref_types['journal_article'])
                ref.addRefToGraph()

                assoc = G2PAssoc(g, self.name, variant_id, trait_id,
                                 model.object_properties['contributes_to'])
                assoc.add_source(pubmed_curie)
                # combinatorial evidence
                # used in automatic assertion
                eco_id = 'ECO:0000213'
                assoc.add_evidence(eco_id)

                if description is not None:
                    assoc.set_description(description)

                # FIXME score should get added to provenance/study
                # assoc.set_score(pvalue)
                assoc.add_association_to_graph()
Example no. 33
class StreamedGraph(DipperGraph):
    """
    Stream rdf triples to file or stdout
    Assumes a downstream process will sort then uniquify triples

    Theoretically this could support both the ntriples and rdfxml formats;
    for now only nt is supported.
    """

    curie_map = curimap.get()
    curie_util = CurieUtil(curie_map)

    with open(
            os.path.join(
                os.path.dirname(__file__),
                '../../translationtable/GLOBAL_TERMS.yaml')) as fhandle:
        globaltt = yaml.safe_load(fhandle).copy()
        globaltcid = {v: k for k, v in globaltt.items()}

    def __init__(self,
                 are_bnodes_skized=True,
                 identifier=None,
                 file_handle=None,
                 fmt='nt'):
        self.are_bnodes_skized = are_bnodes_skized
        self.fmt = fmt
        self.file_handle = file_handle
        self.identifier = identifier

    def addTriple(self,
                  subject_id,
                  predicate_id,
                  obj,
                  object_is_literal=None,
                  literal_type=None):
        # try making an inference on the type of the object if none is supplied
        if object_is_literal is None:
            if self.curie_regexp.match(obj) or \
                    obj.split(':')[0].lower() in ('http', 'https', 'ftp'):
                object_is_literal = False
            else:
                object_is_literal = True

        subject_iri = self._getnode(subject_id)
        predicate_iri = self._getnode(predicate_id)
        if not object_is_literal:
            obj = self._getnode(obj)

        if literal_type is not None:
            literal_type = self._getnode(literal_type)

        if obj is not None:
            self.serialize(subject_iri, predicate_iri, obj, object_is_literal,
                           literal_type)
        else:
            LOG.warning("Null value passed as object")
        return

    def skolemizeBlankNode(self, curie):
        base_iri = curimap.get_base()
        curie_id = curie.split(':')[1]
        skolem_iri = "{0}.wellknown/genid/{1}".format(base_iri, curie_id)
        return skolem_iri

    def serialize(self,
                  subject_iri,
                  predicate_iri,
                  obj,
                  object_is_literal=False,
                  literal_type=None):
        if not object_is_literal:
            triple = "<{}> <{}> <{}> .".format(subject_iri, predicate_iri, obj)
        elif literal_type is not None:
            triple = '<{}> <{}> {}^^<{}> .'.format(
                subject_iri, predicate_iri, self._quote_encode(str(obj)),
                literal_type)
        else:
            if isinstance(obj, str):
                triple = '<{}> <{}> {} .'.format(subject_iri, predicate_iri,
                                                 self._quote_encode(obj))
            else:
                lit_type = self._getLiteralXSDType(obj)
                if lit_type is not None:
                    triple = '<{}> <{}> "{}"^^<{}> .'.format(
                        subject_iri, predicate_iri, obj, lit_type)
                else:
                    raise TypeError("Cannot determine type of {}".format(obj))

        if self.file_handle is None:
            print(triple)
        else:
            self.file_handle.write("{}\n".format(triple))

    def _getnode(self, curie):
        """
        Returns an IRI, or a blank node curie/iri, depending on
        the self.are_bnodes_skized setting

        :param curie: str id as curie or iri
        :return:
        """
        if re.match(r'^_:', curie):
            if self.are_bnodes_skized is True:
                node = self.skolemizeBlankNode(curie)
            else:
                node = curie
        elif re.match(r'^http|^ftp', curie):
            node = curie
        elif len(curie.split(':')) == 2:
            node = StreamedGraph.curie_util.get_uri(curie)
        else:
            raise TypeError("Cannot process curie {}".format(curie))
        return node

    def _getLiteralXSDType(self, literal):
        """
        This could be much more nuanced, but for now
        if a literal is not a str, determine if it's
        a xsd int or double
        :param literal:
        :return: str - xsd full iri
        """
        if isinstance(literal, int):
            return self._getnode("xsd:integer")
        if isinstance(literal, float):
            return self._getnode("xsd:double")

    @staticmethod
    def _quote_encode(literal):
        """
        Copy of code in rdflib here:
        https://github.com/RDFLib/rdflib/blob/776b90be/
        rdflib/plugins/serializers/nt.py#L76
        :param literal:
        :return:
        """
        return '"%s"' % literal.replace('\\', '\\\\')\
            .replace('\n', '\\n')\
            .replace('"', '\\"')\
            .replace('\r', '\\r')
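Hypothetical usage of StreamedGraph: each addTriple() call is written out immediately as an N-Triples line rather than held in memory. The import path and the availability of the 'OMIM', 'DOID', and 'rdfs' prefixes are assumptions.

import sys
from dipper.graph.StreamedGraph import StreamedGraph  # assumed module path

sg = StreamedGraph(file_handle=sys.stdout)
sg.addTriple('OMIM:154700', 'rdfs:label', 'Marfan syndrome',
             object_is_literal=True)
sg.addTriple('OMIM:154700', 'rdfs:subClassOf', 'DOID:4',
             object_is_literal=False)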
Example no. 34
    def setUp(self):
        self.test_util = TestUtils()
        self.assoc_curie = 'MONARCH:test_association'
        self.eco_id = 'ECO:0000015'

        # Headers:
        # 01 marker_accession_id,
        # 02 marker_symbol,
        # 03 phenotyping_center,
        # 04 colony_raw,
        # 05 sex,
        # 06 zygosity,
        # 07 allele_accession_id,
        # 08 allele_symbol,
        # 09 allele_name,
        # 10 strain_accession_id,
        # 11 strain_name,
        # 12 project_name,
        # 13 project_fullname,
        # 14 pipeline_name,
        # 15 pipeline_stable_id,
        # 16 procedure_stable_id,
        # 17 procedure_name,
        # 18 parameter_stable_id,
        # 19 parameter_name,
        # 20 top_level_mp_term_id,
        # 21 top_level_mp_term_name,
        # 22 mp_term_id,
        # 23 mp_term_name,
        # 24 p_value,
        # 25 percentage_change,
        # 26 effect_size,
        # 27 statistical_method,
        # 28 resource_name

        self.test_set_1 = (
            'MGI:1920145',  # 01
            'Setd5',  # 02
            'WTSI',  # 03
            'MEFW',  # 04
            'male',  # 05
            'heterozygote',  # 06
            'MGI:4432631',  # 07
            'Setd5<tm1a(EUCOMM)Wtsi>',  # 08
            'targeted mutation 1a, Wellcome Trust Sanger Institute',  # 09
            'MGI:2159965',  # 10
            'C57BL/6N',  # 11
            'MGP',  # 12
            'Wellcome Trust Sanger Institute Mouse Genetics Project',  # 13
            'MGP Select Pipeline',  # 14
            'MGP_001',  # 15
            'MGP_XRY_001',  # 16
            'X-ray',  # 17
            'IMPC_XRY_008_001',  # 18
            'Number of ribs right',  # 19
            'MP:0005390',  # 20
            'skeleton phenotype',  # 21
            'MP:0000480',  # 22
            'increased rib number',  # 23
            '1.637023E-010',  # 24
            '',  # 25
            '8.885439E-007',  # 26
            'Wilcoxon rank sum test with continuity correction',  # 27
            'IMPC'  # 28
        )

        # Generate test curies, these are otherwise generated
        # within _add_evidence() and _add_study_provenance()
        # these blank nodes are hardcoded as NOT Skolemized  ...
        self.study_curie = "_:study"
        self.evidence_curie = "_:evidence"

        # IRIs for testing sparql output
        curie_dict = curie_map.get()
        curie_util = CurieUtil(curie_dict)
        self.assoc_iri = URIRef(curie_util.get_uri(self.assoc_curie))

        return
Example no. 35
    def setUp(self):
        self.graph = RDFGraph()
        self.curie_map = curie_map.get()
        self.genotype = Genotype(self.graph)
Example no. 36
    def setUp(self):
        self.graph = RDFGraph()
        self.curie_map = curie_map.get()