Python GraphUtils.addDescription примеры использования

Язык программирования: Python

Пространство имен/Пакет: dipper.utils.GraphUtils

Класс/Тип: GraphUtils

Метод/Функция: addDescription

Примеров на hotexamples.com: 12

Python GraphUtils.addDescription - 12 примеров найдено. Это лучшие примеры Python кода для dipper.utils.GraphUtils.GraphUtils.addDescription, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

loadProperties(22)

addSynonym(19)

loadAllProperties(19)

addEquivalentClass(14)

addDescription(12)

GraphUtils(10)

getNode(8)

digest_id(8)

addXref(8)

addDefinition(5)

addType(5)

addTriple(4)

makeLeader(4)

addDeprecatedClass(4)

addMemberOf(3)

addPage(3)

addSubclass(3)

addDeprecatedIndividual(3)

add_property_axioms(3)

addComment(3)

get_properties_from_graph(3)

write(2)

addOWLPropertyClassRestriction(2)

addClassToGraph(2)

addSameIndividual(2)

addMember(2)

addTitle(2)

addIndividualToGraph(2)

addDepiction(2)

addPerson(1)

loadObjectProperties(1)

addOntologyDeclaration(1)

addOWLVersionInfo(1)

addOWLVersionIRI(1)

write_raw_triples(1)

Пример #1

Показать файл

Файл: WormBase.py Проект: JervenBolleman/dipper

    def process_gene_desc(self, limit):
        """
        Add gene definitions and descriptions from the gzipped,
        tab-delimited gene description file.

        For each data row the concise description (unless it is
        'none available') is added as the definition of the gene; each of
        the provisional, automated, detailed and gene-class descriptions is
        added as a description, suffixed with its kind in square brackets,
        unless it is empty, starts with 'none', or merely repeats the
        concise description.

        :param limit: when not None (and not in test mode), stop once more
            than this many non-comment rows have been read
        :return: None
        """
        raw = '/'.join((self.rawdir, self.files['gene_desc']['file']))
        g = self.testgraph if self.testMode else self.graph
        gu = GraphUtils(curie_map.get())

        logger.info("Processing Gene descriptions")
        line_counter = 0
        with gzip.open(raw, 'rb') as csvfile:
            text_stream = io.TextIOWrapper(csvfile, newline="")
            filereader = csv.reader(
                text_stream, delimiter='\t', quotechar='\"')
            for row in filereader:
                # comment rows do not count towards the line counter
                if re.match(r'\#', ''.join(row)):
                    continue
                line_counter += 1
                # the first non-comment row is skipped
                if line_counter == 1:
                    continue

                (gene_num, public_name, molecular_name, concise_description,
                 provisional_description, detailed_description,
                 automated_description, gene_class_description) = row

                if self.testMode and gene_num not in self.test_ids['gene']:
                    continue

                gene_id = 'WormBase:'+gene_num

                if concise_description != 'none available':
                    gu.addDefinition(g, gene_id, concise_description)

                # (kind, text) pairs; the kind becomes the bracketed suffix
                labelled_descriptions = (
                    ('provisional', provisional_description),
                    ('automated', automated_description),
                    ('detailed', detailed_description),
                    ('gene class', gene_class_description),
                )
                for kind, text in labelled_descriptions:
                    # skip duplicates of the concise text and placeholders
                    unusable = (
                        text == concise_description
                        or re.match(r'none', text) or text == '')
                    if not unusable:
                        gu.addDescription(
                            g, gene_id, ' '.join((text, '['+kind+']')))

                if not (self.testMode or limit is None) \
                        and line_counter > limit:
                    break

Пример #2

Показать файл

Файл: MPD.py Проект: JervenBolleman/dipper

    def _process_straininfo(self, limit):
        """
        Read the comma-delimited strain-info file and add each strain to
        the graph as an individual typed with the NCBITaxon:10090 taxon.

        For every row this also records the strain label in
        ``self.idlabel_hash``, adds the short name as a synonym when
        present, links the strain to its vendor stock (same-individual for
        vendors 'J' and 'Rbrc', xrefs otherwise), and adds the panel as a
        description.

        :param limit: accepted but not used by this method
        :return: None
        """
        g = self.testgraph if self.testMode else self.graph

        logger.info("Processing measurements ...")
        raw = '/'.join((self.rawdir, self.files['straininfo']['file']))

        tax_id = 'NCBITaxon:10090'

        gu = GraphUtils(curie_map.get())

        with open(raw, 'r') as f:
            reader = csv.reader(f, delimiter=',', quotechar='\"')
            f.readline()  # consume the header row before iterating
            for row in reader:
                # example row:
                # C57BL/6J,J,000664,,7,IN,225,17,,http://jaxmice.jax.org/strain/000664.html
                (strain_name, vendor, stocknum, panel, mpd_strainid,
                 straintype, n_proj, n_snp_datasets, mpdshortname, url) = row

                if self.testMode and \
                        'MPD:'+str(mpd_strainid) not in self.test_ids:
                    continue

                # create the strain as an instance of the taxon
                strain_id = 'MPD-strain:'+str(mpd_strainid)
                gu.addIndividualToGraph(g, strain_id, strain_name, tax_id)

                short_name = mpdshortname.strip()
                if short_name:
                    gu.addSynonym(g, strain_id, short_name)
                self.idlabel_hash[strain_id] = strain_name

                # make it equivalent to the vendor+stock
                if stocknum and vendor == 'J':
                    gu.addSameIndividual(g, strain_id, 'JAX:'+stocknum)
                elif stocknum and vendor == 'Rbrc':
                    # reiken
                    gu.addSameIndividual(
                        g, strain_id,
                        'RBRC:'+re.sub(r'RBRC', '', stocknum))
                elif stocknum:
                    if url:
                        gu.addXref(g, strain_id, url, True)
                    if vendor:
                        vendor_stock = ':'.join((vendor, stocknum))
                        gu.addXref(g, strain_id, vendor_stock, True)

                # add the panel information
                if panel:
                    gu.addDescription(g, strain_id, panel+' [panel]')

                # TODO make the panels as a resource collection

Пример #3

Показать файл

Файл: KEGG.py Проект: d3borah/dipper

    def _process_ortholog_classes(self, limit=None):
        """
        Add the KEGG orthology classes to the graph.

        Each row of the tab-delimited input holds an orthology class id and
        a ';'-separated name field. The first segment of the name becomes
        the class label, every remaining segment is added as a synonym, and
        the last segment is also added as the description.

        Triples created:
        <orthology_class_id> is a class
        <orthology_class_id> has label <orthology_symbols>
        <orthology_class_id> has description <orthology_description>

        :param limit: when not None (and not in test mode), stop once more
            than this many rows have been read
        :return: None
        """

        logger.info("Processing ortholog classes")
        g = self.testgraph if self.testMode else self.graph
        line_counter = 0
        gu = GraphUtils(curie_map.get())
        raw = '/'.join((self.rawdir, self.files['ortholog_classes']['file']))
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (orthology_class_id, orthology_class_name) = row

                if self.testMode and \
                        orthology_class_id not in \
                        self.test_ids['ortholog_classes']:
                    continue

                # FIXME: What's the proper route for this?
                # The orthology class is essentially a KEGG gene ID
                # that is species agnostic.
                # Add the ID and label as a class.
                # Would it be considered a gene as well?

                other_labels = orthology_class_name.split(';')
                # the first one is the label we'll use
                orthology_label = other_labels[0]

                orthology_class_id = 'KEGG-'+orthology_class_id.strip()

                orthology_type = OrthologyAssoc.terms['gene_family']
                gu.addClassToGraph(
                    g, orthology_class_id, orthology_label, orthology_type)
                if len(other_labels) > 1:
                    # add the rest as synonyms; the first segment is
                    # already the class label, so it is not repeated here
                    for s in other_labels[1:]:
                        gu.addSynonym(g, orthology_class_id, s)

                    # add the last one as the description
                    gu.addDescription(
                        g, orthology_class_id, other_labels[-1])

                if (not self.testMode) and \
                        (limit is not None and line_counter > limit):
                    break

        logger.info("Done with ortholog classes")
        return

Пример #4

Показать файл

Файл: WormBase.py Проект: JervenBolleman/dipper

    def process_gene_interaction(self, limit):
        """
        The gene interaction file includes identified interactions,
        that are between two or more gene (products).
        In the case of interactions with >2 genes, this requires creating
        groups of genes that are involved in the interaction.
        From the wormbase help list: In the example WBInteraction000007779
        it would likely be misleading to suggest that lin-12 interacts with
        (suppresses in this case) smo-1 ALONE or that lin-12 suppresses let-60
        ALONE; the observation in the paper; see Table V in paper PMID:15990876
        was that a lin-12 allele (heterozygous lin-12(n941/+)) could suppress
        the "multivulva" phenotype induced synthetically by simultaneous
        perturbation of BOTH smo-1 (by RNAi) AND let-60 (by the n2021 allele).
        So this is necessarily a three-gene interaction.

        Therefore, we can create groups of genes based on their "status" of
        Effector | Effected.

        Currently only rows with exactly two participants are turned into
        associations; all others are logged and skipped.

        Status:  IN PROGRESS

        :param limit: when not None (and not in test mode), stop once more
            than this many rows (comment rows included) have been read
        :return: None

        """

        raw = '/'.join((self.rawdir, self.files['gene_interaction']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        gu = GraphUtils(curie_map.get())

        logger.info("Processing gene interaction associations")
        line_counter = 0

        with gzip.open(raw, 'rb') as csvfile:
            filereader = csv.reader(
                io.TextIOWrapper(csvfile, newline=""), delimiter='\t',
                quotechar="'")

            for row in filereader:
                line_counter += 1
                if re.match(r'#', ''.join(row)):
                    continue

                # the first five columns describe the interaction itself
                (interaction_num, interaction_type, interaction_subtype,
                 summary, citation) = row[0:5]
                interaction_id = 'WormBase:'+interaction_num

                # TODO deal with subtypes
                interaction_type_id = None
                if interaction_type == 'Genetic':
                    interaction_type_id = \
                        InteractionAssoc.interaction_object_properties[
                            'genetically_interacts_with']
                elif interaction_type == 'Physical':
                    interaction_type_id = \
                        InteractionAssoc.interaction_object_properties[
                            'molecularly_interacts_with']
                elif interaction_type == 'Regulatory':
                    interaction_type_id = \
                        InteractionAssoc.interaction_object_properties[
                            'regulates']
                else:
                    # the association is still built, with a None type
                    logger.info(
                        "An interaction type I don't understand %s",
                        interaction_type)

                # the remaining columns are counted in groups of three,
                # one group per interactor (layout inferred from this
                # arithmetic and the row[5]/row[8] lookups below)
                num_interactors = (len(row) - 5) / 3
                if num_interactors != 2:
                    logger.info(
                        "Skipping interactions with !=2 participants:\n %s",
                        str(row))
                    continue

                gene_a_id = 'WormBase:'+row[5]
                gene_b_id = 'WormBase:'+row[8]

                # in test mode, skip only when neither gene is a test gene
                if self.testMode \
                        and gene_a_id not in self.test_ids['gene'] \
                        and gene_b_id not in self.test_ids['gene']:
                    continue

                assoc = InteractionAssoc(
                    self.name, gene_a_id, gene_b_id, interaction_type_id)
                assoc.set_association_id(interaction_id)
                assoc.add_association_to_graph(g)
                assoc_id = assoc.get_association_id()
                # citation is not a pmid or WBref - get this some other way
                gu.addDescription(g, assoc_id, summary)

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        return

Пример #5

Показать файл

Файл: WormBase.py Проект: JervenBolleman/dipper

    def process_feature_loc(self, limit):
        """
        Read the gzipped, tab-delimited feature location file and add the
        selected features, with their chromosomal start/end locations,
        synonyms and descriptions, to the graph.

        Each data row has nine columns: chromosome, source db, feature type,
        start, end, score, strand, phase and a ';'-separated list of
        key=value attributes. Only rows whose feature type is in the
        accepted list are processed. Rows without a usable identifier,
        gmap/landmark rows, and rows carrying a ``polymorphism`` attribute
        are skipped.

        :param limit: when not None (and not in test mode), stop once more
            than this many accepted rows have been read
        :return: None
        """

        raw = '/'.join((self.rawdir, self.files['feature_loc']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        gu = GraphUtils(curie_map.get())

        logger.info("Processing Feature location and attributes")
        line_counter = 0
        geno = Genotype(g)
        # NOTE(review): this map is filled in below but is neither returned
        # nor stored on self, so the "later processing" never sees it.
        strain_to_variant_map = {}
        build_num = self.version_num
        build_id = 'WormBase:'+build_num

        # note biological_regions include balancers
        # other options here: promoter, regulatory_region, reagent
        accepted_feature_types = (
            'gene', 'point_mutation', 'deletion', 'RNAi_reagent',
            'duplication', 'enhancer', 'binding_site',
            'biological_region', 'complex_substitution',
            'substitution', 'insertion', 'inverted_repeat')

        with gzip.open(raw, 'rb') as csvfile:
            filereader = csv.reader(
                io.TextIOWrapper(csvfile, newline=""), delimiter='\t',
                quotechar='\"')
            for row in filereader:
                if re.match(r'\#', ''.join(row)):
                    continue
                (chrom, db, feature_type_label, start, end, score, strand,
                 phase, attributes) = row

                # example rows:
                # I	interpolated_pmap_position	gene	1	559768	.	.	.	ID=gmap:spe-13;gmap=spe-13;status=uncloned;Note=-21.3602 cM (+/- 1.84 cM)
                # I	WormBase	gene	3747	3909	.	-	.	ID=Gene:WBGene00023193;Name=WBGene00023193;interpolated_map_position=-21.9064;sequence_name=Y74C9A.6;biotype=snoRNA;Alias=Y74C9A.6
                # I	absolute_pmap_position	gene	4119	10230	.	.	.	ID=gmap:homt-1;gmap=homt-1;status=cloned;Note=-21.8252 cM (+/- 0.00 cM)

                # dbs = re.split(
                #   r' ', 'assembly_component expressed_sequence_match Coding_transcript Genomic_canonical Non_coding_transcript Orfeome Promoterome Pseudogene RNAi_primary RNAi_secondary Reference Transposon Transposon_CDS cDNA_for_RNAi miRanda ncRNA operon polyA_signal_sequence polyA_site snlRNA')
                #
                # if db not in dbs:
                #     continue

                if feature_type_label not in accepted_feature_types:
                    continue
                line_counter += 1

                attribute_dict = {}
                if attributes != '':
                    # split each item on the first '=' only, so values that
                    # themselves contain '=' survive; items without '='
                    # (e.g. the empty item after a trailing ';') are ignored
                    attribute_dict = dict(
                        item.split("=", 1) for item in
                        re.sub(r'"', '', attributes).split(";")
                        if "=" in item)

                fid = flabel = desc = None
                if 'ID' in attribute_dict:
                    fid = attribute_dict.get('ID')
                    if re.search(r'WB(Gene|Var|sf)', fid):
                        fid = re.sub(r'^\w+:WB', 'WormBase:WB', fid)
                    elif re.match(r'(gmap|landmark)', fid):
                        continue
                    else:
                        logger.info('other identifier %s', fid)
                        fid = None
                elif 'variation' in attribute_dict:
                    fid = 'WormBase:'+attribute_dict.get('variation')
                    flabel = attribute_dict.get('public_name')
                    sub = attribute_dict.get('substitution')
                    ins = attribute_dict.get('insertion')
                    # if it's a variation:
                    # variation=WBVar00604246;public_name=gk320600;strain=VC20384;substitution=C/T
                    desc = ''
                    if sub is not None:
                        desc = 'substitution='+sub
                    if ins is not None:
                        desc = 'insertion='+ins

                    # keep track of the strains with this variation,
                    # for later processing
                    strain_list = attribute_dict.get('strain')
                    if strain_list is not None:
                        for s in re.split(r',', strain_list):
                            strain_to_variant_map.setdefault(
                                s.strip(), set()).add(fid)

                # if feature_type_label == 'RNAi_reagent':
                    # Target=WBRNAi00096030 1 4942
                    # this will tell us where the RNAi is actually binding
                    # target = attribute_dict.get('Target') # TODO unused
                    # rnai_num = re.split(r' ', target)[0]  # TODO unused
                    # it will be the reagent-targeted-gene that has a position,
                    # (i think)
                    # TODO finish the RNAi binding location

                name = attribute_dict.get('Name')
                polymorphism = attribute_dict.get('polymorphism')

                if fid is None:
                    # fall back to the Name attribute for WBsf features
                    if name is not None and re.match(r'WBsf', name):
                        fid = 'WormBase:'+name
                        name = None
                    else:
                        continue

                if self.testMode \
                        and re.sub(r'WormBase:', '', fid) \
                        not in self.test_ids['gene']+self.test_ids['allele']:
                    continue

                # these really aren't that interesting
                if polymorphism is not None:
                    continue

                # plain substring test: the name is data, not a regex
                if name is not None and name not in fid:
                    if flabel is None:
                        flabel = name
                    else:
                        gu.addSynonym(g, fid, name)

                if desc is not None:
                    gu.addDescription(g, fid, desc)

                alias = attribute_dict.get('Alias')

                biotype = attribute_dict.get('biotype')
                note = attribute_dict.get('Note')
                other_name = attribute_dict.get('other_name')
                # add whichever of the two alternative names are present
                for n in (alias, other_name):
                    if n is not None:
                        gu.addSynonym(g, fid, n)

                ftype = self.get_feature_type_by_class_and_biotype(
                    feature_type_label, biotype)

                chr_id = makeChromID(chrom, build_id, 'CHR')
                geno.addChromosomeInstance(chrom, build_id, build_num)

                f = Feature(fid, flabel, ftype)
                f.addFeatureStartLocation(start, chr_id, strand)
                f.addFeatureEndLocation(end, chr_id, strand)

                # only genes are added with the is-class flag set
                feature_is_class = feature_type_label == 'gene'

                f.addFeatureToGraph(g, True, None, feature_is_class)

                if note is not None:
                    gu.addDescription(g, fid, note)

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

                # RNAi reagents:
                # I	RNAi_primary	RNAi_reagent	4184	10232	.	+	.	Target=WBRNAi00001601 1 6049 +;laboratory=YK;history_name=SA:yk326e10
                # I	RNAi_primary	RNAi_reagent	4223	10147	.	+	.	Target=WBRNAi00033465 1 5925 +;laboratory=SV;history_name=MV_SV:mv_G_YK5052
                # I	RNAi_primary	RNAi_reagent	5693	9391	.	+	.	Target=WBRNAi00066135 1 3699 +;laboratory=CH

                # TODO TF binding sites and network:
                # I	TF_binding_site_region	TF_binding_site	1861	2048	.	+	.	Name=WBsf292777;tf_id=WBTranscriptionFactor000025;tf_name=DAF-16
                # I	TF_binding_site_region	TF_binding_site	3403	4072	.	+	.	Name=WBsf331847;tf_id=WBTranscriptionFactor000703;tf_name=DPL-1

        return

Пример #6

Показать файл

Файл: Coriell.py Проект: JervenBolleman/dipper

    def _process_data(self, raw, limit=None):
        """
        This function will process the data files from Coriell.
        We make the assumption that any alleles listed are variants
        (alternates to w.t.)

        Triples: (examples)

        :NIGMSrepository a CLO_0000008 #repository
        label : NIGMS Human Genetic Cell Repository
        foaf:page https://catalog.coriell.org/0/sections/collections/NIGMS/?SsId=8

            line_id a CL_0000057,  #fibroblast line
                derives_from patient_id
                part_of :NIGMSrepository
                RO:model_of OMIM:disease_id

            patient id a foaf:person,
                label: "fibroblast from patient 12345 with disease X"
                member_of family_id  #what is the right thing here?
                SIO:race EFO:caucasian  #subclass of EFO:0001799
                in_taxon NCBITaxon:9606
                dc:description Literal(remark)
                RO:has_phenotype OMIM:disease_id
                GENO:has_genotype genotype_id

            family_id a owl:NamedIndividual
                foaf:page "https://catalog.coriell.org/0/Sections/BrowseCatalog/FamilyTypeSubDetail.aspx?PgId=402&fam=2104&coll=GM"

            genotype_id a intrinsic_genotype
                GENO:has_alternate_part allelic_variant_id
                we don't necessarily know much about the genotype,
                other than the allelic variant. also there's the sex here

            pub_id mentions cell_line_id

        :param raw:
        :param limit:
        :return:
        """
        logger.info("Processing Data from %s", raw)
        gu = GraphUtils(curie_map.get())

        if self.testMode:      # set the graph to build
            g = self.testgraph
        else:
            g = self.graph

        line_counter = 0
        geno = Genotype(g)
        du = DipperUtil()

        gu.loadProperties(g, geno.object_properties, gu.OBJPROP)
        gu.loadAllProperties(g)

        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            next(filereader, None)  # skip the header row
            for row in filereader:
                if not row:
                    pass
                else:
                    line_counter += 1

                    (catalog_id, description, omim_number, sample_type,
                     cell_line_available, dna_in_stock, dna_ref, gender, age,
                     race, ethnicity, affected, karyotype, relprob, mutation,
                     gene, family_id, collection, url, cat_remark, pubmed_ids,
                     family_member, variant_id, dbsnp_id, species) = row

                    # example:
                    # GM00003,HURLER SYNDROME,607014,Fibroblast,Yes,No,,Female,26 YR,Caucasian,,,,
                    # parent,,,39,NIGMS Human Genetic Cell Repository,
                    # http://ccr.coriell.org/Sections/Search/Sample_Detail.aspx?Ref=GM00003,
                    # 46;XX; clinically normal mother of a child with Hurler syndrome; proband not in Repository,,
                    # 2,,18343,H**o sapiens

                    if self.testMode and catalog_id not in self.test_lines:
                        # skip rows not in our test lines, when in test mode
                        continue

                    # ###########    BUILD REQUIRED VARIABLES    ###########

                    # Make the cell line ID
                    cell_line_id = 'Coriell:'+catalog_id.strip()

                    # Map the cell/sample type
                    cell_type = self._map_cell_type(sample_type)

                    # Make a cell line label
                    line_label = \
                        collection.partition(' ')[0]+'-'+catalog_id.strip()

                    # Map the repository/collection
                    repository = self._map_collection(collection)

                    # patients are uniquely identified by one of:
                    # dbsnp id (which is == an individual haplotype)
                    # family id + family member (if present) OR
                    # probands are usually family member zero
                    # cell line id
                    # since some patients have >1 cell line derived from them,
                    # we must make sure that the genotype is attached to
                    # the patient, and can be inferred to the cell line
                    # examples of repeated patients are:
                    #   famid=1159, member=1; fam=152,member=1

                    # Make the patient ID

                    # make an anonymous patient
                    patient_id = '_person'
                    if self.nobnodes:
                        patient_id = ':'+patient_id
                    if family_id != '':
                        patient_id = \
                            '-'.join((patient_id, family_id, family_member))
                    else:
                        # make an anonymous patient
                        patient_id = '-'.join((patient_id, catalog_id.strip()))

                    # properties of the individual patients:  sex, family id,
                    # member/relproband, description descriptions are
                    # really long and ugly SCREAMING text, so need to clean up
                    # the control cases are so odd with this labeling scheme;
                    # but we'll deal with it as-is for now.
                    short_desc = (description.split(';')[0]).capitalize()
                    if affected == 'Yes':
                        affected = 'affected'
                    elif affected == 'No':
                        affected = 'unaffected'
                    gender = gender.lower()
                    patient_label = ' '.join((affected, gender, relprob))
                    if relprob == 'proband':
                        patient_label = \
                            ' '.join(
                                (patient_label.strip(), 'with', short_desc))
                    else:
                        patient_label = \
                            ' '.join(
                                (patient_label.strip(), 'of proband with',
                                 short_desc))

                    # #############    BUILD THE CELL LINE    #############

                    # Adding the cell line as a typed individual.
                    cell_line_reagent_id = 'CLO:0000031'

                    gu.addIndividualToGraph(
                        g, cell_line_id, line_label, cell_line_reagent_id)

                    # add the equivalent id == dna_ref
                    if dna_ref != '' and dna_ref != catalog_id:
                        equiv_cell_line = 'Coriell:'+dna_ref
                        # some of the equivalent ids are not defined
                        # in the source data; so add them
                        gu.addIndividualToGraph(
                            g, equiv_cell_line, None, cell_line_reagent_id)
                        gu.addSameIndividual(g, cell_line_id, equiv_cell_line)

                    # Cell line derives from patient
                    geno.addDerivesFrom(cell_line_id, patient_id)
                    geno.addDerivesFrom(cell_line_id, cell_type)

                    # Cell line a member of repository
                    gu.addMember(g, repository, cell_line_id)

                    if cat_remark != '':
                        gu.addDescription(g, cell_line_id, cat_remark)

                    # Cell age_at_sampling
                    # TODO add the age nodes when modeled properly in #78
                    # if (age != ''):
                        # this would give a BNode that is an instance of Age.
                        # but i don't know how to connect
                        # the age node to the cell line? we need to ask @mbrush
                        # age_id = '_'+re.sub('\s+','_',age)
                        # gu.addIndividualToGraph(
                        #   g,age_id,age,self.terms['age'])
                        # gu.addTriple(
                        #   g,age_id,self.properties['has_measurement'],age,
                        #   True)

                    # #############    BUILD THE PATIENT    #############

                    # Add the patient ID as an individual.
                    gu.addPerson(g, patient_id, patient_label)
                    # TODO map relationship to proband as a class
                    # (what ontology?)

                    # Add race of patient
                    # FIXME: Adjust for subcategories based on ethnicity field
                    # EDIT: There are 743 different entries for ethnicity...
                    # Too many to map?
                    # Add ethnicity as literal in addition to the mapped race?
                    # Adjust the ethnicity txt (if using)
                    # to initial capitalization to remove ALLCAPS

                    # TODO race should go into the individual's background
                    # and abstracted out to the Genotype class punting for now.
                    # if race != '':
                    #    mapped_race = self._map_race(race)
                    #    if mapped_race is not None:
                    #        gu.addTriple(
                    #           g,patient_id,self.terms['race'],mapped_race)
                    #        gu.addSubclass(
                    #           g,self.terms['ethnic_group'],mapped_race)

                    # #############    BUILD THE FAMILY    #############

                    # Add triples for family_id, if present.
                    if family_id != '':
                        family_comp_id = 'CoriellFamily:'+family_id

                        family_label = \
                            ' '.join(('Family of proband with', short_desc))

                        # Add the family ID as a named individual
                        gu.addIndividualToGraph(
                            g, family_comp_id, family_label,
                            geno.genoparts['family'])

                        # Add the patient as a member of the family
                        gu.addMemberOf(g, patient_id, family_comp_id)

                    # #############    BUILD THE GENOTYPE   #############

                    # the important things to pay attention to here are:
                    # karyotype = chr rearrangements  (somatic?)
                    # mutation = protein-level mutation as a label,
                    # often from omim
                    # gene = gene symbol - TODO get id
                    # variant_id = omim variant ids (; delimited)
                    # dbsnp_id = snp individual ids = full genotype?

                    # note GM00633 is a good example of chromosomal variation
                    # - do we have enough to capture this?
                    # GM00325 has both abnormal karyotype and variation

                    # make an assumption that if the taxon is blank,
                    # that it is human!
                    if species is None or species == '':
                        species = 'H**o sapiens'
                    taxon = self._map_species(species)

                    # if there's a dbSNP id,
                    # this is actually the individual's genotype
                    genotype_id = None
                    genotype_label = None
                    if dbsnp_id != '':
                        genotype_id = 'dbSNPIndividual:'+dbsnp_id.strip()

                    omim_map = {}
                    gvc_id = None

                    # some of the karyotypes are encoded
                    # with terrible hidden codes. remove them here
                    # i've seen a <98> character
                    karyotype = du.remove_control_characters(karyotype)
                    karyotype_id = None
                    if karyotype.strip() != '':
                        karyotype_id = \
                            '_'+re.sub('MONARCH:', '', self.make_id(karyotype))
                        if self.nobnodes:
                            karyotype_id = ':'+karyotype_id
                        # add karyotype as karyotype_variation_complement
                        gu.addIndividualToGraph(
                            g, karyotype_id, karyotype,
                            geno.genoparts['karyotype_variation_complement'])
                        # TODO break down the karyotype into parts
                        # and map into GENO. depends on #77

                        # place the karyotype in a location(s).
                        karyo_chrs = \
                            self._get_affected_chromosomes_from_karyotype(
                                karyotype)
                        for c in karyo_chrs:
                            chr_id = makeChromID(c, taxon, 'CHR')
                            # add an anonymous sequence feature,
                            # each located on chr
                            karyotype_feature_id = '-'.join((karyotype_id, c))
                            karyotype_feature_label = \
                                'some karyotype alteration on chr'+str(c)
                            f = Feature(
                                karyotype_feature_id, karyotype_feature_label,
                                geno.genoparts['sequence_alteration'])
                            f.addFeatureStartLocation(None, chr_id)
                            f.addFeatureToGraph(g)
                            f.loadAllProperties(g)
                            geno.addParts(
                                karyotype_feature_id, karyotype_id,
                                geno.object_properties['has_alternate_part'])

                    if gene != '':
                        vl = gene+'('+mutation+')'

                    # fix the variant_id so it's always in the same order
                    vids = variant_id.split(';')
                    variant_id = ';'.join(sorted(list(set(vids))))

                    if karyotype.strip() != '' \
                            and not self._is_normal_karyotype(karyotype):
                        mutation = mutation.strip()
                        gvc_id = karyotype_id
                        if variant_id != '':
                            gvc_id = '_' + variant_id.replace(';', '-') + '-' \
                                    + re.sub(r'\w*:', '', karyotype_id)
                        if mutation.strip() != '':
                            gvc_label = '; '.join((vl, karyotype))
                        else:
                            gvc_label = karyotype
                    elif variant_id.strip() != '':
                        gvc_id = '_' + variant_id.replace(';', '-')
                        gvc_label = vl
                    else:
                        # wildtype?
                        pass

                    if gvc_id is not None and gvc_id != karyotype_id \
                            and self.nobnodes:
                        gvc_id = ':'+gvc_id

                    # add the karyotype to the gvc.
                    # use reference if normal karyotype
                    karyo_rel = geno.object_properties['has_alternate_part']
                    if self._is_normal_karyotype(karyotype):
                        karyo_rel = \
                            geno.object_properties['has_reference_part']
                    if karyotype_id is not None \
                            and not self._is_normal_karyotype(karyotype) \
                            and gvc_id is not None and karyotype_id != gvc_id:
                        geno.addParts(karyotype_id, gvc_id, karyo_rel)

                    if variant_id.strip() != '':
                        # split the variants & add them as part of the genotype
                        # we don't necessarily know their zygosity,
                        # just that they are part of the genotype variant ids
                        # are from OMIM, so prefix as such we assume that the
                        # sequence alts will be defined in OMIM not here
                        # TODO sort the variant_id list, if the omim prefix is
                        # the same, then assume it's the locus make a hashmap
                        # of the omim id to variant id list;
                        # then build the genotype hashmap is also useful for
                        # removing the "genes" from the list of "phenotypes"

                        # will hold gene/locus id to variant list
                        omim_map = {}

                        locus_num = None
                        for v in variant_id.split(';'):
                            # handle omim-style and odd var ids
                            # like 610661.p.R401X
                            m = re.match(r'(\d+)\.+(.*)', v.strip())
                            if m is not None and len(m.groups()) == 2:
                                (locus_num, var_num) = m.groups()

                            if locus_num is not None \
                                    and locus_num not in omim_map:
                                omim_map[locus_num] = [var_num]
                            else:
                                omim_map[locus_num] += [var_num]

                        for o in omim_map:
                            # gene_id = 'OMIM:' + o  # TODO unused
                            vslc_id = \
                                '_' + '-'.join(
                                    [o + '.' + a for a in omim_map.get(o)])
                            if self.nobnodes:
                                vslc_id = ':'+vslc_id
                            vslc_label = vl
                            # we don't really know the zygosity of
                            # the alleles at all.
                            # so the vslcs are just a pot of them
                            gu.addIndividualToGraph(
                                g, vslc_id, vslc_label,
                                geno.genoparts[
                                    'variant_single_locus_complement'])
                            for v in omim_map.get(o):
                                # this is actually a sequence alt
                                allele1_id = 'OMIM:'+o+'.'+v
                                geno.addSequenceAlteration(allele1_id, None)

                                # assume that the sa -> var_loc -> gene
                                # is taken care of in OMIM
                                geno.addPartsToVSLC(
                                    vslc_id, allele1_id, None,
                                    geno.zygosity['indeterminate'],
                                    geno.object_properties[
                                        'has_alternate_part'])

                            if vslc_id != gvc_id:
                                geno.addVSLCtoParent(vslc_id, gvc_id)

                    if affected == 'unaffected':
                        # let's just say that this person is wildtype
                        gu.addType(g, patient_id, geno.genoparts['wildtype'])
                    elif genotype_id is None:
                        # make an anonymous genotype id
                        genotype_id = '_geno'+catalog_id.strip()
                        if self.nobnodes:
                            genotype_id = ':'+genotype_id

                    # add the gvc
                    if gvc_id is not None:
                        gu.addIndividualToGraph(
                            g, gvc_id, gvc_label,
                            geno.genoparts['genomic_variation_complement'])

                        # add the gvc to the genotype
                        if genotype_id is not None:
                            if affected == 'unaffected':
                                rel = \
                                    geno.object_properties[
                                        'has_reference_part']
                            else:
                                rel = \
                                    geno.object_properties[
                                        'has_alternate_part']
                            geno.addParts(gvc_id, genotype_id, rel)
                        if karyotype_id is not None \
                                and self._is_normal_karyotype(karyotype):
                            if gvc_label is not None and gvc_label != '':
                                genotype_label = \
                                    '; '.join((gvc_label, karyotype))
                            else:
                                genotype_label = karyotype
                            if genotype_id is None:
                                genotype_id = karyotype_id
                            else:
                                geno.addParts(
                                    karyotype_id, genotype_id,
                                    geno.object_properties[
                                        'has_reference_part'])
                        else:
                            genotype_label = gvc_label
                            # use the catalog id as the background
                        genotype_label += ' ['+catalog_id.strip()+']'

                    if genotype_id is not None and gvc_id is not None:
                        # only add the genotype if it has some parts
                        geno.addGenotype(
                            genotype_id, genotype_label,
                            geno.genoparts['intrinsic_genotype'])
                        geno.addTaxon(taxon, genotype_id)
                        # add that the patient has the genotype
                        # TODO check if the genotype belongs to
                        # the cell line or to the patient
                        gu.addTriple(
                            g, patient_id,
                            geno.properties['has_genotype'], genotype_id)
                    else:
                        geno.addTaxon(taxon, patient_id)

                    # TODO: Add sex/gender  (as part of the karyotype?)

                    # #############    DEAL WITH THE DISEASES   #############

                    # we associate the disease to the patient
                    if affected == 'affected':
                        if omim_number != '':
                            for d in omim_number.split(';'):
                                if d is not None and d != '':
                                    # if the omim number is in omim_map,
                                    # then it is a gene not a pheno
                                    if d not in omim_map:
                                        disease_id = 'OMIM:'+d.strip()
                                        # assume the label is taken care of
                                        gu.addClassToGraph(g, disease_id, None)

                                        # add the association:
                                        #   the patient has the disease
                                        assoc = G2PAssoc(
                                            self.name, patient_id, disease_id)
                                        assoc.add_association_to_graph(g)

                                        # this line is a model of this disease
                                        # TODO abstract out model into
                                        # it's own association class?
                                        gu.addTriple(
                                            g, cell_line_id,
                                            gu.properties['model_of'],
                                            disease_id)
                                    else:
                                        logger.info(
                                            'removing %s from disease list ' +
                                            'since it is a gene', d)

                    # #############    ADD PUBLICATIONS   #############

                    if pubmed_ids != '':
                        for s in pubmed_ids.split(';'):
                            pubmed_id = 'PMID:'+s.strip()
                            ref = Reference(pubmed_id)
                            ref.setType(Reference.ref_types['journal_article'])
                            ref.addRefToGraph(g)
                            gu.addTriple(
                                g, pubmed_id, gu.properties['mentions'],
                                cell_line_id)

                    if not self.testMode \
                            and (limit is not None and line_counter > limit):
                        break

            Assoc(self.name).load_all_properties(g)

        return

Пример #7

Показать файл

Файл: Association.py Проект: JervenBolleman/dipper

class Assoc:
    """
    An abstract class for OBAN (Monarch)-style associations,
    to enable attribution of source and evidence
    on statements.

    An association is a (subject, relationship, object) statement that is
    itself given an identifier, so that description, evidence, source,
    provenance and a score can be attached to the statement.
    """

    # CURIEs for the association node types
    assoc_types = {
        'association': 'OBAN:association'
    }

    # CURIEs loaded into the graph as owl:AnnotationProperty
    annotation_properties = {
        'replaced_by': 'IAO:0100001',
        'consider': 'OIO:consider',
        'hasExactSynonym': 'OIO:hasExactSynonym',
        'hasRelatedSynonym': 'OIO:hasRelatedSynonym',
        'definition': 'IAO:0000115',
        'has_xref': 'OIO:hasDbXref',
    }

    # CURIEs loaded into the graph as owl:ObjectProperty
    object_properties = {
        'has_disposition': 'GENO:0000208',
        'has_phenotype': 'RO:0002200',
        'in_taxon': 'RO:0002162',
        'has_quality': 'RO:0000086',
        'towards': 'RO:0002503',
        'has_subject': 'OBAN:association_has_subject',
        'has_object': 'OBAN:association_has_object',
        'has_predicate': 'OBAN:association_has_object_property',
        # IAO "is about" is IAO:0000136 (seven digits);
        # the previous eight-digit value was a typo
        'is_about': 'IAO:0000136',
        'has_evidence': 'RO:0002558',
        'has_source': 'dc:source',
        'has_provenance': 'OBAN:has_provenance'
    }

    # CURIEs loaded into the graph as owl:DatatypeProperty
    datatype_properties = {
        'position': 'faldo:position',
        'has_measurement': 'IAO:0000004'
    }

    # union of the three property maps above
    properties = annotation_properties.copy()
    properties.update(object_properties)
    properties.update(datatype_properties)

    OWLCLASS = OWL['Class']
    OWLIND = OWL['NamedIndividual']
    OBJECTPROP = OWL['ObjectProperty']
    ANNOTPROP = OWL['AnnotationProperty']
    DATAPROP = OWL['DatatypeProperty']

    SUBCLASS = RDFS['subClassOf']
    BASE = Namespace(curie_map.get()[''])

    def __init__(self, definedby):
        """
        :param definedby: the (data) resource providing the association;
            it is the first component hashed into the association id
        """
        self.cu = CurieUtil(curie_map.get())
        self.gu = GraphUtils(curie_map.get())

        # core parts of the association
        self.definedby = definedby
        self.sub = self.obj = self.rel = None
        self.assoc_id = None

        self.description = None
        self.source = []
        self.evidence = []
        # this is going to be used for the refactored evidence/provenance
        self.provenance = []

        self.score = None
        self.score_type = None
        self.score_unit = None

        return

    def get_properties(self):
        """
        :return: the combined annotation/object/datatype property map
        """
        return self.properties

    def _is_valid(self):
        """
        Check that subject, object and relationship have all been set.

        :return: True when all three parts are set
        :raises ValueError: if the subject, object or relation is None
        """
        if self.sub is None:
            raise ValueError('No subject set for this association')
        if self.obj is None:
            raise ValueError('No object set for this association')
        if self.rel is None:
            raise ValueError('No relation set for this association')

        return True

    def _add_basic_association_to_graph(self, g):
        """
        Add the direct (subject, relationship, object) triple to the graph,
        followed by the association node with its subject/object/predicate
        links and any description, evidence, sources, provenance and score
        that have been set.

        If any of the three parts cannot be resolved to a graph node,
        an error is logged and nothing is added.

        :param g: the graph to add the triples to
        :return: None
        :raises ValueError: (from _is_valid) if a core part is unset
        """
        if not self._is_valid():
            return

        # first, add the direct triple
        # anonymous (blank) nodes are indicated with underscore
        sub_node = self.gu.getNode(self.sub)
        obj_node = self.gu.getNode(self.obj)
        rel_node = self.gu.getNode(self.rel)

        # checked in subject, predicate, object order;
        # only the first unresolved part is reported
        if sub_node is None:
            logger.error(
                "Unable to retrieve graph node for Subject %s ", self.sub)
            return
        if rel_node is None:
            logger.error(
                "Unable to retrieve graph node for Predicate %s ", self.rel)
            return
        if obj_node is None:
            logger.error(
                "Unable to retrieve graph node for Object %s ", self.obj)
            return

        g.add((sub_node, rel_node, obj_node))

        if self.assoc_id is None:
            self.set_association_id()

        assoc_node = self.gu.getNode(self.assoc_id)
        g.add((assoc_node, RDF['type'],
               self.gu.getNode(self.assoc_types['association'])))

        self.gu.addTriple(g, self.assoc_id,
                          self.object_properties['has_subject'], self.sub)
        self.gu.addTriple(g, self.assoc_id,
                          self.object_properties['has_object'], self.obj)
        self.gu.addTriple(g, self.assoc_id,
                          self.object_properties['has_predicate'], self.rel)

        if self.description is not None:
            self.gu.addDescription(g, self.assoc_id, self.description)

        # "or []" keeps the original tolerance of these being set to None
        for evidence_id in self.evidence or []:
            self.gu.addTriple(g, self.assoc_id,
                              self.object_properties['has_evidence'],
                              evidence_id)

        for source_id in self.source or []:
            if source_id.startswith('http'):
                # TODO assume that the source is a publication?
                # use Reference class here
                # NOTE(review): the trailing True is presumably addTriple's
                # "object is a literal" flag -- confirm against GraphUtils
                self.gu.addTriple(g, self.assoc_id,
                                  self.object_properties['has_source'],
                                  source_id, True)
            else:
                self.gu.addTriple(g, self.assoc_id,
                                  self.object_properties['has_source'],
                                  source_id)

        for provenance_id in self.provenance or []:
            self.gu.addTriple(g, self.assoc_id,
                              self.object_properties['has_provenance'],
                              provenance_id)

        if self.score is not None:
            self.gu.addTriple(
                g, self.assoc_id, self.properties['has_measurement'],
                Literal(self.score, datatype=XSD['float']), True)
            # TODO
            # update with some kind of instance of scoring object
            # that has a unit and type

        return

    def add_association_to_graph(self, g):
        """
        Add this association to the graph
        (delegates to _add_basic_association_to_graph).

        :param g: the graph to add the triples to
        :return: None
        """
        self._add_basic_association_to_graph(g)

        return

    def set_subject(self, identifier):
        """
        :param identifier: identifier of the association's subject
        :return: None
        """
        self.sub = identifier
        return

    def set_object(self, identifier):
        """
        :param identifier: identifier of the association's object
        :return: None
        """
        self.obj = identifier
        return

    def set_relationship(self, identifier):
        """
        :param identifier: identifier of the relationship (predicate)
        :return: None
        """
        self.rel = identifier
        return

    def set_association_id(self, assoc_id=None):
        """
        Set the association ID.
        When no assoc_id is given, the ID is generated from the internal
        parts of the association (definedby, subject, relationship, object).
        Supply assoc_id in cases where an external association identifier
        should be used.

        :param assoc_id: an externally supplied identifier, or None
        :return: None
        """
        if assoc_id is None:
            self.assoc_id = self.make_association_id(
                self.definedby, self.sub, self.rel, self.obj)
        else:
            self.assoc_id = assoc_id

        return

    def get_association_id(self):
        """
        :return: the association id (None until it has been set)
        """
        return self.assoc_id

    def set_description(self, description):
        """
        :param description: description to attach to the association node
        :return: None
        """
        self.description = description

        return

    def set_score(self, score, unit=None, score_type=None):
        """
        Record a score for the association.  Only the score itself is
        currently written to the graph; unit and type are just stored.

        :param score:
        :param unit:
        :param score_type:
        :return: None
        """
        self.score = score
        self.score_unit = unit
        self.score_type = score_type

        return

    def add_evidence(self, identifier):
        """
        Add an evidence code to the association object (maintained as a list)
        None and blank identifiers are ignored.

        :param identifier:
        :return: None
        """
        if identifier is not None and identifier.strip() != '':
            self.evidence += [identifier]

        return

    def add_source(self, identifier):
        """
        Add a source identifier (such as publication id)
        to the association object (maintained as a list)
        None and blank identifiers are ignored.
        TODO we need to greatly expand this function!

        :param identifier:
        :return: None
        """
        if identifier is not None and identifier.strip() != '':
            self.source += [identifier]

        return

    def add_provenance(self, identifier):
        """
        Add a provenance identifier to the association object
        (maintained as a list).  None and blank identifiers are ignored.

        :param identifier:
        :return: None
        """
        if identifier is not None and identifier.strip() != '':
            self.provenance += [identifier]

        return

    def load_all_properties(self, g):
        """
        Load the object, annotation and datatype property maps of this
        class into the graph, each under its OWL property type.

        :param g: the graph to load the properties into
        :return: None
        """
        props = {
            self.OBJECTPROP: self.object_properties,
            self.ANNOTPROP: self.annotation_properties,
            self.DATAPROP: self.datatype_properties
        }

        for prop_type, prop_map in props.items():
            self.gu.loadProperties(g, prop_map, prop_type)

        return

    def _get_source_uri(self, pub_id):
        """
        Given some kind of pub_id (which might be a CURIE or url),
        convert it into a proper node.
        An error is logged when the identifier cannot be resolved.

        :param pub_id:
        :return: source: Well-formed URI for the given identifier (or url),
            or None if it could not be resolved
        """
        if pub_id.startswith('http'):
            return URIRef(pub_id)

        node = self.gu.getNode(pub_id)
        if node is None:
            logger.error(
                "An id we don't know how to deal with: %s", pub_id)
            return None

        return URIRef(node)

    @staticmethod
    def make_association_id(definedby, subject, predicate, object,
                            attributes=None):
        """
        A method to create unique identifiers for OBAN-style associations,
        based on all the parts of the association
        Any item that is None is converted to a blank string.
        It effectively md5 hashes the (+)-joined string from the values.
        Subclasses of Assoc can submit an additional array of attributes
        that will be added to the ID.

        :param definedby: The (data) resource that provided the annotation
        :param subject:
        :param predicate:
        :param object:
        :param attributes: optional extra values appended before hashing
        :return: 'MONARCH:' followed by the md5 hex digest
        """

        # note others available:
        #   md5(), sha1(), sha224(), sha256(), sha384(), and sha512()
        # TEC: at our scale, md5 is in danger of having collisions.
        # putting definedby first,
        # as this will usually be the datasource providing the annotation
        # this will end up making the first few parts of the id
        # be the same for all annotations in that resource
        items_to_hash = [definedby, subject, predicate, object]
        if attributes is not None:
            items_to_hash.extend(attributes)

        byte_string = '+'.join(
            '' if item is None else item
            for item in items_to_hash).encode("utf-8")

        # TODO put this in a util?
        return ':'.join(('MONARCH', hashlib.md5(byte_string).hexdigest()))

Пример #8

Показать файл

Файл: KEGG.py Проект: JervenBolleman/dipper

    def _process_ortholog_classes(self, limit=None):
        """
        This method add the KEGG orthology classes to the graph.

        The class name is split on ';' and ','.  The first piece is used
        as the label; when there is more than one piece, the remaining
        pieces are added as synonyms and the last piece is also added as
        the description.  If there's an embedded enzyme commission number
        in the description, that is added as an xref.

        Triples created:
        <orthology_class_id> is a class
        <orthology_class_id> has label <orthology_symbols>
        <orthology_class_id> has synonym <other symbols/names>
        <orthology_class_id> has description <orthology_description>
        <orthology_class_id> has xref <EC number>

        :param limit: in non-test mode, stop once more than this many
            rows have been read; None means no limit

        :return: None
        """

        logger.info("Processing ortholog classes")
        g = self.testgraph if self.testMode else self.graph
        line_counter = 0
        gu = GraphUtils(curie_map.get())
        raw = '/'.join((self.rawdir, self.files['ortholog_classes']['file']))
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (orthology_class_id, orthology_class_name) = row

                if self.testMode and \
                        orthology_class_id not in \
                        self.test_ids['orthology_classes']:
                    continue

                # The orthology class is essentially a KEGG gene ID
                # that is species agnostic.
                # Add the ID and label as a gene family class

                other_labels = re.split(r'[;,]', orthology_class_name)
                # the first one is the label we'll use
                orthology_label = other_labels[0]

                orthology_class_id = 'KEGG-'+orthology_class_id.strip()

                orthology_type = OrthologyAssoc.terms['gene_family']
                gu.addClassToGraph(g, orthology_class_id, orthology_label,
                                   orthology_type)
                if len(other_labels) > 1:
                    # add the rest as synonyms; the first piece is already
                    # the label, so it is not repeated as a synonym
                    for synonym in other_labels[1:]:
                        gu.addSynonym(g, orthology_class_id, synonym.strip())

                    # add the last one as the description, without the
                    # whitespace left over from splitting on the delimiter
                    description = other_labels[-1].strip()
                    gu.addDescription(g, orthology_class_id, description)

                    # add the enzyme commission number (EC:1.2.99.5)as an xref
                    # sometimes there's two, like [EC:1.3.5.1 1.3.5.4]
                    # can also have a dash, like EC:1.10.3.-
                    # (findall returns an empty list when nothing matches)
                    for ecm in re.findall(
                            r'((?:\d+|\.|-){5,7})', description):
                        gu.addXref(g, orthology_class_id, 'EC:'+ecm)

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

        logger.info("Done with ortholog classes")
        return

Пример #9

Показать файл

Файл: IMPC.py Проект: JervenBolleman/dipper

    def _process_data(self, raw, limit=None):
        logger.info("Processing Data from %s", raw)
        gu = GraphUtils(curie_map.get())

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        geno = Genotype(g)
        line_counter = 0
        gu.loadAllProperties(g)
        gu.loadObjectProperties(g, geno.object_properties)

        # Add the taxon as a class
        taxon_id = 'NCBITaxon:10090'  # map to Mus musculus
        gu.addClassToGraph(g, taxon_id, None)

        # with open(raw, 'r', encoding="utf8") as csvfile:
        with gzip.open(raw, 'rt') as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            next(filereader, None)  # skip the header row
            for row in filereader:
                line_counter += 1

                (marker_accession_id, marker_symbol, phenotyping_center,
                 colony, sex, zygosity, allele_accession_id, allele_symbol,
                 allele_name, strain_accession_id, strain_name, project_name,
                 project_fullname, pipeline_name, pipeline_stable_id,
                 procedure_stable_id, procedure_name, parameter_stable_id,
                 parameter_name, top_level_mp_term_id, top_level_mp_term_name,
                 mp_term_id, mp_term_name, p_value, percentage_change,
                 effect_size, statistical_method, resource_name) = row

                if self.testMode and marker_accession_id not in self.test_ids:
                    continue

                # ##### cleanup some of the identifiers ######
                zygosity_id = self._map_zygosity(zygosity)

                # colony ids sometimes have <> in them, spaces,
                # or other non-alphanumerics and break our system;
                # replace these with underscores
                colony_id = '_'+re.sub(r'\W+', '_', colony)
                if self.nobnodes:
                    colony_id = ':'+colony_id

                if not re.match(r'MGI', allele_accession_id):
                    allele_accession_id = \
                        '_IMPC-'+re.sub(r':', '', allele_accession_id)
                    if self.nobnodes:
                        allele_accession_id = ':'+allele_accession_id
                if re.search(r'EUROCURATE', strain_accession_id):
                    # the eurocurate links don't resolve at IMPC
                    strain_accession_id = '_'+strain_accession_id
                    if self.nobnodes:
                        strain_accession_id = ':'+strain_accession_id
                elif not re.match(r'MGI', strain_accession_id):
                    logger.info(
                        "Found a strange strain accession...%s",
                        strain_accession_id)
                    strain_accession_id = 'IMPC:'+strain_accession_id

                ######################
                # first, add the marker and variant to the graph as with MGI,
                # the allele is the variant locus.  IF the marker is not known,
                # we will call it a sequence alteration.  otherwise,
                # we will create a BNode for the sequence alteration.
                sequence_alteration_id = variant_locus_id = None
                variant_locus_name = sequence_alteration_name = None

                # extract out what's within the <> to get the symbol
                if re.match(r'.*<.*>', allele_symbol):
                    sequence_alteration_name = \
                        re.match(r'.*<(.*)>', allele_symbol).group(1)
                else:
                    sequence_alteration_name = allele_symbol

                if marker_accession_id is not None and \
                        marker_accession_id == '':
                    logger.warning(
                        "Marker unspecified on row %d", line_counter)
                    marker_accession_id = None

                if marker_accession_id is not None:
                    variant_locus_id = allele_accession_id
                    variant_locus_name = allele_symbol
                    variant_locus_type = geno.genoparts['variant_locus']
                    geno.addGene(marker_accession_id, marker_symbol,
                                 geno.genoparts['gene'])
                    geno.addAllele(variant_locus_id, variant_locus_name,
                                   variant_locus_type, None)
                    geno.addAlleleOfGene(variant_locus_id, marker_accession_id)

                    sequence_alteration_id = \
                        '_seqalt'+re.sub(r':', '', allele_accession_id)
                    if self.nobnodes:
                        sequence_alteration_id = ':'+sequence_alteration_id
                    geno.addSequenceAlterationToVariantLocus(
                        sequence_alteration_id, variant_locus_id)

                else:
                    sequence_alteration_id = allele_accession_id

                # IMPC contains targeted mutations with either gene traps,
                # knockouts, insertion/intragenic deletions.
                # but I don't really know what the SeqAlt is here,
                # so I don't add it.
                geno.addSequenceAlteration(sequence_alteration_id,
                                           sequence_alteration_name)

                # #############    BUILD THE COLONY    #############
                # First, let's describe the colony that the animals come from
                # The Colony ID refers to the ES cell clone
                #   used to generate a mouse strain.
                # Terry sez: we use this clone ID to track
                #   ES cell -> mouse strain -> mouse phenotyping.
                # The same ES clone maybe used at multiple centers,
                # so we have to concatenate the two to have a unique ID.
                # some useful reading about generating mice from ES cells:
                # http://ki.mit.edu/sbc/escell/services/details

                # here, we'll make a genotype
                # that derives from an ES cell with a given allele.
                # the strain is not really attached to the colony.

                # the colony/clone is reflective of the allele,
                # with unknown zygosity
                stem_cell_class = 'ERO:0002002'
                gu.addIndividualToGraph(g, colony_id, colony, stem_cell_class)

                # vslc of the colony has unknown zygosity
                # note that we will define the allele
                # (and it's relationship to the marker, etc.) later
                # FIXME is it really necessary to create this vslc
                # when we always know it's unknown zygosity?
                vslc_colony = \
                    '_'+allele_accession_id+geno.zygosity['indeterminate']
                vslc_colony = re.sub(r':', '', vslc_colony)
                if self.nobnodes:
                    vslc_colony = ':'+vslc_colony
                vslc_colony_label = allele_symbol+'/<?>'
                # for ease of reading, we make the colony genotype variables.
                # in the future, it might be desired to keep the vslcs
                colony_genotype_id = vslc_colony
                colony_genotype_label = vslc_colony_label
                geno.addGenotype(colony_genotype_id, colony_genotype_label)
                geno.addParts(allele_accession_id, colony_genotype_id,
                              geno.object_properties['has_alternate_part'])
                geno.addPartsToVSLC(
                    vslc_colony, allele_accession_id, None,
                    geno.zygosity['indeterminate'],
                    geno.object_properties['has_alternate_part'])
                gu.addTriple(
                    g, colony_id,
                    geno.object_properties['has_genotype'],
                    colony_genotype_id)

                # ##########    BUILD THE ANNOTATED GENOTYPE    ##########
                # now, we'll build the genotype of the individual that derives
                # from the colony/clone genotype that is attached to
                # phenotype = colony_id + strain + zygosity + sex
                # (and is derived from a colony)

                # this is a sex-agnostic genotype
                genotype_id = \
                    self.make_id(
                        (colony_id + phenotyping_center + zygosity +
                         strain_accession_id))
                geno.addSequenceDerivesFrom(genotype_id, colony_id)

                # build the VSLC of the sex-agnostic genotype
                # based on the zygosity
                allele1_id = allele_accession_id
                allele2_id = allele2_rel = None
                allele1_label = allele_symbol
                allele2_label = '<?>'
                # Making VSLC labels from the various parts,
                # can change later if desired.
                if zygosity == 'heterozygote':
                    allele2_label = re.sub(r'<.*', '<+>', allele1_label)
                    allele2_id = None
                elif zygosity == 'homozygote':
                    allele2_label = allele1_label
                    allele2_id = allele1_id
                    allele2_rel = geno.object_properties['has_alternate_part']
                elif zygosity == 'hemizygote':
                    allele2_label = re.sub(r'<.*', '<0>', allele1_label)
                    allele2_id = None
                elif zygosity == 'not_applicable':
                    allele2_label = re.sub(r'<.*', '<?>', allele1_label)
                    allele2_id = None
                else:
                    logger.warning("found unknown zygosity %s", zygosity)
                    break
                vslc_name = '/'.join((allele1_label, allele2_label))

                # Add the VSLC
                vslc_id = '_' + '-'.join((marker_accession_id,
                                          allele_accession_id, zygosity))
                vslc_id = re.sub(r':', '', vslc_id)
                if self.nobnodes:
                    vslc_id = ':'+vslc_id
                gu.addIndividualToGraph(
                    g, vslc_id, vslc_name,
                    geno.genoparts['variant_single_locus_complement'])
                geno.addPartsToVSLC(
                    vslc_id, allele1_id, allele2_id, zygosity_id,
                    geno.object_properties['has_alternate_part'],
                    allele2_rel)

                # add vslc to genotype
                geno.addVSLCtoParent(vslc_id, genotype_id)

                # note that the vslc is also the gvc
                gu.addType(
                    g, vslc_id,
                    Genotype.genoparts['genomic_variation_complement'])

                # Add the genomic background
                # create the genomic background id and name
                if strain_accession_id != '':
                    genomic_background_id = strain_accession_id
                else:
                    genomic_background_id = None

                genotype_name = vslc_name
                if genomic_background_id is not None:
                    geno.addGenotype(
                        genomic_background_id, strain_name,
                        geno.genoparts['genomic_background'])

                    # make a phenotyping-center-specific strain
                    # to use as the background
                    pheno_center_strain_label = \
                        strain_name + '/' + phenotyping_center
                    pheno_center_strain_id = \
                        '-'.join((re.sub(r':', '', genomic_background_id),
                                  re.sub(r'\s', '_', phenotyping_center)))
                    if not re.match(r'^_', pheno_center_strain_id):
                        pheno_center_strain_id = '_'+pheno_center_strain_id
                    if self.nobnodes:
                        pheno_center_strain_id = ':'+pheno_center_strain_id
                    geno.addGenotype(pheno_center_strain_id,
                                     pheno_center_strain_label,
                                     geno.genoparts['genomic_background'])
                    geno.addSequenceDerivesFrom(pheno_center_strain_id,
                                                genomic_background_id)

                    # Making genotype labels from the various parts,
                    # can change later if desired.
                    # since the genotype is reflective of the place
                    # it got made, should put that in to disambiguate
                    genotype_name = \
                        genotype_name+' ['+pheno_center_strain_label+']'
                    geno.addGenomicBackgroundToGenotype(
                        pheno_center_strain_id, genotype_id)
                    geno.addTaxon(pheno_center_strain_id, taxon_id)
                # this is redundant, but i'll keep in in for now
                geno.addSequenceDerivesFrom(genotype_id, colony_id)
                genotype_name += '['+colony+']'
                geno.addGenotype(genotype_id, genotype_name)

                # Make the sex-qualified genotype,
                # which is what the phenotype is associated with
                sex_qualified_genotype_id = \
                    self.make_id(
                        (colony_id + phenotyping_center + zygosity +
                         strain_accession_id+sex))
                sex_qualified_genotype_label = genotype_name+' ('+sex+')'
                if sex == 'male':
                    sq_type_id = geno.genoparts['male_genotype']
                elif sex == 'female':
                    sq_type_id = geno.genoparts['female_genotype']
                else:
                    sq_type_id = geno.genoparts['sex_qualified_genotype']

                geno.addGenotype(
                    sex_qualified_genotype_id,
                    sex_qualified_genotype_label, sq_type_id)
                geno.addParts(
                    genotype_id, sex_qualified_genotype_id,
                    geno.object_properties['has_alternate_part'])

                if genomic_background_id is not None and \
                        genomic_background_id != '':
                    # Add the taxon to the genomic_background_id
                    geno.addTaxon(taxon_id, genomic_background_id)
                else:
                    # add it as the genomic background
                    geno.addTaxon(taxon_id, genotype_id)

                # #############    BUILD THE G2P ASSOC    #############
                # from an old email dated July 23 2014:
                # Phenotypes associations are made to
                # imits colony_id+center+zygosity+gender

                phenotype_id = mp_term_id

                # it seems that sometimes phenotype ids are missing.
                # indicate here
                if phenotype_id is None or phenotype_id == '':
                    logger.warning(
                        "No phenotype id specified for row %d: %s",
                        line_counter, str(row))
                    continue
                # experimental_phenotypic_evidence This was used in ZFIN
                eco_id = "ECO:0000059"

                # the association comes as a result of a g2p from
                # a procedure in a pipeline at a center and parameter tested

                assoc = G2PAssoc(self.name, sex_qualified_genotype_id,
                                 phenotype_id)
                assoc.add_evidence(eco_id)
                # assoc.set_score(float(p_value))

                # TODO add evidence instance using
                # pipeline_stable_id +
                # procedure_stable_id +
                # parameter_stable_id

                assoc.add_association_to_graph(g)
                assoc_id = assoc.get_association_id()

                # add a free-text description
                description = \
                    ' '.join((mp_term_name, 'phenotype determined by',
                              phenotyping_center, 'in an',
                              procedure_name, 'assay where',
                              parameter_name.strip(),
                              'was measured with an effect_size of',
                              str(round(float(effect_size), 5)),
                              '(p =', "{:.4e}".format(float(p_value)), ').'))

                gu.addDescription(g, assoc_id, description)

                # TODO add provenance information
                # resource_id = resource_name
                # assoc.addSource(g, assoc_id, resource_id)

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

        gu.loadProperties(g, G2PAssoc.object_properties, gu.OBJPROP)
        gu.loadProperties(g, G2PAssoc.annotation_properties, gu.ANNOTPROP)
        gu.loadProperties(g, G2PAssoc.datatype_properties, gu.DATAPROP)

        return

Пример #10

Показать файл

Файл: OMIA.py Проект: JervenBolleman/dipper

class OMIA(Source):
    """
    This is the parser for the
    [Online Mendelian Inheritance in Animals
    (OMIA)](http://omia.angis.org.au),
    from which we process inherited disorders, other (single-locus) traits,
    and genes in >200 animal species (other than human and mouse and rats).

    We generate the omia graph to include the following information:
    * genes
    * animal taxonomy, and breeds as instances of those taxa
        (breeds are akin to "strains" in other taxa)
    * animal diseases, along with species-specific subtypes of those diseases
    * publications (and their mapping to PMIDs, if available)
    * gene-to-phenotype associations (via an anonymous variant-locus)
    * breed-to-phenotype associations

    We make links between OMIA and OMIM in two ways:
    1.  mappings between OMIA and OMIM are created as OMIA --> hasdbXref OMIM
    2.  mappings between a breed and OMIA disease are created
        to be a model for the mapped OMIM disease,
        IF AND ONLY IF it is a 1:1 mapping.
        there are some 1:many mappings,
        and these often happen if the OMIM item is a gene.

    Because many of these species are not covered in
    the PANTHER orthology datafiles, we also pull any orthology
    relationships from the gene_group files from NCBI.

    """

    files = {
        'data': {
            'file': 'omia.xml.gz',
            'url': 'http://omia.angis.org.au/dumps/omia.xml.gz'},
    }

    def __init__(self):
        """
        Initialise the OMIA source.

        Registers the source under the name 'omia', loads the namespace
        bindings, attaches the dataset description, and creates the
        lookup tables that the table-row handlers fill in and read back.
        """
        Source.__init__(self, 'omia')
        self.load_bindings()

        self.dataset = Dataset(
            'omia',
            'Online Mendelian Inheritance in Animals',
            'http://omia.angis.org.au',
            None,
            None,
            'http://sydney.edu.au/disclaimer.shtml')

        # one internal-key -> external-id map per table we cross-reference
        self.id_hash = {
            table: {}
            for table in ('article', 'phene', 'breed', 'taxon', 'gene')}
        # identifier -> label, filled while processing the class tables
        self.label_hash = {}
        self.gu = GraphUtils(curie_map.get())
        # omia -> omim phene mappings
        self.omia_omim_map = {}
        # the unique genes that have phenes (used to fetch orthology)
        self.annotated_genes = set()

        test_diseases = [
            'OMIA:001702', 'OMIA:001867', 'OMIA:000478', 'OMIA:000201',
            'OMIA:000810', 'OMIA:001400']
        test_genes = [
            492297, 434, 492296, 3430235, 200685834, 394659996, 200685845,
            28713538, 291822383]
        test_taxa = [9691, 9685, 9606, 9615, 9913, 93934, 37029, 9627, 9825]
        self.test_ids = {
            'disease': test_diseases,
            'gene': test_genes,
            'taxon': test_taxa,
            # filled in while parsing the breed table,
            # then consulted by the breed associations
            'breed': [],
        }

        # omia id -> molecular info, written out as a report for curation
        self.stored_omia_mol_gen = {}
        self.g = self.graph
        self.geno = Genotype(self.g)

    def fetch(self, is_dl_forced=False):
        """
        Fetch the OMIA files and the NCBI gene_group file.

        The gene_group file is described by the NCBIGene source and is
        saved under that source's raw directory.

        :param is_dl_forced: handed on to get_files()
        :return: None
        """
        self.get_files(is_dl_forced)

        # NCBIGene's own fetch() is deliberately not called here;
        # only its gene_group file is pulled.
        ncbi_source = NCBIGene()
        group_file = ncbi_source.files['gene_group']
        source_url = group_file['url']
        target_path = '/'.join((ncbi_source.rawdir, group_file['file']))
        self.fetch_from_url(source_url, target_path, False)

    def parse(self, limit=None):
        """
        Scrub the raw file, then run the three passes over it and
        finish with orthology, bindings and the curation report.

        :param limit: forwarded to each pass; None means no limit is logged
        :return: None
        """
        # names of tables to iterate - probably don't need all these:
        # Article_Breed, Article_Keyword, Article_Gene, Article_Keyword,
        # Article_People, Article_Phene, Articles, Breed, Breed_Phene,
        # Genes_gb, Group_Categories, Group_MPO, Inherit_Type, Keywords,
        # Landmark, Lida_Links, OMIA_Group, OMIA_author, Omim_Xref, People,
        # Phene, Phene_Gene, Publishers, Resources, Species_gb, Synonyms

        self.scrub()

        if limit is not None:
            logger.info("Only parsing first %d rows", limit)

        logger.info("Parsing files...")

        if self.testOnly:
            self.testMode = True

        # pick the graph to write into and bind a Genotype helper to it
        self.g = self.testgraph if self.testMode else self.graph
        self.geno = Genotype(self.g)

        # three passes through the file, in this order:
        #   1. species (the two other passes reference it)
        #   2. breeds, genes, articles, and other static stuff
        #   3. the association data
        passes = (
            self.process_species,
            self.process_classes,
            self.process_associations,
        )
        for run_pass in passes:
            run_pass(limit)

        # vertebrate orthology for the genes annotated with phenotypes
        NCBIGene().add_orthologs_by_gene_group(self.g, self.annotated_genes)

        self.load_core_bindings()
        self.load_bindings()

        logger.info("Done parsing.")

        self.write_molgen_report()

    def scrub(self):
        """
        Rewrite the raw data file with control characters removed.

        The XML file seems to have mixed-encoding, so every line is passed
        through DipperUtil.remove_control_characters() and written, UTF-8
        encoded and newline-terminated, to a temporary gzip file that then
        replaces the original.

        Both gzip handles are managed by a ``with`` statement so they are
        closed even if reading or scrubbing a line raises, and the
        temporary file is only created once the input has been opened.

        :return: None
        """

        logger.info(
            "Scrubbing out the nasty characters that break our parser.")

        myfile = '/'.join((self.rawdir, self.files['data']['file']))
        tmpfile = '/'.join((self.rawdir, self.files['data']['file']+'.tmp.gz'))
        du = DipperUtil()
        with gzip.open(myfile, 'rb') as raw, \
                gzip.open(tmpfile, 'wb') as scrubbed:
            filereader = io.TextIOWrapper(raw, newline="")
            for line in filereader:
                line = du.remove_control_characters(line) + '\n'
                scrubbed.write(line.encode('utf-8'))

        # move the temp file (both handles are closed at this point)
        logger.info("Replacing the original data with the scrubbed file.")
        shutil.move(tmpfile, myfile)
        return

    # ###################### XML LOOPING FUNCTIONS ##################

    def process_species(self, limit):
        """
        Loop through the xml file and process the species.
        We add elements to the graph, and store the
        id-to-label in the label_hash dict.

        The gzip handle is opened in a ``with`` block so it is released
        even when parsing or a row handler raises.

        :param limit: forwarded to process_xml_table()
        :return: None
        """

        myfile = '/'.join((self.rawdir, self.files['data']['file']))

        with gzip.open(myfile, 'rb') as f:
            filereader = io.TextIOWrapper(f, newline="")

            filereader.readline()  # remove the xml declaration line

            for _event, elem in ET.iterparse(filereader):
                # Species ids are == genbank species ids!
                self.process_xml_table(
                    elem, 'Species_gb', self._process_species_table_row,
                    limit)

        return

    def process_classes(self, limit):
        """
        Loop through the xml file and process the articles,
        breed, genes, phenes, and phenotype-grouping classes.
        We add elements to the graph,
        and store the id-to-label in the label_hash dict,
        along with the internal key-to-external id in the id_hash dict.
        The latter are referenced in the association processing functions.

        The gzip handle is opened in a ``with`` block so it is released
        even when parsing or a row handler raises.

        :param limit: forwarded to process_xml_table()
        :return: None

        """

        myfile = '/'.join((self.rawdir, self.files['data']['file']))

        # (table name, row handler) pairs, tried in this order per element
        handlers = (
            ('Articles', self._process_article_row),
            ('Breed', self._process_breed_row),
            ('Genes_gb', self._process_gene_row),
            ('OMIA_Group', self._process_omia_group_row),
            ('Phene', self._process_phene_row),
            ('Omim_Xref', self._process_omia_omim_map),
        )

        with gzip.open(myfile, 'rb') as f:
            filereader = io.TextIOWrapper(f, newline="")

            filereader.readline()  # remove the xml declaration line

            parser = ET.XMLParser(encoding='utf-8')

            for _event, elem in ET.iterparse(filereader, parser=parser):
                for table_name, row_handler in handlers:
                    self.process_xml_table(
                        elem, table_name, row_handler, limit)

        # post-process the omia-omim associations to filter out the genes
        # (keep only phenotypes/diseases)
        self.clean_up_omim_genes()

        return

    def process_associations(self, limit):
        """
        Loop through the xml file and process the article-breed, article-phene,
        breed-phene, phene-gene associations, and the external links to LIDA.

        The gzip handle is opened in a ``with`` block so it is released
        even when parsing or a row handler raises.

        :param limit: forwarded to process_xml_table()
        :return: None

        """

        myfile = '/'.join((self.rawdir, self.files['data']['file']))

        # (table name, row handler) pairs, tried in this order per element
        handlers = (
            ('Article_Breed', self._process_article_breed_row),
            ('Article_Phene', self._process_article_phene_row),
            ('Breed_Phene', self._process_breed_phene_row),
            ('Lida_Links', self._process_lida_links_row),
            ('Phene_Gene', self._process_phene_gene_row),
            ('Group_MPO', self._process_group_mpo_row),
        )

        with gzip.open(myfile, 'rb') as f:
            filereader = io.TextIOWrapper(f, newline="")

            filereader.readline()  # remove the xml declaration line

            for _event, elem in ET.iterparse(filereader):
                for table_name, row_handler in handlers:
                    self.process_xml_table(
                        elem, table_name, row_handler, limit)

        return

    # ############ INDIVIDUAL TABLE-LEVEL PROCESSING FUNCTIONS ################

    def _process_species_table_row(self, row):
        """
        Add one Species_gb row to the graph as a taxon class and remember
        a label for it (the common name when present, otherwise the
        scientific name).

        :param row: mapping with gb_species_id, sci_name, com_name,
            added_by, date_modified
        :return: None
        """
        species_key = row['gb_species_id']
        tax_id = 'NCBITaxon:' + str(species_key)
        sci_name = row['sci_name']
        com_name = row['com_name']

        # in test mode only the configured test taxa are kept
        if self.testMode and int(species_key) not in self.test_ids['taxon']:
            return

        self.gu.addClassToGraph(self.g, tax_id, sci_name)

        if com_name == '':
            self.label_hash[tax_id] = sci_name
            return

        self.gu.addSynonym(self.g, tax_id, com_name)
        self.label_hash[tax_id] = com_name  # for lookup later

    def _process_breed_row(self, row):
        """
        Add one Breed row to the graph as an individual of its taxon,
        labelled "<breed name> (<species label>)" when the species label
        is known, and record its id and label for later lookups.

        :param row: mapping with gb_species_id, breed_id, breed_name
        :return: None
        """
        species_key = row['gb_species_id']

        # in test mode, keep all breeds of our test species
        if self.testMode and int(species_key) not in self.test_ids['taxon']:
            return

        breed_key = row['breed_id']

        # save the breed keys in the test_ids for later processing
        self.test_ids['breed'].append(int(breed_key))

        breed_id = self.make_breed_id(breed_key)
        self.id_hash['breed'][breed_key] = breed_id

        tax_id = 'NCBITaxon:' + str(species_key)
        breed_label = row['breed_name']
        species_label = self.label_hash.get(tax_id)
        if species_label is not None:
            breed_label += ' (' + species_label + ')'

        self.gu.addIndividualToGraph(self.g, breed_id, breed_label, tax_id)
        self.label_hash[breed_id] = breed_label

    def _process_phene_row(self, row):
        """
        Add one row of the Phene table (a species-specific phene) to the graph.

        The species-specific class id is the OMIA id joined by '-' with the
        species id; it is stored in id_hash['phene'] and label_hash for the
        association passes.  Populated free-text columns are added as
        descriptions tagged with their column name, an in_taxon restriction
        is added, an inheritance association is made when the inheritance
        term maps to an id, and rows flagged as characterised are kept for
        the molecular-genetics report.

        Rows without an 'omia_id' key get an internal id derived from the
        row's own phene_id.  (Previously the id was derived from a variable
        that was always None, so all such rows shared one id.)

        :param row: mapping of column name -> value for one Phene record
        :return: None
        """

        sp_phene_label = row['phene_name']
        if sp_phene_label == '':
            sp_phene_label = None
        if 'omia_id' not in row:
            logger.info("omia_id not present for %s", row['phene_id'])
            # key the fallback id on this row, the same way article rows
            # build their internal ids from article_id
            omia_id = self._make_internal_id('phene', row['phene_id'])
        else:
            omia_id = 'OMIA:'+str(row['omia_id'])

        # in test mode keep only test diseases in test taxa
        if self.testMode and not\
                (int(row['gb_species_id']) in self.test_ids['taxon'] and
                 omia_id in self.test_ids['disease']):
            return
        # add to internal hash store for later lookup
        # (overwritten with the species-specific id further down)
        self.id_hash['phene'][row['phene_id']] = omia_id

        descr = row['summary']
        if descr == '':
            descr = None

        # omia label
        omia_label = self.label_hash.get(omia_id)

        # add the species-specific subclass (TODO please review this choice)
        gb_species_id = row['gb_species_id']

        if gb_species_id == '':
            logger.error(
                "No species supplied in species-specific phene table for %s",
                omia_id)
            return
        sp_phene_id = '-'.join((omia_id, gb_species_id))

        species_id = 'NCBITaxon:'+str(gb_species_id)
        species_label = self.label_hash.get(species_id)
        # fall back to "<omia label> in <species label>" when the row
        # carries no name of its own and both parts are known
        if sp_phene_label is None and \
                omia_label is not None and species_label is not None:
            sp_phene_label = ' '.join((omia_label, 'in', species_label))
        self.gu.addClassToGraph(
            self.g, sp_phene_id, sp_phene_label, omia_id, descr)
        # add to internal hash store for later lookup
        self.id_hash['phene'][row['phene_id']] = sp_phene_id
        self.label_hash[sp_phene_id] = sp_phene_label
        # add each of the following descriptions,
        # if they are populated, with a tag at the end.
        for item in [
                'clin_feat', 'history', 'pathology', 'mol_gen', 'control']:
            if row[item] is not None and row[item] != '':
                self.gu.addDescription(
                    self.g, sp_phene_id, row[item] + ' ['+item+']')
        # if row['symbol'] is not None:  # species-specific
        # CHECK ME - sometimes spaces or gene labels
        #     gu.addSynonym(g, sp_phene, row['symbol'])

        self.gu.addOWLPropertyClassRestriction(
            self.g, sp_phene_id, self.gu.object_properties['in_taxon'],
            species_id)

        # add inheritance as an association
        inheritance_id = self._map_inheritance_term_id(row['inherit'])
        if inheritance_id is not None:
            assoc = DispositionAssoc(self.name, sp_phene_id, inheritance_id)
            assoc.add_association_to_graph(self.g)

        # remember the molecular details for write_molgen_report()
        if row['characterised'] == 'Yes':
            self.stored_omia_mol_gen[omia_id] = {
                'mol_gen': row['mol_gen'],
                'map_info': row['map_info'],
                'species': row['gb_species_id']}

        return

    def write_molgen_report(self):
        """
        Dump the stored molecular-genetics notes to a tab-delimited file,
        'omia_molgen_report.txt' in the output directory, one line per
        stored OMIA id preceded by a header line.

        :return: None
        """
        import csv
        logger.info("Writing G2P report for OMIA")
        f = '/'.join((self.outdir, 'omia_molgen_report.txt'))

        header = ['omia_id', 'molecular_description', 'mapping_info', 'species']
        with open(f, 'w', newline='\n') as report:
            tsv = csv.writer(report, delimiter='\t')
            tsv.writerow(header)
            for omia_id, details in self.stored_omia_mol_gen.items():
                tsv.writerow((
                    str(omia_id),
                    details['mol_gen'],
                    details['map_info'],
                    details['species']))

        logger.info(
            "Wrote %d potential G2P descriptions for curation to %s",
            len(self.stored_omia_mol_gen), f)

    def _process_article_row(self, row):
        """
        Add one Articles row to the graph as a Reference and record its id.

        The article is first stored under an internal id; when the row has
        a pubmed_id the stored id is replaced by the PMID, which is also
        linked to the internal id in the graph.

        :param row: mapping with article_id, journal, title, year, pubmed_id
        :return: None
        """
        # don't bother in test mode
        if self.testMode:
            return

        article_key = row['article_id']
        iarticle_id = self._make_internal_id('article', article_key)
        self.id_hash['article'][article_key] = iarticle_id

        # only rows that name a journal get the journal_article type
        rtype = (
            Reference.ref_types['journal_article']
            if row['journal'] != '' else None)
        ref = Reference(iarticle_id, rtype)

        title = row['title']
        if title is not None:
            ref.setTitle(title.strip())
        year = row['year']
        if year is not None:
            ref.setYear(year)
        ref.addRefToGraph(self.g)

        pubmed_key = row['pubmed_id']
        if pubmed_key is None:
            return

        pmid = 'PMID:' + str(pubmed_key)
        self.id_hash['article'][article_key] = pmid
        self.gu.addSameIndividual(self.g, iarticle_id, pmid)
        self.gu.addComment(self.g, pmid, iarticle_id)

    def _process_omia_group_row(self, row):
        """
        Add one OMIA_Group row to the graph as a class and store its label.

        The group category is mapped to an ontology id that is passed as
        the fourth argument of addClassToGraph; 'DOID:4' is used when
        nothing maps.  A category mapping to 'MP:0008762' is instead
        recorded as a D2PAssoc and no fourth-argument id is passed.

        :param row: mapping with omia_id, group_name, group_summary and
            optionally group_category
        :return: None
        """
        omia_id = 'OMIA:' + row['omia_id']

        if self.testMode and omia_id not in self.test_ids['disease']:
            return

        group_name = row['group_name']
        group_summary = row['group_summary']

        disease_id = self.map_omia_group_category_to_ontology_id(
            row.get('group_category'))

        if disease_id is None:
            logger.info(
                "No disease superclass defined for %s:  %s",
                omia_id, group_name)
            # default to general disease  FIXME this may not be desired
            disease_id = 'DOID:4'
        else:
            self.gu.addClassToGraph(self.g, disease_id, None)
            if disease_id == 'MP:0008762':  # embryonic lethal
                # add this as a phenotype association
                # add embryonic onset
                lethal_assoc = D2PAssoc(self.name, omia_id, disease_id)
                lethal_assoc.add_association_to_graph(self.g)
                disease_id = None

        # empty strings are passed on as None
        group_summary = None if group_summary == '' else group_summary
        group_name = None if group_name == '' else group_name

        self.gu.addClassToGraph(
            self.g, omia_id, group_name, disease_id, group_summary)

        self.label_hash[omia_id] = group_name

    def _process_gene_row(self, row):
        """
        Add one Genes_gb row to the graph as a gene class with its taxon,
        and record its id and label for later lookups.

        In test mode only genes listed in test_ids['gene'] are kept.  The
        comparison is done on string forms: the test ids are integers,
        while row values appear to be strings (other row fields in this
        class are concatenated as strings), so comparing the raw value
        would filter out every gene.

        :param row: mapping with gene_id, symbol, gb_species_id, gene_type
        :return: None
        """
        if self.testMode:
            test_genes = {str(gene) for gene in self.test_ids['gene']}
            if str(row['gene_id']) not in test_genes:
                return
        gene_id = 'NCBIGene:'+str(row['gene_id'])
        self.id_hash['gene'][row['gene_id']] = gene_id
        gene_label = row['symbol']
        self.label_hash[gene_id] = gene_label
        tax_id = 'NCBITaxon:'+str(row['gb_species_id'])
        gene_type_id = NCBIGene.map_type_of_gene(row['gene_type'])
        self.gu.addClassToGraph(self.g, gene_id, gene_label, gene_type_id)
        self.geno.addTaxon(tax_id, gene_id)

        return

    def _process_article_breed_row(self, row):
        """
        Link an article to a breed with an 'is_about' triple.

        :param row: mapping with article_id, breed_id, added_by
        :return: None
        """
        # don't bother putting these into the test... too many!
        # and int(row['breed_id']) not in self.test_ids['breed']:
        if self.testMode:
            return

        article_id = self.id_hash['article'].get(row['article_id'])
        breed_id = self.id_hash['breed'].get(row['breed_id'])

        # there's some missing data (article=6038).  in that case skip
        if article_id is None:
            logger.warning("Missing article key %s", str(row['article_id']))
            return

        is_about = self.gu.object_properties['is_about']
        self.gu.addTriple(self.g, article_id, is_about, breed_id)

    def _process_article_phene_row(self, row):
        """
        Link an article to a species-specific phene with an
        'is_about' triple.

        Rows are skipped when either id is unknown, or, in test mode,
        when the phene's OMIA id is not one of the test diseases.

        :param row: mapping with article_id, phene_id, added_by
        :return: None
        """
        # look both keys up in the hashmaps built by the class pass
        phenotype_id = self.id_hash['phene'].get(row['phene_id'])
        article_id = self.id_hash['article'].get(row['article_id'])

        omia_id = self._get_omia_id_from_phene_id(phenotype_id)

        if self.testMode and omia_id not in self.test_ids['disease']:
            return
        if phenotype_id is None or article_id is None:
            return

        # make a triple, where the article is about the phenotype
        is_about = self.gu.object_properties['is_about']
        self.gu.addTriple(self.g, article_id, is_about, phenotype_id)

    def _process_breed_phene_row(self, row):
        """
        Link a breed to a phene, and to any OMIM diseases mapped to the
        phene's OMIA id.

        A 'has_phenotype' association is always made for a known
        breed/phene pair.  For every OMIM id mapped to the OMIA id, a
        'model_of' association with evidence is added together with a
        free-text description.

        When building the description, a trailing " in <species>" is
        removed from the phene label only if the breed label actually
        carries a parenthesised species, and the species text is matched
        literally.  (An empty species label used to strip every " in "
        from the phene label, and the species text was used as a raw
        regular expression.)

        :param row: mapping with breed_id, phene_id, added_by
        :return: None
        """
        # Linking disorders/characteristic to breeds
        breed_id = self.id_hash['breed'].get(row['breed_id'])
        phene_id = self.id_hash['phene'].get(row['phene_id'])

        # get the omia id
        omia_id = self._get_omia_id_from_phene_id(phene_id)

        # in test mode keep only test diseases in test breeds
        if self.testMode and not (
                omia_id in self.test_ids['disease'] and
                int(row['breed_id']) in self.test_ids['breed']):
            return
        if breed_id is None or phene_id is None:
            return

        # FIXME we want a different relationship here
        assoc = G2PAssoc(
            self.name, breed_id, phene_id,
            self.gu.object_properties['has_phenotype'])
        assoc.add_association_to_graph(self.g)

        # add that the breed is a model of the human disease
        # use the omia-omim mappings for this
        # we assume that we have already scrubbed out the genes
        # from the omim list, so we can make the model associations here

        omim_ids = self.omia_omim_map.get(omia_id)
        if not omim_ids:
            return

        if len(omim_ids) > 1:
            logger.info(
                "There's 1:many omia:omim mapping: %s, %s",
                omia_id, str(omim_ids))

        eco_id = "ECO:0000214"   # biological aspect of descendant evidence

        # the labels do not depend on the omim id, so work them out once
        breed_label = self.label_hash.get(breed_id)
        if breed_label is None:
            breed_label = "this breed"

        # breed labels built by _process_breed_row end in "(<species>)"
        m = re.search(r'\((.*)\)', breed_label)
        sp_label = m.group(1) if m else ''

        phene_label = self.label_hash.get(phene_id)
        if phene_label is None:
            phene_label = "phenotype"
        elif sp_label and phene_label.endswith(sp_label):
            # some of the labels we made already include the species;
            # remove it to make a cleaner desc
            phene_label = re.sub(
                r' in ' + re.escape(sp_label), '', phene_label)

        for omim_id in omim_ids:
            assoc = G2PAssoc(
                self.name, breed_id, omim_id,
                self.gu.object_properties['model_of'])
            assoc.add_evidence(eco_id)
            assoc.add_association_to_graph(self.g)
            aid = assoc.get_association_id()

            desc = ' '.join(
                ("High incidence of", phene_label, "in", breed_label,
                 "suggests it to be a model of disease", omim_id + "."))
            self.gu.addDescription(self.g, aid, desc)
        return

    def _process_lida_links_row(self, row):
        # lidaurl, omia_id, added_by
        omia_id = 'OMIA:'+row['omia_id']
        lidaurl = row['lidaurl']

        if self.testMode and omia_id not in self.test_ids['disease']:
            return

        self.gu.addXref(self.g, omia_id, lidaurl, True)

        return

    def _process_phene_gene_row(self, row):

        gene_id = self.id_hash['gene'].get(row['gene_id'])
        phene_id = self.id_hash['phene'].get(row['phene_id'])

        omia_id = self._get_omia_id_from_phene_id(phene_id)

        if self.testMode and not (
                omia_id in self.test_ids['disease'] and
                row['gene_id'] in self.test_ids['gene']) or\
                gene_id is None or phene_id is None:
            return

        # occasionally some phenes are missing!  (ex: 406)
        if phene_id is None:
            logger.warning("Phene id %s is missing", str(row['phene_id']))
            return

        gene_label = self.label_hash[gene_id]
        # some variant of gene_id has phenotype d
        vl = '_'+re.sub(r'NCBIGene:', '', str(gene_id)) + 'VL'
        if self.nobnodes:
            vl = ':'+vl
        self.geno.addAllele(vl, 'some variant of ' + gene_label)
        self.geno.addAlleleOfGene(vl, gene_id)
        assoc = G2PAssoc(self.name, vl, phene_id)
        assoc.add_association_to_graph(self.g)

        # add the gene id to the set of annotated genes
        # for later lookup by orthology
        self.annotated_genes.add(gene_id)

        return

    def _process_omia_omim_map(self, row):
        """
        Links OMIA groups to OMIM equivalents.
        :param row:
        :return:
        """
        # omia_id, omim_id, added_by

        omia_id = 'OMIA:'+row['omia_id']
        omim_id = 'OMIM:'+row['omim_id']

        # also store this for use when we say that a given animal is
        # a model of a disease
        if omia_id not in self.omia_omim_map:
            self.omia_omim_map[omia_id] = set()
        self.omia_omim_map[omia_id].add(omim_id)

        if self.testMode and omia_id not in self.test_ids['disease']:
            return

        self.gu.addXref(self.g, omia_id, omim_id)

        return

    def map_omia_group_category_to_ontology_id(self, category_num):
        """
        Translate a category number from the OMIA_groups table
        into an ontology id.
        This may be superseded by other MONDO methods.

        Platelet disorders will be more specific once
        https://github.com/obophenotype/human-disease-ontology/issues/46
        is fulfilled.

        :param category_num: category number (anything int() accepts),
            or None
        :return: the mapped id; None when the category is unknown
            or deliberately left unmapped

        """

        category_map = {
            1: 'DOID:0014667',      # Inborn error of metabolism
            2: 'MESH:D004392',      # Dwarfism
            3: 'DOID:1682',         # congenital heart disease
            4: 'DOID:74',           # blood system disease
            5: 'DOID:3211',         # lysosomal storage disease
            6: 'DOID:16',           # integumentary system disease
            # --> retinal degeneration ==> OMIA:000830
            7: 'DOID:8466',         # progressive retinal atrophy
            8: 'DOID:0050572',      # Cone–rod dystrophy
            9: 'MESH:C536122',      # stationary night blindness
            10: 'Orphanet:98553',   # developmental retinal disorder
            11: 'DOID:5679',        # retinal disorder
            12: 'Orphanet:90771',   # Disorder of Sex Development
            #  - what to do about this one?
            13: 'MP:0008762',       # embryonic lethal
            # - not sure what to do with this
            14: None,               # blood group
            # FIXME make me more specific
            15: 'DOID:2218',        # intrinsic platelet disorder
            # FIXME make me more specific
            16: 'DOID:2218',        # extrinsic platelet disorder
            17: None  # transgenic ???
        }

        # None is never a key, so it lands in the "unknown category" branch
        category_key = None if category_num is None else int(category_num)
        if category_key not in category_map:
            logger.info(
                "There's a group category I don't know anything about: %s",
                str(category_num))
            return None

        disease_id = category_map[category_key]
        logger.info(
            "Found %s for category %s", str(disease_id), str(category_num))

        return disease_id

    def _process_group_mpo_row(self, row):
        """
        Associate an OMIA id with an MP term.

        The MPO number is zero-padded to seven digits to build the MP id.

        :param row: record with the keys omia_id and MPO_no
        :return: None
        """
        omia_curie = 'OMIA:'+row['omia_id']
        mp_curie = 'MP:'+str(int(row['MPO_no'])).zfill(7)

        D2PAssoc(
            self.name, omia_curie, mp_curie).add_association_to_graph(self.g)

        return

    def clean_up_omim_genes(self):
        """
        Prune the omia-to-omim map down to phenotype entries.

        All mapped OMIM ids are handed to OMIM.process_entries together
        with filter_keep_phenotype_entry_ids; every id that is not in the
        returned collection is dropped from self.omia_omim_map.

        :return: None
        """
        omim = OMIM()
        # pool the omim ids of every omia group
        allomimids = set()
        for mapped in self.omia_omim_map.values():
            allomimids.update(mapped)

        entries_that_are_phenotypes = omim.process_entries(
            list(allomimids), filter_keep_phenotype_entry_ids, None, None)
        total = len(allomimids)
        logger.info(
            "Filtered out %d/%d entries that are genes or features",
            total-len(entries_that_are_phenotypes), total)

        # second pass: rebuild each group's set without the rejected ids
        removed_count = 0
        for omia, ids in self.omia_omim_map.items():
            removed_count += sum(
                1 for i in ids if i not in entries_that_are_phenotypes)
            self.omia_omim_map[omia] = {
                i for i in ids if i in entries_that_are_phenotypes}

        logger.info(
            "Removed %d omim ids from the omia-to-omim map", removed_count)

        return

    def _make_internal_id(self, prefix, key):

        iid = '_'+''.join(('omia', prefix, 'key', str(key)))
        if self.nobnodes:
            iid = ':'+iid

        return iid

    def make_breed_id(self, key):
        """
        Build the breed identifier for a key.

        :param key: breed key; converted with str()
        :return: 'OMIA-breed:' followed by the key
        """
        return ''.join(('OMIA-breed:', str(key)))

    @staticmethod
    def _get_omia_id_from_phene_id(phene_id):
        omia_id = None
        if phene_id is not None:
            m = re.match(r'OMIA:\d+', str(phene_id))
            if m:
                omia_id = m.group(0)

        return omia_id

    @staticmethod
    def _map_inheritance_term_id(inheritance_symbol):

        inherit_map = {
            'A':  None,  # Autosomal
            'ACD': 'GENO:0000143',  # Autosomal co-dominant
            'ADV': None,  # autosomal dominant with variable expressivity
            'AID': 'GENO:0000259',  # autosomal incompletely dominant
            'ASD': 'GENO:0000145',  # autosomal semi-dominant
            # autosomal recessive, semi-lethal
            # using generic autosomal recessive
            'ASL': 'GENO:0000150',
            'D': 'GENO:0000147',  # autosomal dominant
            'M': None,  # multifactorial
            'MAT': None,  # Maternal
            # probably autosomal recessive
            # using generic autosomal recessive
            'PR':  'GENO:0000150',
            'R': 'GENO:0000150',  # Autosomal Recessive
            # Recessive Embryonic Lethal
            # using plain recessive
            'REL': 'GENO:0000148',
            # Autosomal Recessive Lethal
            # using plain autosomal recessive
            'RL': 'GENO:0000150',
            'S': 'GENO:0000146',  # Sex-linked   <--using allosomal dominant
            'SLi': None,  # Sex-limited
            'UD': 'GENO:0000144',  # Dominant
            'X': None,  # x-linked    # HP:0001417 ?
            # X-linked Dominant     <-- temp using allosomal dominant  FIXME
            'XLD': 'GENO:0000146',
            # X-linked Recessive    <-- temp using allosomal recessive  FIXME
            'XLR': 'GENO:0000149',
            'Y': None,  # Y-linked
            'Z': None,  # Z-linked
            # Z-linked recessive    <-- temp using allosomal recessive  FIXME
            'ZR': 'GENO:0000149',
            '999': None,  # Z-linked incompletely dominant
        }

        inheritance_id = inherit_map.get(inheritance_symbol)
        if inheritance_id is None and inheritance_symbol is not None:
            logger.warning(
                "No inheritance id is mapped for %s", inheritance_symbol)

        return inheritance_id

    def getTestSuite(self):
        """
        Load the unittest suite made of the tests in OMIATestCase.

        The imports are local to this method.

        :return: the suite built by unittest's TestLoader
        """
        import unittest
        from tests.test_omia import OMIATestCase

        loader = unittest.TestLoader()

        return loader.loadTestsFromTestCase(OMIATestCase)

Пример #11

Показать файл

Файл: GeneOntology.py Проект: JervenBolleman/dipper

    def process_gaf(self, file, limit, id_map=None):
        """
        Read a gzipped, tab-delimited gene association file and add its
        annotations to the graph.

        For each usable row this adds the gene as a class (with its name
        as description, its synonyms and its taxon), the references, and
        an association between the gene and the GO term.  For rows with
        IMP evidence and a non-empty with/from column, a G2PAssoc is also
        made for each "with" item against a phenotype id derived from the
        GO id.

        Rows are skipped when a required column is empty, when the
        qualifier contains NOT, or when a UniProtKB accession cannot be
        mapped to exactly one gene through id_map.

        :param file: path of the gzipped association file
        :param limit: stop once more than this many lines have been read
            (ignored in test mode); None means no limit
        :param id_map: optional mapping of UniProtKB accession to an
            indexable collection of gene ids
        :return: None
        """

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        gu = GraphUtils(curie_map.get())
        geno = Genotype(g)
        logger.info("Processing Gene Associations from %s", file)
        line_counter = 0

        # the two helpers are created independently of each other,
        # so that a run covering both taxa has both available
        zfin = wbase = None
        if 7955 in self.tax_ids:
            zfin = ZFIN()
        if 6239 in self.tax_ids:
            wbase = WormBase()

        # relation to use for each aspect; the same for every row
        aspect_rel_map = {
            'P': gu.object_properties['involved_in'],  # involved in
            'F': gu.object_properties['enables'],  # enables
            'C': gu.object_properties['part_of']  # part of
        }

        with gzip.open(file, 'rb') as csvfile:
            filereader = csv.reader(io.TextIOWrapper(csvfile, newline=""),
                                    delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                # comments start with exclamation
                if re.match(r'!', ''.join(row)):
                    continue
                (db, gene_num, gene_symbol, qualifier, go_id, ref, eco_symbol,
                 with_or_from, aspect, gene_name, gene_synonym, object_type,
                 taxon, date, assigned_by, annotation_extension,
                 gene_product_form_id) = row

                # test for required fields
                if (db == '' or gene_num == '' or gene_symbol == '' or
                        go_id == '' or ref == '' or eco_symbol == '' or
                        aspect == '' or object_type == '' or taxon == '' or
                        date == '' or assigned_by == ''):
                    # the row text is passed as an argument: it is data
                    # and may itself contain '%'
                    logger.error(
                        "Missing required part of annotation on row %d:\n%s",
                        line_counter, '\t'.join(row))
                    continue

                # deal with qualifier NOT, contributes_to, colocalizes_with
                if re.search(r'NOT', qualifier):
                    continue

                db = self.clean_db_prefix(db)
                uniprotid = None
                gene_id = None
                if db == 'UniProtKB':
                    # only an accession that maps to exactly one gene
                    # can be used; no map, no entry, an empty entry
                    # or an ambiguous entry all skip the row
                    mapped_ids = None
                    if id_map is not None:
                        mapped_ids = id_map.get(gene_num)
                    if mapped_ids is None or len(mapped_ids) != 1:
                        continue
                    gene_id = mapped_ids[0]
                    uniprotid = ':'.join((db, gene_num))
                    gene_num = re.sub(r'\w+\:', '', gene_id)
                elif db == 'MGI':
                    gene_num = re.sub(r'MGI:', '', gene_num)
                    gene_id = ':'.join((db, gene_num))
                    gene_id = re.sub(r'MGI\:MGI\:', 'MGI:', gene_id)
                else:
                    gene_id = ':'.join((db, gene_num))

                if self.testMode \
                        and not(
                            re.match(r'NCBIGene', gene_id) and
                            int(gene_num) in self.test_ids):
                    continue

                gu.addClassToGraph(g, gene_id, gene_symbol)
                if gene_name != '':
                    gu.addDescription(g, gene_id, gene_name)
                if gene_synonym != '':
                    for s in re.split(r'\|', gene_synonym):
                        gu.addSynonym(g, gene_id, s.strip())
                if re.search(r'\|', taxon):
                    # TODO add annotations with >1 taxon
                    logger.info(">1 taxon (%s) on line %d.  skipping", taxon,
                                line_counter)
                else:
                    tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
                    geno.addTaxon(tax_id, gene_id)

                assoc = Assoc(self.name)

                assoc.set_subject(gene_id)
                assoc.set_object(go_id)

                eco_id = self.map_go_evidence_code_to_eco(eco_symbol)
                if eco_id is not None:
                    assoc.add_evidence(eco_id)

                # normalise the references once;
                # they are needed again for the phenotype associations
                clean_refs = []
                for r in re.split(r'\|', ref):
                    r = r.strip()
                    if r == '':
                        continue
                    prefix = re.split(r':', r)[0]
                    r = re.sub(prefix, self.clean_db_prefix(prefix), r)
                    r = re.sub(r'MGI\:MGI\:', 'MGI:', r)
                    clean_refs.append(r)

                    reference = Reference(r)
                    if re.match(r'PMID', r):
                        ref_type = Reference.ref_types['journal_article']
                        reference.setType(ref_type)
                    reference.addRefToGraph(g)
                    assoc.add_source(r)

                # TODO add the source of the annotations from assigned by?

                if aspect not in aspect_rel_map:
                    logger.error("Aspect not recognized: %s", aspect)

                rel = aspect_rel_map.get(aspect)
                if aspect == 'F' and re.search(r'contributes_to', qualifier):
                    rel = gu.object_properties['contributes_to']
                assoc.set_relationship(rel)
                if uniprotid is not None:
                    assoc.set_description('Mapped from '+uniprotid)
                # object_type should be one of:
                # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
                # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
                # If the precise product type is unknown,
                # gene_product should be used

                assoc.add_association_to_graph(g)

                # Derive G2P Associations from IMP annotations
                # in version 2.1 Pipe will indicate 'OR'
                # and Comma will indicate 'AND'.
                # in version 2.0, multiple values are separated by pipes
                # where the pipe has been used to mean 'AND'
                if eco_symbol == 'IMP' and with_or_from != '':
                    withitems = re.split(r'\|', with_or_from)
                    phenotypeid = go_id+'PHENOTYPE'
                    # create phenotype associations
                    for i in withitems:
                        if i == '' or \
                                re.match(
                                    r'(UniProtKB|WBPhenotype|InterPro|HGNC)',
                                    i):
                            logger.warning(
                                "Don't know what having a uniprot id " +
                                "in the 'with' column means of %s",
                                uniprotid)
                            continue
                        i = re.sub(r'MGI\:MGI\:', 'MGI:', i)
                        i = re.sub(r'WB:', 'WormBase:', i)

                        # for worms and fish, they might give a RNAi or MORPH
                        # in these cases make a reagent-targeted gene
                        if re.search('MRPHLNO|CRISPR|TALEN', i):
                            targeted_gene_id = zfin.make_targeted_gene_id(
                                gene_id, i, self.nobnodes)
                            geno.addReagentTargetedGene(i, gene_id,
                                                        targeted_gene_id)
                            # TODO PYLINT why is this:
                            # Redefinition of assoc type from
                            # dipper.models.assoc.Association.Assoc to
                            # dipper.models.assoc.G2PAssoc.G2PAssoc
                            assoc = G2PAssoc(self.name, targeted_gene_id,
                                             phenotypeid)
                        elif re.search(r'WBRNAi', i):
                            targeted_gene_id = \
                                wbase.make_reagent_targeted_gene_id(
                                    gene_id, i, self.nobnodes)
                            geno.addReagentTargetedGene(
                                i, gene_id, targeted_gene_id)
                            assoc = G2PAssoc(
                                self.name, targeted_gene_id, phenotypeid)
                        else:
                            assoc = G2PAssoc(self.name, i, phenotypeid)
                        for r in clean_refs:
                            assoc.add_source(r)
                            # experimental phenotypic evidence
                            assoc.add_evidence("ECO:0000059")
                        assoc.add_association_to_graph(g, self.nobnodes)
                        # TODO should the G2PAssoc be
                        # the evidence for the GO assoc?

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

        return

Пример #12

Показать файл

Файл: Dataset.py Проект: JervenBolleman/dipper

class Dataset:
    """
     this will produce the metadata about a dataset
     following the example laid out here:
     http://htmlpreview.github.io/?
     https://github.com/joejimbo/HCLSDatasetDescriptions/blob/master/Overview.html#appendix_1
     (mind the wrap)

     The metadata is collected as triples in self.graph,
     with self.identifier as the node standing for the dataset.

    """

    # extra namespaces bound on the graph, on top of core_bindings
    namespaces = {
        'dctypes': 'http://purl.org/dc/dcmitype/',
        'pav': 'http://purl.org/pav/',
        'dcat': 'http://www.w3.org/ns/dcat#'
    }

    core_bindings = {'rdf': RDF, 'foaf': FOAF, 'xsd': XSD, 'dct': DCTERMS}

    def __init__(self, identifier, title, url, description=None,
                 license_url=None, data_rights=None):
        """
        Start the dataset graph with its type, title, identifier and page.

        :param identifier: local name; the dataset node is ':'+identifier
        :param title: title of the dataset, stored as a literal
        :param url: page of the dataset, stored as a URI
        :param description: optional description text
        :param license_url: optional license, stored as a URI
        :param data_rights: optional rights statement, stored as a literal
        """
        DCTYPES = Namespace(self.namespaces['dctypes'])
        self.gu = GraphUtils(curie_map.get())
        self.identifier = URIRef(':'+identifier)
        self.version = None
        self.date_issued = None
        self.date_accessed = None
        self.citation = set()
        # must run before date_accessed is used further down
        self.set_access_date()
        self.license = license_url
        self.graph = Graph()
        self.load_bindings()
        self.graph.add((self.identifier, RDF['type'], DCTYPES['Dataset']))
        self.graph.add((self.identifier, DCTERMS['title'], Literal(title)))
        self.graph.add(
            (self.identifier, DCTERMS['identifier'], Literal(identifier)))
        self.graph.add((self.identifier, FOAF['page'], URIRef(url)))
        self.dipperized_version = URIRef('monarch'+str(self.date_accessed))
        # maybe in the future add the logo here:
        # schemaorg:logo <http://www.ebi.ac.uk/rdf/sites/ebi.ac.uk.rdf/files/resize/images/rdf/chembl_service_logo-146x48.gif> .

        # TODO add the licence info
        # FIXME:Temporarily making this in IF statement,
        #  can revert after all current resources are updated.
        if license_url is not None:
            self.graph.add(
                (self.identifier, DCTERMS['license'], URIRef(license_url)))
        else:
            logger.debug('No license provided.')
        if data_rights is not None:
            self.graph.add(
                (self.identifier, DCTERMS['rights'], Literal(data_rights)))
        else:
            logger.debug('No rights provided.')

        if description is not None:
            self.gu.addDescription(self.graph, self.identifier, description)
        return

    def load_bindings(self):
        """
        Bind the core prefixes and the class-level namespaces on the graph.

        :return: None
        """
        for prefix, namespace in self.core_bindings.items():
            self.graph.bind(prefix, namespace)

        for prefix, uri in self.namespaces.items():
            self.graph.bind(prefix, Namespace(uri))

        return

    def setVersion(self, date_issued, version_id=None):
        """
        Legacy function...  should use the other set_* for version and date

        When version_id is given it becomes the version;
        otherwise the version is derived from date_issued.

        # TODO set as deprecated
        :param date_issued: issue date, or None
        :param version_id: version identifier, or None
        :return: None

        """

        if date_issued is None and version_id is None:
            logger.error("No date or version set!")
            # TODO throw error
            return

        if date_issued is not None:
            self.set_date_issued(date_issued)

        # the version is applied exactly once,
        # also when only version_id was supplied
        if version_id is not None:
            self.set_version_by_num(version_id)
        else:
            self.set_version_by_date(date_issued)

        logger.info("set version to %s", self.version)

        return

    def set_date_issued(self, date_issued):
        """
        Remember the issue date and add it to the graph as dct:issued.

        :param date_issued: issue date, stored as a literal
        :return: None
        """

        self.date_issued = date_issued
        self.graph.add(
            (self.identifier, DCTERMS['issued'], Literal(date_issued)))
        logger.info("setting date to %s", date_issued)

        return

    def set_version_by_date(self, date_issued=None):
        """
        This will set the version by the date supplied,
        the date already stored in the dataset description,
        or by the download date (today)
        :param date_issued: date to use as version, or None
        :return: None
        """

        d = date_issued if date_issued is not None else self.date_issued
        if d is None:
            d = self.date_accessed
            logger.info(
                "No date supplied for setting version; "
                "using download timestamp for date_issued")

        logger.info("setting version by date")
        self.set_version_by_num(d)

        return

    def set_version_by_num(self, version_num):
        """
        Set the version node to identifier+version_num and describe it.

        When version_num differs from the access date, a second
        "monarch" version node, versioned by the access date, is added
        as a version of the first one.

        :param version_num: version label; non-string values
            (e.g. integers) are accepted
        :return: None
        """
        PAV = Namespace(self.namespaces['pav'])

        # str() so that a numeric version can be appended to the identifier
        self.version = URIRef(self.identifier+str(version_num))
        self.graph.add((self.version, DCTERMS['isVersionOf'], self.identifier))
        self.graph.add((self.version, PAV['version'], Literal(version_num)))

        logger.info("setting version to %s", self.version)

        # set the monarch-generated-version of the resource-version
        # TODO sync this up with the ontology version
        if version_num != self.date_accessed:
            self.dipperized_version = URIRef('monarch'+str(self.date_accessed))
            self.graph.add(
                (self.dipperized_version, DCTERMS['isVersionOf'],
                 self.version))
            self.graph.add(
                (self.dipperized_version, PAV['version'],
                 Literal(self.date_accessed)))
            # NOTE(review): date_accessed is formatted %Y-%m-%d-%H-%M,
            # yet typed as xsd:dateTime here -- confirm this is intended
            self.graph.add(
                (self.dipperized_version, DCTERMS['issued'],
                 Literal(self.date_accessed, datatype=XSD.dateTime)))

        return

    def set_access_date(self):
        """
        Store the current local time, formatted as %Y-%m-%d-%H-%M,
        as the access date.

        :return: None
        """

        self.date_accessed = datetime.now().strftime("%Y-%m-%d-%H-%M")
        logger.info("Setting date of access to %s", self.date_accessed)

        return

    def setFileAccessUrl(self, url):
        """
        Add the url as the dcat:accessURL of the dataset.

        :param url: access url, stored as a URI
        :return: None
        """
        DCAT = Namespace(self.namespaces['dcat'])
        self.graph.add((self.identifier, DCAT['accessURL'], URIRef(url)))
        return

    def getGraph(self):
        """
        :return: the graph holding the dataset metadata
        """
        return self.graph

    def set_license(self, license):
        """
        Replace the stored license; the graph is not touched.

        :param license: the new license value
        :return: None
        """
        self.license = license
        return

    def get_license(self):
        """
        :return: the stored license value
        """

        return self.license

    def set_citation(self, citation_id):
        """
        Add a citation id to the set of citations; nothing is written
        to the graph yet.

        :param citation_id: identifier of the citation
        :return: None
        """

        self.citation.add(citation_id)
        # TODO
        # gu.addTriple(self.identifier, 'cito:citeAsAuthority', citation_id)

        return