Пример #1
0
    def process_gene_desc(self, limit):
        raw = '/'.join((self.rawdir, self.files['gene_desc']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        gu = GraphUtils(curie_map.get())

        logger.info("Processing Gene descriptions")
        line_counter = 0
        # geno = Genotype(g)  # TODO unused
        with gzip.open(raw, 'rb') as csvfile:
            filereader = csv.reader(
                io.TextIOWrapper(csvfile, newline=""), delimiter='\t',
                quotechar='\"')
            for row in filereader:
                if re.match(r'\#', ''.join(row)):
                    continue
                line_counter += 1
                if line_counter == 1:
                    continue
                (gene_num, public_name, molecular_name, concise_description,
                 provisional_description, detailed_description,
                 automated_description, gene_class_description) = row

                if self.testMode and gene_num not in self.test_ids['gene']:
                    continue

                gene_id = 'WormBase:'+gene_num

                if concise_description != 'none available':
                    gu.addDefinition(g, gene_id, concise_description)

                # remove the description if it's identical to the concise
                descs = {
                    'provisional': provisional_description,
                    'automated': automated_description,
                    'detailed': detailed_description,
                    'gene class': gene_class_description
                }
                for d in descs:
                    text = descs.get(d)
                    if text == concise_description \
                            or re.match(r'none', text) or text == '':
                        pass  # don't use it
                    else:
                        text = ' '.join((text, '['+d+']'))
                        descs[d] = text
                        gu.addDescription(g, gene_id, text)

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        return
Пример #2
0
    def _process_straininfo(self, limit):
        # line_counter = 0  # TODO unused
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        logger.info("Processing measurements ...")
        raw = '/'.join((self.rawdir, self.files['straininfo']['file']))

        tax_id = 'NCBITaxon:10090'

        gu = GraphUtils(curie_map.get())

        with open(raw, 'r') as f:
            reader = csv.reader(f, delimiter=',', quotechar='\"')
            f.readline()  # read the header row; skip
            for row in reader:
                (strain_name, vendor, stocknum, panel, mpd_strainid,
                 straintype, n_proj, n_snp_datasets, mpdshortname, url) = row
                # C57BL/6J,J,000664,,7,IN,225,17,,http://jaxmice.jax.org/strain/000664.html
                # create the strain as an instance of the taxon
                if self.testMode and \
                        'MPD:'+str(mpd_strainid) not in self.test_ids:
                    continue
                strain_id = 'MPD-strain:'+str(mpd_strainid)
                gu.addIndividualToGraph(g, strain_id, strain_name, tax_id)
                if mpdshortname.strip() != '':
                    gu.addSynonym(g, strain_id, mpdshortname.strip())
                self.idlabel_hash[strain_id] = strain_name
                # make it equivalent to the vendor+stock
                if stocknum != '':
                    if vendor == 'J':
                        jax_id = 'JAX:'+stocknum
                        gu.addSameIndividual(g, strain_id, jax_id)
                    elif vendor == 'Rbrc':
                        # reiken
                        reiken_id = 'RBRC:'+re.sub(r'RBRC', '', stocknum)
                        gu.addSameIndividual(g, strain_id, reiken_id)
                    else:
                        if url != '':
                            gu.addXref(g, strain_id, url, True)
                        if vendor != '':
                            gu.addXref(
                                g, strain_id, ':'.join((vendor, stocknum)),
                                True)

                # add the panel information
                if panel != '':
                    desc = panel+' [panel]'
                    gu.addDescription(g, strain_id, desc)

                # TODO make the panels as a resource collection

        return
Пример #3
0
    def _process_ortholog_classes(self, limit=None):
        """
        This method add the KEGG orthology classes to the graph.

        Triples created:
        <orthology_class_id> is a class
        <orthology_class_id> has label <orthology_symbols>
        <orthology_class_id> has description <orthology_description>
        :param limit:
        :return:
        """

        logger.info("Processing ortholog classes")
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        line_counter = 0
        gu = GraphUtils(curie_map.get())
        raw = '/'.join((self.rawdir, self.files['ortholog_classes']['file']))
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (orthology_class_id, orthology_class_name) = row

                if self.testMode and orthology_class_id not in self.test_ids['ortholog_classes']:
                    continue

                # FIXME: What's the proper route for this?
                # The orthology class is essentially a KEGG gene ID that is species agnostic.
                # Add the ID and label as a class. Would it be considered a gene as well?

                other_labels = re.split(';', orthology_class_name)
                orthology_label = other_labels[0]  # the first one is the label we'll use

                orthology_class_id = 'KEGG-'+orthology_class_id.strip()

                orthology_type = OrthologyAssoc.terms['gene_family']
                gu.addClassToGraph(g, orthology_class_id, orthology_label, orthology_type)
                if len(other_labels) > 1:
                    # add the rest as synonyms
                    # todo skip the first
                    for s in other_labels:
                        gu.addSynonym(g, orthology_class_id, s)

                    # add the last one as the description
                    gu.addDescription(g, orthology_class_id, other_labels[len(other_labels)-1])

                if (not self.testMode) and (limit is not None and line_counter > limit):
                    break

        logger.info("Done with ortholog classes")
        return
Пример #4
0
    def process_gene_interaction(self, limit):
        """
        The gene interaction file includes identified interactions,
        that are between two or more gene (products).
        In the case of interactions with >2 genes, this requires creating
        groups of genes that are involved in the interaction.
        From the wormbase help list: In the example WBInteraction000007779
        it would likely be misleading to suggest that lin-12 interacts with
        (suppresses in this case) smo-1 ALONE or that lin-12 suppresses let-60
        ALONE; the observation in the paper; see Table V in paper PMID:15990876
        was that a lin-12 allele (heterozygous lin-12(n941/+)) could suppress
        the "multivulva" phenotype induced synthetically by simultaneous
        perturbation of BOTH smo-1 (by RNAi) AND let-60 (by the n2021 allele).
        So this is necessarily a three-gene interaction.

        Therefore, we can create groups of genes based on their "status" of
        Effector | Effected.

        Status:  IN PROGRESS

        :param limit:
        :return:

        """

        raw = '/'.join((self.rawdir, self.files['gene_interaction']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        gu = GraphUtils(curie_map.get())

        logger.info("Processing gene interaction associations")
        line_counter = 0

        with gzip.open(raw, 'rb') as csvfile:
            filereader = csv.reader(
                io.TextIOWrapper(csvfile, newline=""), delimiter='\t',
                quotechar="'")

            for row in filereader:
                line_counter += 1
                if re.match(r'#', ''.join(row)):
                    continue

                (interaction_num, interaction_type, interaction_subtype,
                 summary, citation) = row[0:5]
                print(row)
                interaction_id = 'WormBase:'+interaction_num

                # TODO deal with subtypes
                interaction_type_id = None
                if interaction_type == 'Genetic':
                    interaction_type_id = \
                        InteractionAssoc.interaction_object_properties[
                            'genetically_interacts_with']
                elif interaction_type == 'Physical':
                    interaction_type_id = \
                        InteractionAssoc.interaction_object_properties[
                            'molecularly_interacts_with']
                elif interaction_type == 'Regulatory':
                    interaction_type_id = \
                        InteractionAssoc.interaction_object_properties[
                            'regulates']
                else:
                    logger.info(
                        "An interaction type I don't understand %s",
                        interaction_type)

                num_interactors = (len(row) - 5) / 3
                if num_interactors != 2:
                    logger.info(
                        "Skipping interactions with !=2 participants:\n %s",
                        str(row))
                    continue

                gene_a_id = 'WormBase:'+row[5]
                gene_b_id = 'WormBase:'+row[8]

                if self.testMode \
                        and gene_a_id not in self.test_ids['gene'] \
                        and gene_b_id not in self.test_ids['gene']:
                    continue

                assoc = InteractionAssoc(
                    self.name, gene_a_id, gene_b_id, interaction_type_id)
                assoc.set_association_id(interaction_id)
                assoc.add_association_to_graph(g)
                assoc_id = assoc.get_association_id()
                # citation is not a pmid or WBref - get this some other way
                gu.addDescription(g, assoc_id, summary)

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        return
Пример #5
0
    def process_feature_loc(self, limit):

        raw = '/'.join((self.rawdir, self.files['feature_loc']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        gu = GraphUtils(curie_map.get())

        logger.info("Processing Feature location and attributes")
        line_counter = 0
        geno = Genotype(g)
        strain_to_variant_map = {}
        build_num = self.version_num
        build_id = 'WormBase:'+build_num
        with gzip.open(raw, 'rb') as csvfile:
            filereader = csv.reader(
                io.TextIOWrapper(csvfile, newline=""), delimiter='\t',
                quotechar='\"')
            for row in filereader:
                if re.match(r'\#', ''.join(row)):
                    continue
                (chrom, db, feature_type_label, start, end, score, strand,
                 phase, attributes) = row

# I	interpolated_pmap_position	gene	1	559768	.	.	.	ID=gmap:spe-13;gmap=spe-13;status=uncloned;Note=-21.3602 cM (+/- 1.84 cM)
# I	WormBase	gene	3747	3909	.	-	.	ID=Gene:WBGene00023193;Name=WBGene00023193;interpolated_map_position=-21.9064;sequence_name=Y74C9A.6;biotype=snoRNA;Alias=Y74C9A.6
# I	absolute_pmap_position	gene	4119	10230	.	.	.	ID=gmap:homt-1;gmap=homt-1;status=cloned;Note=-21.8252 cM (+/- 0.00 cM)

                # dbs = re.split(
                #   r' ', 'assembly_component expressed_sequence_match Coding_transcript Genomic_canonical Non_coding_transcript Orfeome Promoterome Pseudogene RNAi_primary RNAi_secondary Reference Transposon Transposon_CDS cDNA_for_RNAi miRanda ncRNA operon polyA_signal_sequence polyA_site snlRNA')
                #
                # if db not in dbs:
                #     continue

                if feature_type_label not in [
                        'gene', 'point_mutation', 'deletion', 'RNAi_reagent',
                        'duplication', 'enhancer', 'binding_site',
                        'biological_region', 'complex_substitution',
                        'substitution', 'insertion', 'inverted_repeat']:
                    # note biological_regions include balancers
                    # other options here: promoter, regulatory_region, reagent
                    continue
                line_counter += 1

                attribute_dict = {}
                if attributes != '':
                    attribute_dict = dict(
                        item.split("=")for item in
                        re.sub(r'"', '', attributes).split(";"))

                fid = flabel = desc = None
                if 'ID' in attribute_dict:
                    fid = attribute_dict.get('ID')
                    if re.search(r'WB(Gene|Var|sf)', fid):
                        fid = re.sub(r'^\w+:WB', 'WormBase:WB', fid)
                    elif re.match(r'(gmap|landmark)', fid):
                        continue
                    else:
                        logger.info('other identifier %s', fid)
                        fid = None
                elif 'variation' in attribute_dict:
                    fid = 'WormBase:'+attribute_dict.get('variation')
                    flabel = attribute_dict.get('public_name')
                    sub = attribute_dict.get('substitution')
                    ins = attribute_dict.get('insertion')
                    # if it's a variation:
                    # variation=WBVar00604246;public_name=gk320600;strain=VC20384;substitution=C/T
                    desc = ''
                    if sub is not None:
                        desc = 'substitution='+sub
                    if ins is not None:
                        desc = 'insertion='+ins

                    # keep track of the strains with this variation,
                    # for later processing
                    strain_list = attribute_dict.get('strain')
                    if strain_list is not None:
                        for s in re.split(r',', strain_list):
                            if s.strip() not in strain_to_variant_map:
                                strain_to_variant_map[s.strip()] = set()
                            strain_to_variant_map[s.strip()].add(fid)

                # if feature_type_label == 'RNAi_reagent':
                    # Target=WBRNAi00096030 1 4942
                    # this will tell us where the RNAi is actually binding
                    # target = attribute_dict.get('Target') # TODO unused
                    # rnai_num = re.split(r' ', target)[0]  # TODO unused
                    # it will be the reagent-targeted-gene that has a position,
                    # (i think)
                    # TODO finish the RNAi binding location

                name = attribute_dict.get('Name')
                polymorphism = attribute_dict.get('polymorphism')

                if fid is None:
                    if name is not None and re.match(r'WBsf', name):
                        fid = 'WormBase:'+name
                        name = None
                    else:
                        continue

                if self.testMode \
                        and re.sub(r'WormBase:', '', fid) \
                        not in self.test_ids['gene']+self.test_ids['allele']:
                    continue

                # these really aren't that interesting
                if polymorphism is not None:
                    continue

                if name is not None and not re.search(name, fid):
                    if flabel is None:
                        flabel = name
                    else:
                        gu.addSynonym(g, fid, name)

                if desc is not None:
                    gu.addDescription(g, fid, desc)

                alias = attribute_dict.get('Alias')

                biotype = attribute_dict.get('biotype')
                note = attribute_dict.get('Note')
                other_name = attribute_dict.get('other_name')
                for n in [alias, other_name]:
                    if n is not None:
                        gu.addSynonym(g, fid, other_name)

                ftype = self.get_feature_type_by_class_and_biotype(
                    feature_type_label, biotype)

                chr_id = makeChromID(chrom, build_id, 'CHR')
                geno.addChromosomeInstance(chrom, build_id, build_num)

                f = Feature(fid, flabel, ftype)
                f.addFeatureStartLocation(start, chr_id, strand)
                f.addFeatureEndLocation(start, chr_id, strand)

                feature_is_class = False
                if feature_type_label == 'gene':
                    feature_is_class = True

                f.addFeatureToGraph(g, True, None, feature_is_class)

                if note is not None:
                    gu.addDescription(g, fid, note)

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

                # RNAi reagents:
# I	RNAi_primary	RNAi_reagent	4184	10232	.	+	.	Target=WBRNAi00001601 1 6049 +;laboratory=YK;history_name=SA:yk326e10
# I	RNAi_primary	RNAi_reagent	4223	10147	.	+	.	Target=WBRNAi00033465 1 5925 +;laboratory=SV;history_name=MV_SV:mv_G_YK5052
# I	RNAi_primary	RNAi_reagent	5693	9391	.	+	.	Target=WBRNAi00066135 1 3699 +;laboratory=CH

                # TODO TF bindiing sites and network:
# I	TF_binding_site_region	TF_binding_site	1861	2048	.	+	.	Name=WBsf292777;tf_id=WBTranscriptionFactor000025;tf_name=DAF-16
# I	TF_binding_site_region	TF_binding_site	3403	4072	.	+	.	Name=WBsf331847;tf_id=WBTranscriptionFactor000703;tf_name=DPL-1

        return
Пример #6
0
    def _process_data(self, raw, limit=None):
        """
        This function will process the data files from Coriell.
        We make the assumption that any alleles listed are variants
        (alternates to w.t.)

        Triples: (examples)

        :NIGMSrepository a CLO_0000008 #repository
        label : NIGMS Human Genetic Cell Repository
        foaf:page https://catalog.coriell.org/0/sections/collections/NIGMS/?SsId=8

            line_id a CL_0000057,  #fibroblast line
                derives_from patient_id
                part_of :NIGMSrepository
                RO:model_of OMIM:disease_id

            patient id a foaf:person,
                label: "fibroblast from patient 12345 with disease X"
                member_of family_id  #what is the right thing here?
                SIO:race EFO:caucasian  #subclass of EFO:0001799
                in_taxon NCBITaxon:9606
                dc:description Literal(remark)
                RO:has_phenotype OMIM:disease_id
                GENO:has_genotype genotype_id

            family_id a owl:NamedIndividual
                foaf:page "https://catalog.coriell.org/0/Sections/BrowseCatalog/FamilyTypeSubDetail.aspx?PgId=402&fam=2104&coll=GM"

            genotype_id a intrinsic_genotype
                GENO:has_alternate_part allelic_variant_id
                we don't necessarily know much about the genotype,
                other than the allelic variant. also there's the sex here

            pub_id mentions cell_line_id

        :param raw:
        :param limit:
        :return:
        """
        logger.info("Processing Data from %s", raw)
        gu = GraphUtils(curie_map.get())

        if self.testMode:      # set the graph to build
            g = self.testgraph
        else:
            g = self.graph

        line_counter = 0
        geno = Genotype(g)
        du = DipperUtil()

        gu.loadProperties(g, geno.object_properties, gu.OBJPROP)
        gu.loadAllProperties(g)

        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            next(filereader, None)  # skip the header row
            for row in filereader:
                if not row:
                    pass
                else:
                    line_counter += 1

                    (catalog_id, description, omim_number, sample_type,
                     cell_line_available, dna_in_stock, dna_ref, gender, age,
                     race, ethnicity, affected, karyotype, relprob, mutation,
                     gene, family_id, collection, url, cat_remark, pubmed_ids,
                     family_member, variant_id, dbsnp_id, species) = row

                    # example:
                    # GM00003,HURLER SYNDROME,607014,Fibroblast,Yes,No,,Female,26 YR,Caucasian,,,,
                    # parent,,,39,NIGMS Human Genetic Cell Repository,
                    # http://ccr.coriell.org/Sections/Search/Sample_Detail.aspx?Ref=GM00003,
                    # 46;XX; clinically normal mother of a child with Hurler syndrome; proband not in Repository,,
                    # 2,,18343,H**o sapiens

                    if self.testMode and catalog_id not in self.test_lines:
                        # skip rows not in our test lines, when in test mode
                        continue

                    # ###########    BUILD REQUIRED VARIABLES    ###########

                    # Make the cell line ID
                    cell_line_id = 'Coriell:'+catalog_id.strip()

                    # Map the cell/sample type
                    cell_type = self._map_cell_type(sample_type)

                    # Make a cell line label
                    line_label = \
                        collection.partition(' ')[0]+'-'+catalog_id.strip()

                    # Map the repository/collection
                    repository = self._map_collection(collection)

                    # patients are uniquely identified by one of:
                    # dbsnp id (which is == an individual haplotype)
                    # family id + family member (if present) OR
                    # probands are usually family member zero
                    # cell line id
                    # since some patients have >1 cell line derived from them,
                    # we must make sure that the genotype is attached to
                    # the patient, and can be inferred to the cell line
                    # examples of repeated patients are:
                    #   famid=1159, member=1; fam=152,member=1

                    # Make the patient ID

                    # make an anonymous patient
                    patient_id = '_person'
                    if self.nobnodes:
                        patient_id = ':'+patient_id
                    if family_id != '':
                        patient_id = \
                            '-'.join((patient_id, family_id, family_member))
                    else:
                        # make an anonymous patient
                        patient_id = '-'.join((patient_id, catalog_id.strip()))

                    # properties of the individual patients:  sex, family id,
                    # member/relproband, description descriptions are
                    # really long and ugly SCREAMING text, so need to clean up
                    # the control cases are so odd with this labeling scheme;
                    # but we'll deal with it as-is for now.
                    short_desc = (description.split(';')[0]).capitalize()
                    if affected == 'Yes':
                        affected = 'affected'
                    elif affected == 'No':
                        affected = 'unaffected'
                    gender = gender.lower()
                    patient_label = ' '.join((affected, gender, relprob))
                    if relprob == 'proband':
                        patient_label = \
                            ' '.join(
                                (patient_label.strip(), 'with', short_desc))
                    else:
                        patient_label = \
                            ' '.join(
                                (patient_label.strip(), 'of proband with',
                                 short_desc))

                    # #############    BUILD THE CELL LINE    #############

                    # Adding the cell line as a typed individual.
                    cell_line_reagent_id = 'CLO:0000031'

                    gu.addIndividualToGraph(
                        g, cell_line_id, line_label, cell_line_reagent_id)

                    # add the equivalent id == dna_ref
                    if dna_ref != '' and dna_ref != catalog_id:
                        equiv_cell_line = 'Coriell:'+dna_ref
                        # some of the equivalent ids are not defined
                        # in the source data; so add them
                        gu.addIndividualToGraph(
                            g, equiv_cell_line, None, cell_line_reagent_id)
                        gu.addSameIndividual(g, cell_line_id, equiv_cell_line)

                    # Cell line derives from patient
                    geno.addDerivesFrom(cell_line_id, patient_id)
                    geno.addDerivesFrom(cell_line_id, cell_type)

                    # Cell line a member of repository
                    gu.addMember(g, repository, cell_line_id)

                    if cat_remark != '':
                        gu.addDescription(g, cell_line_id, cat_remark)

                    # Cell age_at_sampling
                    # TODO add the age nodes when modeled properly in #78
                    # if (age != ''):
                        # this would give a BNode that is an instance of Age.
                        # but i don't know how to connect
                        # the age node to the cell line? we need to ask @mbrush
                        # age_id = '_'+re.sub('\s+','_',age)
                        # gu.addIndividualToGraph(
                        #   g,age_id,age,self.terms['age'])
                        # gu.addTriple(
                        #   g,age_id,self.properties['has_measurement'],age,
                        #   True)

                    # #############    BUILD THE PATIENT    #############

                    # Add the patient ID as an individual.
                    gu.addPerson(g, patient_id, patient_label)
                    # TODO map relationship to proband as a class
                    # (what ontology?)

                    # Add race of patient
                    # FIXME: Adjust for subcategories based on ethnicity field
                    # EDIT: There are 743 different entries for ethnicity...
                    # Too many to map?
                    # Add ethnicity as literal in addition to the mapped race?
                    # Adjust the ethnicity txt (if using)
                    # to initial capitalization to remove ALLCAPS

                    # TODO race should go into the individual's background
                    # and abstracted out to the Genotype class punting for now.
                    # if race != '':
                    #    mapped_race = self._map_race(race)
                    #    if mapped_race is not None:
                    #        gu.addTriple(
                    #           g,patient_id,self.terms['race'],mapped_race)
                    #        gu.addSubclass(
                    #           g,self.terms['ethnic_group'],mapped_race)

                    # #############    BUILD THE FAMILY    #############

                    # Add triples for family_id, if present.
                    if family_id != '':
                        family_comp_id = 'CoriellFamily:'+family_id

                        family_label = \
                            ' '.join(('Family of proband with', short_desc))

                        # Add the family ID as a named individual
                        gu.addIndividualToGraph(
                            g, family_comp_id, family_label,
                            geno.genoparts['family'])

                        # Add the patient as a member of the family
                        gu.addMemberOf(g, patient_id, family_comp_id)

                    # #############    BUILD THE GENOTYPE   #############

                    # the important things to pay attention to here are:
                    # karyotype = chr rearrangements  (somatic?)
                    # mutation = protein-level mutation as a label,
                    # often from omim
                    # gene = gene symbol - TODO get id
                    # variant_id = omim variant ids (; delimited)
                    # dbsnp_id = snp individual ids = full genotype?

                    # note GM00633 is a good example of chromosomal variation
                    # - do we have enough to capture this?
                    # GM00325 has both abnormal karyotype and variation

                    # make an assumption that if the taxon is blank,
                    # that it is human!
                    if species is None or species == '':
                        species = 'H**o sapiens'
                    taxon = self._map_species(species)

                    # if there's a dbSNP id,
                    # this is actually the individual's genotype
                    genotype_id = None
                    genotype_label = None
                    if dbsnp_id != '':
                        genotype_id = 'dbSNPIndividual:'+dbsnp_id.strip()

                    omim_map = {}
                    gvc_id = None

                    # some of the karyotypes are encoded
                    # with terrible hidden codes. remove them here
                    # i've seen a <98> character
                    karyotype = du.remove_control_characters(karyotype)
                    karyotype_id = None
                    if karyotype.strip() != '':
                        karyotype_id = \
                            '_'+re.sub('MONARCH:', '', self.make_id(karyotype))
                        if self.nobnodes:
                            karyotype_id = ':'+karyotype_id
                        # add karyotype as karyotype_variation_complement
                        gu.addIndividualToGraph(
                            g, karyotype_id, karyotype,
                            geno.genoparts['karyotype_variation_complement'])
                        # TODO break down the karyotype into parts
                        # and map into GENO. depends on #77

                        # place the karyotype in a location(s).
                        karyo_chrs = \
                            self._get_affected_chromosomes_from_karyotype(
                                karyotype)
                        for c in karyo_chrs:
                            chr_id = makeChromID(c, taxon, 'CHR')
                            # add an anonymous sequence feature,
                            # each located on chr
                            karyotype_feature_id = '-'.join((karyotype_id, c))
                            karyotype_feature_label = \
                                'some karyotype alteration on chr'+str(c)
                            f = Feature(
                                karyotype_feature_id, karyotype_feature_label,
                                geno.genoparts['sequence_alteration'])
                            f.addFeatureStartLocation(None, chr_id)
                            f.addFeatureToGraph(g)
                            f.loadAllProperties(g)
                            geno.addParts(
                                karyotype_feature_id, karyotype_id,
                                geno.object_properties['has_alternate_part'])

                    if gene != '':
                        vl = gene+'('+mutation+')'

                    # fix the variant_id so it's always in the same order
                    vids = variant_id.split(';')
                    variant_id = ';'.join(sorted(list(set(vids))))

                    if karyotype.strip() != '' \
                            and not self._is_normal_karyotype(karyotype):
                        mutation = mutation.strip()
                        gvc_id = karyotype_id
                        if variant_id != '':
                            gvc_id = '_' + variant_id.replace(';', '-') + '-' \
                                    + re.sub(r'\w*:', '', karyotype_id)
                        if mutation.strip() != '':
                            gvc_label = '; '.join((vl, karyotype))
                        else:
                            gvc_label = karyotype
                    elif variant_id.strip() != '':
                        gvc_id = '_' + variant_id.replace(';', '-')
                        gvc_label = vl
                    else:
                        # wildtype?
                        pass

                    if gvc_id is not None and gvc_id != karyotype_id \
                            and self.nobnodes:
                        gvc_id = ':'+gvc_id

                    # add the karyotype to the gvc.
                    # use reference if normal karyotype
                    karyo_rel = geno.object_properties['has_alternate_part']
                    if self._is_normal_karyotype(karyotype):
                        karyo_rel = \
                            geno.object_properties['has_reference_part']
                    if karyotype_id is not None \
                            and not self._is_normal_karyotype(karyotype) \
                            and gvc_id is not None and karyotype_id != gvc_id:
                        geno.addParts(karyotype_id, gvc_id, karyo_rel)

                    if variant_id.strip() != '':
                        # split the variants & add them as part of the genotype
                        # we don't necessarily know their zygosity,
                        # just that they are part of the genotype variant ids
                        # are from OMIM, so prefix as such we assume that the
                        # sequence alts will be defined in OMIM not here
                        # TODO sort the variant_id list, if the omim prefix is
                        # the same, then assume it's the locus make a hashmap
                        # of the omim id to variant id list;
                        # then build the genotype hashmap is also useful for
                        # removing the "genes" from the list of "phenotypes"

                        # will hold gene/locus id to variant list
                        omim_map = {}

                        locus_num = None
                        for v in variant_id.split(';'):
                            # handle omim-style and odd var ids
                            # like 610661.p.R401X
                            m = re.match(r'(\d+)\.+(.*)', v.strip())
                            if m is not None and len(m.groups()) == 2:
                                (locus_num, var_num) = m.groups()

                            if locus_num is not None \
                                    and locus_num not in omim_map:
                                omim_map[locus_num] = [var_num]
                            else:
                                omim_map[locus_num] += [var_num]

                        for o in omim_map:
                            # gene_id = 'OMIM:' + o  # TODO unused
                            vslc_id = \
                                '_' + '-'.join(
                                    [o + '.' + a for a in omim_map.get(o)])
                            if self.nobnodes:
                                vslc_id = ':'+vslc_id
                            vslc_label = vl
                            # we don't really know the zygosity of
                            # the alleles at all.
                            # so the vslcs are just a pot of them
                            gu.addIndividualToGraph(
                                g, vslc_id, vslc_label,
                                geno.genoparts[
                                    'variant_single_locus_complement'])
                            for v in omim_map.get(o):
                                # this is actually a sequence alt
                                allele1_id = 'OMIM:'+o+'.'+v
                                geno.addSequenceAlteration(allele1_id, None)

                                # assume that the sa -> var_loc -> gene
                                # is taken care of in OMIM
                                geno.addPartsToVSLC(
                                    vslc_id, allele1_id, None,
                                    geno.zygosity['indeterminate'],
                                    geno.object_properties[
                                        'has_alternate_part'])

                            if vslc_id != gvc_id:
                                geno.addVSLCtoParent(vslc_id, gvc_id)

                    if affected == 'unaffected':
                        # let's just say that this person is wildtype
                        gu.addType(g, patient_id, geno.genoparts['wildtype'])
                    elif genotype_id is None:
                        # make an anonymous genotype id
                        genotype_id = '_geno'+catalog_id.strip()
                        if self.nobnodes:
                            genotype_id = ':'+genotype_id

                    # add the gvc
                    if gvc_id is not None:
                        gu.addIndividualToGraph(
                            g, gvc_id, gvc_label,
                            geno.genoparts['genomic_variation_complement'])

                        # add the gvc to the genotype
                        if genotype_id is not None:
                            if affected == 'unaffected':
                                rel = \
                                    geno.object_properties[
                                        'has_reference_part']
                            else:
                                rel = \
                                    geno.object_properties[
                                        'has_alternate_part']
                            geno.addParts(gvc_id, genotype_id, rel)
                        if karyotype_id is not None \
                                and self._is_normal_karyotype(karyotype):
                            if gvc_label is not None and gvc_label != '':
                                genotype_label = \
                                    '; '.join((gvc_label, karyotype))
                            else:
                                genotype_label = karyotype
                            if genotype_id is None:
                                genotype_id = karyotype_id
                            else:
                                geno.addParts(
                                    karyotype_id, genotype_id,
                                    geno.object_properties[
                                        'has_reference_part'])
                        else:
                            genotype_label = gvc_label
                            # use the catalog id as the background
                        genotype_label += ' ['+catalog_id.strip()+']'

                    if genotype_id is not None and gvc_id is not None:
                        # only add the genotype if it has some parts
                        geno.addGenotype(
                            genotype_id, genotype_label,
                            geno.genoparts['intrinsic_genotype'])
                        geno.addTaxon(taxon, genotype_id)
                        # add that the patient has the genotype
                        # TODO check if the genotype belongs to
                        # the cell line or to the patient
                        gu.addTriple(
                            g, patient_id,
                            geno.properties['has_genotype'], genotype_id)
                    else:
                        geno.addTaxon(taxon, patient_id)

                    # TODO: Add sex/gender  (as part of the karyotype?)

                    # #############    DEAL WITH THE DISEASES   #############

                    # we associate the disease to the patient
                    if affected == 'affected':
                        if omim_number != '':
                            for d in omim_number.split(';'):
                                if d is not None and d != '':
                                    # if the omim number is in omim_map,
                                    # then it is a gene not a pheno
                                    if d not in omim_map:
                                        disease_id = 'OMIM:'+d.strip()
                                        # assume the label is taken care of
                                        gu.addClassToGraph(g, disease_id, None)

                                        # add the association:
                                        #   the patient has the disease
                                        assoc = G2PAssoc(
                                            self.name, patient_id, disease_id)
                                        assoc.add_association_to_graph(g)

                                        # this line is a model of this disease
                                        # TODO abstract out model into
                                        # it's own association class?
                                        gu.addTriple(
                                            g, cell_line_id,
                                            gu.properties['model_of'],
                                            disease_id)
                                    else:
                                        logger.info(
                                            'removing %s from disease list ' +
                                            'since it is a gene', d)

                    # #############    ADD PUBLICATIONS   #############

                    if pubmed_ids != '':
                        for s in pubmed_ids.split(';'):
                            pubmed_id = 'PMID:'+s.strip()
                            ref = Reference(pubmed_id)
                            ref.setType(Reference.ref_types['journal_article'])
                            ref.addRefToGraph(g)
                            gu.addTriple(
                                g, pubmed_id, gu.properties['mentions'],
                                cell_line_id)

                    if not self.testMode \
                            and (limit is not None and line_counter > limit):
                        break

            Assoc(self.name).load_all_properties(g)

        return
Пример #7
0
class Assoc:
    """
    An abstract class for OBAN (Monarch)-style associations,
    to enable attribution of source and evidence
    on statements.
    """

    assoc_types = {
        'association': 'OBAN:association'
    }

    annotation_properties = {
        'replaced_by': 'IAO:0100001',
        'consider': 'OIO:consider',
        'hasExactSynonym': 'OIO:hasExactSynonym',
        'hasRelatedSynonym': 'OIO:hasRelatedSynonym',
        'definition': 'IAO:0000115',
        'has_xref': 'OIO:hasDbXref',
    }

    object_properties = {
        'has_disposition': 'GENO:0000208',
        'has_phenotype': 'RO:0002200',
        'in_taxon': 'RO:0002162',
        'has_quality': 'RO:0000086',
        'towards': 'RO:0002503',
        'has_subject': 'OBAN:association_has_subject',
        'has_object': 'OBAN:association_has_object',
        'has_predicate': 'OBAN:association_has_object_property',
        'is_about': 'IAO:00000136',
        'has_evidence': 'RO:0002558',
        'has_source': 'dc:source',
        'has_provenance': 'OBAN:has_provenance'
    }

    datatype_properties = {
        'position': 'faldo:position',
        'has_measurement': 'IAO:0000004'
    }

    properties = annotation_properties.copy()
    properties.update(object_properties)
    properties.update(datatype_properties)

    OWLCLASS = OWL['Class']
    OWLIND = OWL['NamedIndividual']
    OBJECTPROP = OWL['ObjectProperty']
    ANNOTPROP = OWL['AnnotationProperty']
    DATAPROP = OWL['DatatypeProperty']

    SUBCLASS = RDFS['subClassOf']
    BASE = Namespace(curie_map.get()[''])

    def __init__(self, definedby):
        self.cu = CurieUtil(curie_map.get())
        self.gu = GraphUtils(curie_map.get())

        # core parts of the association
        self.definedby = definedby
        self.sub = self.obj = self.rel = None
        self.assoc_id = None

        self.description = None
        self.source = []
        self.evidence = []
        # this is going to be used for the refactored evidence/provenance
        self.provenance = []

        self.score = None
        self.score_type = None
        self.score_unit = None

        return

    def get_properties(self):
        return self.properties

    def _is_valid(self):

        # check if sub/obj/rel are none...throw error
        if self.sub is None:
            raise ValueError('No subject set for this association')
        if self.obj is None:
            raise ValueError('No object set for this association')
        if self.rel is None:
            raise ValueError('No relation set for this association')

        return True

    def _add_basic_association_to_graph(self, g):

        if not self._is_valid():
            return

        # first, add the direct triple
        # anonymous (blank) nodes are indicated with underscore
        s = self.gu.getNode(self.sub)
        o = self.gu.getNode(self.obj)
        p = self.gu.getNode(self.rel)

        if s is None:
            logging.error(
                "Unable to retrieve graph node for Subject %s ", self.sub)
            return
                
        elif p is None:
            logging.error(
                "Unable to retrieve graph node for Predicate %s ", self.rel)
            return
                
        elif o is None:
            logging.error(
                "Unable to retrieve graph node for Object %s ", self.obj)
            return
        else:
            g.add((s, p, o))

        if self.assoc_id is None:
            self.set_association_id()

        node = self.gu.getNode(self.assoc_id)
        g.add((node, RDF['type'],
               self.gu.getNode(self.assoc_types['association'])))

        self.gu.addTriple(g, self.assoc_id,
                          self.object_properties['has_subject'], self.sub)
        self.gu.addTriple(g, self.assoc_id,
                          self.object_properties['has_object'], self.obj)
        self.gu.addTriple(g, self.assoc_id,
                          self.object_properties['has_predicate'], self.rel)

        if self.description is not None:
            self.gu.addDescription(g, self.assoc_id, self.description)

        if self.evidence is not None and len(self.evidence) > 0:
            for e in self.evidence:
                self.gu.addTriple(g, self.assoc_id,
                                  self.object_properties['has_evidence'], e)

        if self.source is not None and len(self.source) > 0:
            for s in self.source:
                if re.match('http', s):
                    # TODO assume that the source is a publication?
                    # use Reference class here
                    self.gu.addTriple(g, self.assoc_id,
                                      self.object_properties['has_source'], s,
                                      True)
                else:
                    self.gu.addTriple(g, self.assoc_id,
                                      self.object_properties['has_source'], s)

        if self.provenance is not None and len(self.provenance) > 0:
            for p in self.provenance:
                self.gu.addTriple(g, self.assoc_id,
                                  self.object_properties['has_provenance'], p)

        if self.score is not None:
            self.gu.addTriple(
                g, self.assoc_id, self.properties['has_measurement'],
                Literal(self.score, datatype=XSD['float']), True)
            # TODO
            # update with some kind of instance of scoring object
            # that has a unit and type

        return

    def add_association_to_graph(self, g):

        self._add_basic_association_to_graph(g)

        return

    def set_subject(self, identifier):
        self.sub = identifier
        return

    def set_object(self, identifier):
        self.obj = identifier
        return

    def set_relationship(self, identifier):
        self.rel = identifier
        return

    def set_association_id(self, assoc_id=None):
        """
        This will set the association ID based on the internal parts
            of the association.
        To be used in cases where an external association identifier
            should be used.

        :param assoc_id:
        :return:
        """
        if assoc_id is None:
            self.assoc_id = self.make_association_id(self.definedby, self.sub,
                                                     self.rel, self.obj)
        else:
            self.assoc_id = assoc_id

        return

    def get_association_id(self):

        return self.assoc_id

    def set_description(self, description):
        self.description = description

        return

    def set_score(self, score, unit=None, score_type=None):

        self.score = score
        self.score_unit = unit
        self.score_type = score_type

        return

    def add_evidence(self, identifier):
        """
        Add an evidence code to the association object (maintained as a list)
        :param identifier:
        :return:
        """

        if identifier is not None and identifier.strip() != '':
            self.evidence += [identifier]

        return

    def add_source(self, identifier):
        """
        Add a source identifier (such as publication id)
        to the association object (maintained as a list)
        TODO we need to greatly expand this function!

        :param identifier:
        :return:
        """

        if identifier is not None and identifier.strip() != '':
            self.source += [identifier]

        return

    def add_provenance(self, identifier):

        if identifier is not None and identifier.strip() != '':
            self.provenance += [identifier]

        return

    def load_all_properties(self, g):
        props = {
            self.OBJECTPROP: self.object_properties,
            self.ANNOTPROP: self.annotation_properties,
            self.DATAPROP: self.datatype_properties
        }

        for p in props:
            self.gu.loadProperties(g, props[p], p)

        return

    def _get_source_uri(self, pub_id):
        """
        Given some kind of pub_id (which might be a CURIE or url),
        convert it into a proper node.

        :param pub_id:
        :return: source: Well-formed URI for the given identifier (or url)
        """

        source = None
        if re.compile('http').match(pub_id):
            source = URIRef(pub_id)
        else:
            u = self.gu.getNode(pub_id)
            if u is not None:
                source = URIRef(u)
            else:
                logger.error(
                    "An id we don't know how to deal with: %s", pub_id)

        return source

    @staticmethod
    def make_association_id(definedby, subject, predicate, object,
                            attributes=None):
        """
        A method to create unique identifiers for OBAN-style associations,
        based on all the parts of the association
        If any of the items is empty or None, it will convert it to blank.
        It effectively md5 hashes the (+)-joined string from the values.
        Subclasses of Assoc can submit an additional array of attributes
        that will be added to the ID.

        :param definedby: The (data) resource that provided the annotation
        :param subject:
        :param predicate:
        :param object:
        :param attributes:
        :return:
        """

        # note others available:
        #   md5(), sha1(), sha224(), sha256(), sha384(), and sha512()
        # TEC: at our scale, md5 is in danger of having collisions.
        # putting definedby first,
        # as this will usually be the datasource providing the annotation
        # this will end up making the first few parts of the id
        # be the same for all annotations in that resource
        items_to_hash = [definedby, subject, predicate, object]
        if attributes is not None:
            items_to_hash += attributes

        for i, val in enumerate(items_to_hash):
            if val is None:
                items_to_hash[i] = ''

        byte_string = '+'.join(items_to_hash).encode("utf-8")

        # TODO put this in a util?
        return ':'.join(('MONARCH', hashlib.md5(byte_string).hexdigest()))
Пример #8
0
    def _process_ortholog_classes(self, limit=None):
        """
        This method add the KEGG orthology classes to the graph.

        If there's an embedded enzyme commission number,
        that is added as an xref.

        Triples created:
        <orthology_class_id> is a class
        <orthology_class_id> has label <orthology_symbols>
        <orthology_class_id> has description <orthology_description>
        :param limit:

        :return:
        """

        logger.info("Processing ortholog classes")
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        line_counter = 0
        gu = GraphUtils(curie_map.get())
        raw = '/'.join((self.rawdir, self.files['ortholog_classes']['file']))
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (orthology_class_id, orthology_class_name) = row

                if self.testMode and \
                        orthology_class_id not in \
                        self.test_ids['orthology_classes']:
                    continue

                # The orthology class is essentially a KEGG gene ID
                # that is species agnostic.
                # Add the ID and label as a gene family class

                other_labels = re.split(r'[;,]', orthology_class_name)
                # the first one is the label we'll use
                orthology_label = other_labels[0]

                orthology_class_id = 'KEGG-'+orthology_class_id.strip()

                orthology_type = OrthologyAssoc.terms['gene_family']
                gu.addClassToGraph(g, orthology_class_id, orthology_label,
                                   orthology_type)
                if len(other_labels) > 1:
                    # add the rest as synonyms
                    # todo skip the first
                    for s in other_labels:
                        gu.addSynonym(g, orthology_class_id, s.strip())

                    # add the last one as the description
                    d = other_labels[len(other_labels)-1]
                    gu.addDescription(g, orthology_class_id, d)

                    # add the enzyme commission number (EC:1.2.99.5)as an xref
                    # sometimes there's two, like [EC:1.3.5.1 1.3.5.4]
                    # can also have a dash, like EC:1.10.3.-
                    ec_matches = re.findall(r'((?:\d+|\.|-){5,7})', d)
                    if ec_matches is not None:
                        for ecm in ec_matches:
                            gu.addXref(g, orthology_class_id, 'EC:'+ecm)

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

        logger.info("Done with ortholog classes")
        return
Пример #9
0
    def _process_data(self, raw, limit=None):
        logger.info("Processing Data from %s", raw)
        gu = GraphUtils(curie_map.get())

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        geno = Genotype(g)
        line_counter = 0
        gu.loadAllProperties(g)
        gu.loadObjectProperties(g, geno.object_properties)

        # Add the taxon as a class
        taxon_id = 'NCBITaxon:10090'  # map to Mus musculus
        gu.addClassToGraph(g, taxon_id, None)

        # with open(raw, 'r', encoding="utf8") as csvfile:
        with gzip.open(raw, 'rt') as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            next(filereader, None)  # skip the header row
            for row in filereader:
                line_counter += 1

                (marker_accession_id, marker_symbol, phenotyping_center,
                 colony, sex, zygosity, allele_accession_id, allele_symbol,
                 allele_name, strain_accession_id, strain_name, project_name,
                 project_fullname, pipeline_name, pipeline_stable_id,
                 procedure_stable_id, procedure_name, parameter_stable_id,
                 parameter_name, top_level_mp_term_id, top_level_mp_term_name,
                 mp_term_id, mp_term_name, p_value, percentage_change,
                 effect_size, statistical_method, resource_name) = row

                if self.testMode and marker_accession_id not in self.test_ids:
                    continue

                # ##### cleanup some of the identifiers ######
                zygosity_id = self._map_zygosity(zygosity)

                # colony ids sometimes have <> in them, spaces,
                # or other non-alphanumerics and break our system;
                # replace these with underscores
                colony_id = '_'+re.sub(r'\W+', '_', colony)
                if self.nobnodes:
                    colony_id = ':'+colony_id

                if not re.match(r'MGI', allele_accession_id):
                    allele_accession_id = \
                        '_IMPC-'+re.sub(r':', '', allele_accession_id)
                    if self.nobnodes:
                        allele_accession_id = ':'+allele_accession_id
                if re.search(r'EUROCURATE', strain_accession_id):
                    # the eurocurate links don't resolve at IMPC
                    strain_accession_id = '_'+strain_accession_id
                    if self.nobnodes:
                        strain_accession_id = ':'+strain_accession_id
                elif not re.match(r'MGI', strain_accession_id):
                    logger.info(
                        "Found a strange strain accession...%s",
                        strain_accession_id)
                    strain_accession_id = 'IMPC:'+strain_accession_id

                ######################
                # first, add the marker and variant to the graph as with MGI,
                # the allele is the variant locus.  IF the marker is not known,
                # we will call it a sequence alteration.  otherwise,
                # we will create a BNode for the sequence alteration.
                sequence_alteration_id = variant_locus_id = None
                variant_locus_name = sequence_alteration_name = None

                # extract out what's within the <> to get the symbol
                if re.match(r'.*<.*>', allele_symbol):
                    sequence_alteration_name = \
                        re.match(r'.*<(.*)>', allele_symbol).group(1)
                else:
                    sequence_alteration_name = allele_symbol

                if marker_accession_id is not None and \
                        marker_accession_id == '':
                    logger.warning(
                        "Marker unspecified on row %d", line_counter)
                    marker_accession_id = None

                if marker_accession_id is not None:
                    variant_locus_id = allele_accession_id
                    variant_locus_name = allele_symbol
                    variant_locus_type = geno.genoparts['variant_locus']
                    geno.addGene(marker_accession_id, marker_symbol,
                                 geno.genoparts['gene'])
                    geno.addAllele(variant_locus_id, variant_locus_name,
                                   variant_locus_type, None)
                    geno.addAlleleOfGene(variant_locus_id, marker_accession_id)

                    sequence_alteration_id = \
                        '_seqalt'+re.sub(r':', '', allele_accession_id)
                    if self.nobnodes:
                        sequence_alteration_id = ':'+sequence_alteration_id
                    geno.addSequenceAlterationToVariantLocus(
                        sequence_alteration_id, variant_locus_id)

                else:
                    sequence_alteration_id = allele_accession_id

                # IMPC contains targeted mutations with either gene traps,
                # knockouts, insertion/intragenic deletions.
                # but I don't really know what the SeqAlt is here,
                # so I don't add it.
                geno.addSequenceAlteration(sequence_alteration_id,
                                           sequence_alteration_name)

                # #############    BUILD THE COLONY    #############
                # First, let's describe the colony that the animals come from
                # The Colony ID refers to the ES cell clone
                #   used to generate a mouse strain.
                # Terry sez: we use this clone ID to track
                #   ES cell -> mouse strain -> mouse phenotyping.
                # The same ES clone maybe used at multiple centers,
                # so we have to concatenate the two to have a unique ID.
                # some useful reading about generating mice from ES cells:
                # http://ki.mit.edu/sbc/escell/services/details

                # here, we'll make a genotype
                # that derives from an ES cell with a given allele.
                # the strain is not really attached to the colony.

                # the colony/clone is reflective of the allele,
                # with unknown zygosity
                stem_cell_class = 'ERO:0002002'
                gu.addIndividualToGraph(g, colony_id, colony, stem_cell_class)

                # vslc of the colony has unknown zygosity
                # note that we will define the allele
                # (and it's relationship to the marker, etc.) later
                # FIXME is it really necessary to create this vslc
                # when we always know it's unknown zygosity?
                vslc_colony = \
                    '_'+allele_accession_id+geno.zygosity['indeterminate']
                vslc_colony = re.sub(r':', '', vslc_colony)
                if self.nobnodes:
                    vslc_colony = ':'+vslc_colony
                vslc_colony_label = allele_symbol+'/<?>'
                # for ease of reading, we make the colony genotype variables.
                # in the future, it might be desired to keep the vslcs
                colony_genotype_id = vslc_colony
                colony_genotype_label = vslc_colony_label
                geno.addGenotype(colony_genotype_id, colony_genotype_label)
                geno.addParts(allele_accession_id, colony_genotype_id,
                              geno.object_properties['has_alternate_part'])
                geno.addPartsToVSLC(
                    vslc_colony, allele_accession_id, None,
                    geno.zygosity['indeterminate'],
                    geno.object_properties['has_alternate_part'])
                gu.addTriple(
                    g, colony_id,
                    geno.object_properties['has_genotype'],
                    colony_genotype_id)

                # ##########    BUILD THE ANNOTATED GENOTYPE    ##########
                # now, we'll build the genotype of the individual that derives
                # from the colony/clone genotype that is attached to
                # phenotype = colony_id + strain + zygosity + sex
                # (and is derived from a colony)

                # this is a sex-agnostic genotype
                genotype_id = \
                    self.make_id(
                        (colony_id + phenotyping_center + zygosity +
                         strain_accession_id))
                geno.addSequenceDerivesFrom(genotype_id, colony_id)

                # build the VSLC of the sex-agnostic genotype
                # based on the zygosity
                allele1_id = allele_accession_id
                allele2_id = allele2_rel = None
                allele1_label = allele_symbol
                allele2_label = '<?>'
                # Making VSLC labels from the various parts,
                # can change later if desired.
                if zygosity == 'heterozygote':
                    allele2_label = re.sub(r'<.*', '<+>', allele1_label)
                    allele2_id = None
                elif zygosity == 'homozygote':
                    allele2_label = allele1_label
                    allele2_id = allele1_id
                    allele2_rel = geno.object_properties['has_alternate_part']
                elif zygosity == 'hemizygote':
                    allele2_label = re.sub(r'<.*', '<0>', allele1_label)
                    allele2_id = None
                elif zygosity == 'not_applicable':
                    allele2_label = re.sub(r'<.*', '<?>', allele1_label)
                    allele2_id = None
                else:
                    logger.warning("found unknown zygosity %s", zygosity)
                    break
                vslc_name = '/'.join((allele1_label, allele2_label))

                # Add the VSLC
                vslc_id = '_' + '-'.join((marker_accession_id,
                                          allele_accession_id, zygosity))
                vslc_id = re.sub(r':', '', vslc_id)
                if self.nobnodes:
                    vslc_id = ':'+vslc_id
                gu.addIndividualToGraph(
                    g, vslc_id, vslc_name,
                    geno.genoparts['variant_single_locus_complement'])
                geno.addPartsToVSLC(
                    vslc_id, allele1_id, allele2_id, zygosity_id,
                    geno.object_properties['has_alternate_part'],
                    allele2_rel)

                # add vslc to genotype
                geno.addVSLCtoParent(vslc_id, genotype_id)

                # note that the vslc is also the gvc
                gu.addType(
                    g, vslc_id,
                    Genotype.genoparts['genomic_variation_complement'])

                # Add the genomic background
                # create the genomic background id and name
                if strain_accession_id != '':
                    genomic_background_id = strain_accession_id
                else:
                    genomic_background_id = None

                genotype_name = vslc_name
                if genomic_background_id is not None:
                    geno.addGenotype(
                        genomic_background_id, strain_name,
                        geno.genoparts['genomic_background'])

                    # make a phenotyping-center-specific strain
                    # to use as the background
                    pheno_center_strain_label = \
                        strain_name + '/' + phenotyping_center
                    pheno_center_strain_id = \
                        '-'.join((re.sub(r':', '', genomic_background_id),
                                  re.sub(r'\s', '_', phenotyping_center)))
                    if not re.match(r'^_', pheno_center_strain_id):
                        pheno_center_strain_id = '_'+pheno_center_strain_id
                    if self.nobnodes:
                        pheno_center_strain_id = ':'+pheno_center_strain_id
                    geno.addGenotype(pheno_center_strain_id,
                                     pheno_center_strain_label,
                                     geno.genoparts['genomic_background'])
                    geno.addSequenceDerivesFrom(pheno_center_strain_id,
                                                genomic_background_id)

                    # Making genotype labels from the various parts,
                    # can change later if desired.
                    # since the genotype is reflective of the place
                    # it got made, should put that in to disambiguate
                    genotype_name = \
                        genotype_name+' ['+pheno_center_strain_label+']'
                    geno.addGenomicBackgroundToGenotype(
                        pheno_center_strain_id, genotype_id)
                    geno.addTaxon(pheno_center_strain_id, taxon_id)
                # this is redundant, but i'll keep in in for now
                geno.addSequenceDerivesFrom(genotype_id, colony_id)
                genotype_name += '['+colony+']'
                geno.addGenotype(genotype_id, genotype_name)

                # Make the sex-qualified genotype,
                # which is what the phenotype is associated with
                sex_qualified_genotype_id = \
                    self.make_id(
                        (colony_id + phenotyping_center + zygosity +
                         strain_accession_id+sex))
                sex_qualified_genotype_label = genotype_name+' ('+sex+')'
                if sex == 'male':
                    sq_type_id = geno.genoparts['male_genotype']
                elif sex == 'female':
                    sq_type_id = geno.genoparts['female_genotype']
                else:
                    sq_type_id = geno.genoparts['sex_qualified_genotype']

                geno.addGenotype(
                    sex_qualified_genotype_id,
                    sex_qualified_genotype_label, sq_type_id)
                geno.addParts(
                    genotype_id, sex_qualified_genotype_id,
                    geno.object_properties['has_alternate_part'])

                if genomic_background_id is not None and \
                        genomic_background_id != '':
                    # Add the taxon to the genomic_background_id
                    geno.addTaxon(taxon_id, genomic_background_id)
                else:
                    # add it as the genomic background
                    geno.addTaxon(taxon_id, genotype_id)

                # #############    BUILD THE G2P ASSOC    #############
                # from an old email dated July 23 2014:
                # Phenotypes associations are made to
                # imits colony_id+center+zygosity+gender

                phenotype_id = mp_term_id

                # it seems that sometimes phenotype ids are missing.
                # indicate here
                if phenotype_id is None or phenotype_id == '':
                    logger.warning(
                        "No phenotype id specified for row %d: %s",
                        line_counter, str(row))
                    continue
                # experimental_phenotypic_evidence This was used in ZFIN
                eco_id = "ECO:0000059"

                # the association comes as a result of a g2p from
                # a procedure in a pipeline at a center and parameter tested

                assoc = G2PAssoc(self.name, sex_qualified_genotype_id,
                                 phenotype_id)
                assoc.add_evidence(eco_id)
                # assoc.set_score(float(p_value))

                # TODO add evidence instance using
                # pipeline_stable_id +
                # procedure_stable_id +
                # parameter_stable_id

                assoc.add_association_to_graph(g)
                assoc_id = assoc.get_association_id()

                # add a free-text description
                description = \
                    ' '.join((mp_term_name, 'phenotype determined by',
                              phenotyping_center, 'in an',
                              procedure_name, 'assay where',
                              parameter_name.strip(),
                              'was measured with an effect_size of',
                              str(round(float(effect_size), 5)),
                              '(p =', "{:.4e}".format(float(p_value)), ').'))

                gu.addDescription(g, assoc_id, description)

                # TODO add provenance information
                # resource_id = resource_name
                # assoc.addSource(g, assoc_id, resource_id)

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

        gu.loadProperties(g, G2PAssoc.object_properties, gu.OBJPROP)
        gu.loadProperties(g, G2PAssoc.annotation_properties, gu.ANNOTPROP)
        gu.loadProperties(g, G2PAssoc.datatype_properties, gu.DATAPROP)

        return
Пример #10
0
class OMIA(Source):
    """
    This is the parser for the
    [Online Mendelian Inheritance in Animals
    (OMIA)](http://www.http://omia.angis.org.au),
    from which we process inherited disorders, other (single-locus) traits,
    and genes in >200 animal species (other than human and mouse and rats).

    We generate the omia graph to include the following information:
    * genes
    * animal taxonomy, and breeds as instances of those taxa
        (breeds are akin to "strains" in other taxa)
    * animal diseases, along with species-specific subtypes of those diseases
    * publications (and their mapping to PMIDs, if available)
    * gene-to-phenotype associations (via an anonymous variant-locus
    * breed-to-phenotype associations

    We make links between OMIA and OMIM in two ways:
    1.  mappings between OMIA and OMIM are created as OMIA --> hasdbXref OMIM
    2.  mappings between a breed and OMIA disease are created
        to be a model for the mapped OMIM disease,
        IF AND ONLY IF it is a 1:1 mapping.
        there are some 1:many mappings,
        and these often happen if the OMIM item is a gene.

    Because many of these species are not covered in
    the PANTHER orthology datafiles, we also pull any orthology
    relationships from the gene_group files from NCBI.

    """

    files = {
        'data': {
            'file': 'omia.xml.gz',
            'url': 'http://omia.angis.org.au/dumps/omia.xml.gz'},
    }

    def __init__(self):
        Source.__init__(self, 'omia')

        self.load_bindings()

        self.dataset = Dataset(
            'omia', 'Online Mendelian Inheritance in Animals',
            'http://omia.angis.org.au', None, None,
            'http://sydney.edu.au/disclaimer.shtml')

        self.id_hash = {
            'article': {},
            'phene': {},
            'breed': {},
            'taxon': {},
            'gene': {}
        }
        self.label_hash = {}
        self.gu = GraphUtils(curie_map.get())
        # used to store the omia to omim phene mappings
        self.omia_omim_map = {}
        # used to store the unique genes that have phenes
        # (for fetching orthology)
        self.annotated_genes = set()

        self.test_ids = {
            'disease': [
                'OMIA:001702', 'OMIA:001867', 'OMIA:000478', 'OMIA:000201',
                'OMIA:000810', 'OMIA:001400'],
            'gene': [
                492297, 434, 492296, 3430235, 200685834, 394659996, 200685845,
                28713538, 291822383],
            'taxon': [9691, 9685, 9606, 9615, 9913, 93934, 37029, 9627, 9825],
            # to be filled in during parsing of breed table
            # for lookup by breed-associations
            'breed': []
        }
        # to store a map of omia ids and any molecular info
        # to write a report for curation
        self.stored_omia_mol_gen = {}
        self.g = self.graph
        self.geno = Genotype(self.g)
        return

    def fetch(self, is_dl_forced=False):
        """
        :param is_dl_forced:
        :return:
        """
        self.get_files(is_dl_forced)

        ncbi = NCBIGene()
        # ncbi.fetch()
        gene_group = ncbi.files['gene_group']
        self.fetch_from_url(
            gene_group['url'], '/'.join((ncbi.rawdir, gene_group['file'])),
            False)

        return

    def parse(self, limit=None):
        # names of tables to iterate - probably don't need all these:
        # Article_Breed, Article_Keyword, Article_Gene, Article_Keyword,
        # Article_People, Article_Phene, Articles, Breed, Breed_Phene,
        # Genes_gb, Group_Categories, Group_MPO, Inherit_Type, Keywords,
        # Landmark, Lida_Links, OMIA_Group, OMIA_author, Omim_Xref, People,
        # Phene, Phene_Gene, Publishers, Resources, Species_gb, Synonyms

        self.scrub()

        if limit is not None:
            logger.info("Only parsing first %d rows", limit)

        logger.info("Parsing files...")

        if self.testOnly:
            self.testMode = True

        if self.testMode:
            self.g = self.testgraph
        else:
            self.g = self.graph
        self.geno = Genotype(self.g)

        # we do three passes through the file
        # first process species (two others reference this one)
        self.process_species(limit)

        # then, process the breeds, genes, articles, and other static stuff
        self.process_classes(limit)

        # next process the association data
        self.process_associations(limit)

        # process the vertebrate orthology for genes
        # that are annotated with phenotypes
        ncbi = NCBIGene()
        ncbi.add_orthologs_by_gene_group(self.g, self.annotated_genes)

        self.load_core_bindings()
        self.load_bindings()

        logger.info("Done parsing.")

        self.write_molgen_report()

        return

    def scrub(self):
        """
        The XML file seems to have mixed-encoding;
        we scrub out the control characters
        from the file for processing.
        :return:

        """

        logger.info(
            "Scrubbing out the nasty characters that break our parser.")

        myfile = '/'.join((self.rawdir, self.files['data']['file']))
        tmpfile = '/'.join((self.rawdir, self.files['data']['file']+'.tmp.gz'))
        t = gzip.open(tmpfile, 'wb')
        du = DipperUtil()
        with gzip.open(myfile, 'rb') as f:
            filereader = io.TextIOWrapper(f, newline="")
            for l in filereader:
                l = du.remove_control_characters(l) + '\n'
                t.write(l.encode('utf-8'))
        t.close()

        # move the temp file
        logger.info("Replacing the original data with the scrubbed file.")
        shutil.move(tmpfile, myfile)
        return

    # ###################### XML LOOPING FUNCTIONS ##################

    def process_species(self, limit):
        """
        Loop through the xml file and process the species.
        We add elements to the graph, and store the
        id-to-label in the label_hash dict.
        :param limit:
        :return:
        """

        myfile = '/'.join((self.rawdir, self.files['data']['file']))

        f = gzip.open(myfile, 'rb')
        filereader = io.TextIOWrapper(f, newline="")

        filereader.readline()  # remove the xml declaration line

        for event, elem in ET.iterparse(filereader):
            # Species ids are == genbank species ids!
            self.process_xml_table(
                elem, 'Species_gb', self._process_species_table_row, limit)

        f.close()

        return

    def process_classes(self, limit):
        """
        Loop through the xml file and process the articles,
        breed, genes, phenes, and phenotype-grouping classes.
        We add elements to the graph,
        and store the id-to-label in the label_hash dict,
        along with the internal key-to-external id in the id_hash dict.
        The latter are referenced in the association processing functions.

        :param limit:
        :return:

        """

        myfile = '/'.join((self.rawdir, self.files['data']['file']))

        f = gzip.open(myfile, 'rb')
        filereader = io.TextIOWrapper(f, newline="")

        filereader.readline()  # remove the xml declaration line

        parser = ET.XMLParser(encoding='utf-8')

        for event, elem in ET.iterparse(filereader, parser=parser):
            self.process_xml_table(
                elem, 'Articles', self._process_article_row, limit)
            self.process_xml_table(
                elem, 'Breed', self._process_breed_row, limit)
            self.process_xml_table(
                elem, 'Genes_gb', self._process_gene_row, limit)
            self.process_xml_table(
                elem, 'OMIA_Group', self._process_omia_group_row, limit)
            self.process_xml_table(
                elem, 'Phene', self._process_phene_row, limit)
            self.process_xml_table(
                elem, 'Omim_Xref', self._process_omia_omim_map, limit)

        f.close()

        # post-process the omia-omim associations to filter out the genes
        # (keep only phenotypes/diseases)
        self.clean_up_omim_genes()

        return

    def process_associations(self, limit):
        """
        Loop through the xml file and process the article-breed, article-phene,
        breed-phene, phene-gene associations, and the external links to LIDA.

        :param limit:
        :return:

        """

        myfile = '/'.join((self.rawdir, self.files['data']['file']))

        f = gzip.open(myfile, 'rb')
        filereader = io.TextIOWrapper(f, newline="")

        filereader.readline()  # remove the xml declaration line

        for event, elem in ET.iterparse(filereader):
            self.process_xml_table(
                elem, 'Article_Breed', self._process_article_breed_row, limit)
            self.process_xml_table(
                elem, 'Article_Phene', self._process_article_phene_row, limit)
            self.process_xml_table(
                elem, 'Breed_Phene', self._process_breed_phene_row, limit)
            self.process_xml_table(
                elem, 'Lida_Links', self._process_lida_links_row, limit)
            self.process_xml_table(
                elem, 'Phene_Gene', self._process_phene_gene_row, limit)
            self.process_xml_table(
                elem, 'Group_MPO', self._process_group_mpo_row, limit)

        f.close()

        return

    # ############ INDIVIDUAL TABLE-LEVEL PROCESSING FUNCTIONS ################

    def _process_species_table_row(self, row):
        # gb_species_id, sci_name, com_name, added_by, date_modified
        tax_id = 'NCBITaxon:'+str(row['gb_species_id'])
        sci_name = row['sci_name']
        com_name = row['com_name']

        if self.testMode and \
                (int(row['gb_species_id']) not in self.test_ids['taxon']):
            return

        self.gu.addClassToGraph(self.g, tax_id, sci_name)
        if com_name != '':
            self.gu.addSynonym(self.g, tax_id, com_name)
            self.label_hash[tax_id] = com_name  # for lookup later
        else:
            self.label_hash[tax_id] = sci_name

        return

    def _process_breed_row(self, row):

        # in test mode, keep all breeds of our test species
        if self.testMode and \
                (int(row['gb_species_id']) not in self.test_ids['taxon']):
            return

        # save the breed keys in the test_ids for later processing
        self.test_ids['breed'] += [int(row['breed_id'])]

        breed_id = self.make_breed_id(row['breed_id'])

        self.id_hash['breed'][row['breed_id']] = breed_id
        tax_id = 'NCBITaxon:'+str(row['gb_species_id'])
        breed_label = row['breed_name']
        species_label = self.label_hash.get(tax_id)
        if species_label is not None:
            breed_label = breed_label + ' ('+species_label+')'

        self.gu.addIndividualToGraph(self.g, breed_id, breed_label, tax_id)
        self.label_hash[breed_id] = breed_label

        return

    def _process_phene_row(self, row):

        phenotype_id = None
        sp_phene_label = row['phene_name']
        if sp_phene_label == '':
            sp_phene_label = None
        if 'omia_id' not in row:
            logger.info("omia_id not present for %s", row['phene_id'])
            omia_id = self._make_internal_id('phene', phenotype_id)
        else:
            omia_id = 'OMIA:'+str(row['omia_id'])

        if self.testMode and not\
                (int(row['gb_species_id']) in self.test_ids['taxon'] and
                 omia_id in self.test_ids['disease']):
            return
        # add to internal hash store for later lookup
        self.id_hash['phene'][row['phene_id']] = omia_id

        descr = row['summary']
        if descr == '':
            descr = None

        # omia label
        omia_label = self.label_hash.get(omia_id)

        # add the species-specific subclass (TODO please review this choice)
        gb_species_id = row['gb_species_id']

        if gb_species_id != '':
            sp_phene_id = '-'.join((omia_id, gb_species_id))
        else:
            logger.error(
                "No species supplied in species-specific phene table for %s",
                omia_id)
            return

        species_id = 'NCBITaxon:'+str(gb_species_id)
        # use this instead
        species_label = self.label_hash.get('NCBITaxon:'+gb_species_id)
        if sp_phene_label is None and \
                omia_label is not None and species_label is not None:
            sp_phene_label = ' '.join((omia_label, 'in', species_label))
        self.gu.addClassToGraph(
            self.g, sp_phene_id, sp_phene_label, omia_id, descr)
        # add to internal hash store for later lookup
        self.id_hash['phene'][row['phene_id']] = sp_phene_id
        self.label_hash[sp_phene_id] = sp_phene_label
        # add each of the following descriptions,
        # if they are populated, with a tag at the end.
        for item in [
                'clin_feat', 'history', 'pathology', 'mol_gen', 'control']:
            if row[item] is not None and row[item] != '':
                self.gu.addDescription(
                    self.g, sp_phene_id, row[item] + ' ['+item+']')
        # if row['symbol'] is not None:  # species-specific
        # CHECK ME - sometimes spaces or gene labels
        #     gu.addSynonym(g, sp_phene, row['symbol'])

        self.gu.addOWLPropertyClassRestriction(
            self.g, sp_phene_id, self.gu.object_properties['in_taxon'],
            species_id)

        # add inheritance as an association
        inheritance_id = self._map_inheritance_term_id(row['inherit'])
        if inheritance_id is not None:
            assoc = DispositionAssoc(self.name, sp_phene_id, inheritance_id)
            assoc.add_association_to_graph(self.g)

        if row['characterised'] == 'Yes':
            self.stored_omia_mol_gen[omia_id] = {
                'mol_gen': row['mol_gen'],
                'map_info': row['map_info'],
                'species': row['gb_species_id']}

        return

    def write_molgen_report(self):
        import csv
        logger.info("Writing G2P report for OMIA")
        f = '/'.join((self.outdir, 'omia_molgen_report.txt'))

        with open(f, 'w', newline='\n') as csvfile:
            writer = csv.writer(csvfile, delimiter='\t')
            # write header
            h = ['omia_id', 'molecular_description', 'mapping_info', 'species']
            writer.writerow(h)
            for phene in self.stored_omia_mol_gen:
                writer.writerow((str(phene),
                                 self.stored_omia_mol_gen[phene]['mol_gen'],
                                 self.stored_omia_mol_gen[phene]['map_info'],
                                 self.stored_omia_mol_gen[phene]['species']))

        logger.info(
            "Wrote %d potential G2P descriptions for curation to %s",
            len(self.stored_omia_mol_gen), f)

        return

    def _process_article_row(self, row):

        # don't bother in test mode
        if self.testMode:
            return

        iarticle_id = self._make_internal_id('article', row['article_id'])
        self.id_hash['article'][row['article_id']] = iarticle_id
        rtype = None
        if row['journal'] != '':
            rtype = Reference.ref_types['journal_article']
        r = Reference(iarticle_id, rtype)

        if row['title'] is not None:
            r.setTitle(row['title'].strip())
        if row['year'] is not None:
            r.setYear(row['year'])
        r.addRefToGraph(self.g)

        if row['pubmed_id'] is not None:
            pmid = 'PMID:'+str(row['pubmed_id'])
            self.id_hash['article'][row['article_id']] = pmid
            self.gu.addSameIndividual(self.g, iarticle_id, pmid)
            self.gu.addComment(self.g, pmid, iarticle_id)

        return

    def _process_omia_group_row(self, row):
        omia_id = 'OMIA:'+row['omia_id']

        if self.testMode and omia_id not in self.test_ids['disease']:
            return

        group_name = row['group_name']
        group_summary = row['group_summary']

        disease_id = None
        group_category = row.get('group_category')
        disease_id = \
            self.map_omia_group_category_to_ontology_id(group_category)
        if disease_id is not None:
            self.gu.addClassToGraph(self.g, disease_id, None)
            if disease_id == 'MP:0008762':  # embryonic lethal
                # add this as a phenotype association
                # add embryonic onset
                assoc = D2PAssoc(self.name, omia_id, disease_id)
                assoc.add_association_to_graph(self.g)
                disease_id = None
        else:
            logger.info(
                "No disease superclass defined for %s:  %s",
                omia_id, group_name)
            # default to general disease  FIXME this may not be desired
            disease_id = 'DOID:4'

        if group_summary == '':
            group_summary = None
        if group_name == '':
            group_name = None

        self.gu.addClassToGraph(
            self.g, omia_id, group_name, disease_id, group_summary)

        self.label_hash[omia_id] = group_name

        return

    def _process_gene_row(self, row):
        if self.testMode and row['gene_id'] not in self.test_ids['gene']:
            return
        gene_id = 'NCBIGene:'+str(row['gene_id'])
        self.id_hash['gene'][row['gene_id']] = gene_id
        gene_label = row['symbol']
        self.label_hash[gene_id] = gene_label
        tax_id = 'NCBITaxon:'+str(row['gb_species_id'])
        gene_type_id = NCBIGene.map_type_of_gene(row['gene_type'])
        self.gu.addClassToGraph(self.g, gene_id, gene_label, gene_type_id)
        self.geno.addTaxon(tax_id, gene_id)

        return

    def _process_article_breed_row(self, row):
        # article_id, breed_id, added_by
        # don't bother putting these into the test... too many!

        # and int(row['breed_id']) not in self.test_ids['breed']:
        if self.testMode:
            return

        article_id = self.id_hash['article'].get(row['article_id'])
        breed_id = self.id_hash['breed'].get(row['breed_id'])

        # there's some missing data (article=6038).  in that case skip
        if article_id is not None:
            self.gu.addTriple(
                self.g, article_id, self.gu.object_properties['is_about'],
                breed_id)
        else:
            logger.warning("Missing article key %s", str(row['article_id']))

        return

    def _process_article_phene_row(self, row):
        """
        Linking articles to species-specific phenes.

        :param row:
        :return:
        """
        # article_id, phene_id, added_by
        # look up the article in the hashmap
        phenotype_id = self.id_hash['phene'].get(row['phene_id'])
        article_id = self.id_hash['article'].get(row['article_id'])

        omia_id = self._get_omia_id_from_phene_id(phenotype_id)
        if self.testMode and omia_id not in self.test_ids['disease'] \
                or phenotype_id is None or article_id is None:
            return

        # make a triple, where the article is about the phenotype
        self.gu.addTriple(
            self.g, article_id,
            self.gu.object_properties['is_about'], phenotype_id)

        return

    def _process_breed_phene_row(self, row):
        # Linking disorders/characteristic to breeds
        # breed_id, phene_id, added_by
        breed_id = self.id_hash['breed'].get(row['breed_id'])
        phene_id = self.id_hash['phene'].get(row['phene_id'])

        # get the omia id
        omia_id = self._get_omia_id_from_phene_id(phene_id)

        if (self.testMode and not (
                omia_id in self.test_ids['disease'] and
                int(row['breed_id']) in self.test_ids['breed']) or
                breed_id is None or phene_id is None):
            return

        # FIXME we want a different relationship here
        assoc = G2PAssoc(
            self.name, breed_id, phene_id,
            self.gu.object_properties['has_phenotype'])
        assoc.add_association_to_graph(self.g)

        # add that the breed is a model of the human disease
        # use the omia-omim mappings for this
        # we assume that we have already scrubbed out the genes
        # from the omim list, so we can make the model associations here

        omim_ids = self.omia_omim_map.get(omia_id)
        eco_id = "ECO:0000214"   # biological aspect of descendant evidence
        if omim_ids is not None and len(omim_ids) > 0:
            if len(omim_ids) > 1:
                logger.info(
                    "There's 1:many omia:omim mapping: %s, %s",
                    omia_id, str(omim_ids))
            for i in omim_ids:
                assoc = G2PAssoc(
                    self.name, breed_id, i,
                    self.gu.object_properties['model_of'])
                assoc.add_evidence(eco_id)
                assoc.add_association_to_graph(self.g)
                aid = assoc.get_association_id()

                breed_label = self.label_hash.get(breed_id)
                if breed_label is None:
                    breed_label = "this breed"

                m = re.search(r'\((.*)\)', breed_label)
                if m:
                    sp_label = m.group(1)
                else:
                    sp_label = ''

                phene_label = self.label_hash.get(phene_id)
                if phene_label is None:
                    phene_label = "phenotype"
                elif phene_label.endswith(sp_label):
                    # some of the labels we made already include the species;
                    # remove it to make a cleaner desc
                    phene_label = re.sub(r' in '+sp_label, '', phene_label)
                desc = ' '.join(
                    ("High incidence of", phene_label, "in", breed_label,
                     "suggests it to be a model of disease", i + "."))
                self.gu.addDescription(self.g, aid, desc)
        return

    def _process_lida_links_row(self, row):
        # lidaurl, omia_id, added_by
        omia_id = 'OMIA:'+row['omia_id']
        lidaurl = row['lidaurl']

        if self.testMode and omia_id not in self.test_ids['disease']:
            return

        self.gu.addXref(self.g, omia_id, lidaurl, True)

        return

    def _process_phene_gene_row(self, row):

        gene_id = self.id_hash['gene'].get(row['gene_id'])
        phene_id = self.id_hash['phene'].get(row['phene_id'])

        omia_id = self._get_omia_id_from_phene_id(phene_id)

        if self.testMode and not (
                omia_id in self.test_ids['disease'] and
                row['gene_id'] in self.test_ids['gene']) or\
                gene_id is None or phene_id is None:
            return

        # occasionally some phenes are missing!  (ex: 406)
        if phene_id is None:
            logger.warning("Phene id %s is missing", str(row['phene_id']))
            return

        gene_label = self.label_hash[gene_id]
        # some variant of gene_id has phenotype d
        vl = '_'+re.sub(r'NCBIGene:', '', str(gene_id)) + 'VL'
        if self.nobnodes:
            vl = ':'+vl
        self.geno.addAllele(vl, 'some variant of ' + gene_label)
        self.geno.addAlleleOfGene(vl, gene_id)
        assoc = G2PAssoc(self.name, vl, phene_id)
        assoc.add_association_to_graph(self.g)

        # add the gene id to the set of annotated genes
        # for later lookup by orthology
        self.annotated_genes.add(gene_id)

        return

    def _process_omia_omim_map(self, row):
        """
        Links OMIA groups to OMIM equivalents.
        :param row:
        :return:
        """
        # omia_id, omim_id, added_by

        omia_id = 'OMIA:'+row['omia_id']
        omim_id = 'OMIM:'+row['omim_id']

        # also store this for use when we say that a given animal is
        # a model of a disease
        if omia_id not in self.omia_omim_map:
            self.omia_omim_map[omia_id] = set()
        self.omia_omim_map[omia_id].add(omim_id)

        if self.testMode and omia_id not in self.test_ids['disease']:
            return

        self.gu.addXref(self.g, omia_id, omim_id)

        return

    def map_omia_group_category_to_ontology_id(self, category_num):
        """
        Using the category number in the OMIA_groups table,
        map them to a disease id.
        This may be superceeded by other MONDO methods.

        Platelet disorders will be more specific once
        https://github.com/obophenotype/human-disease-ontology/issues/46
        is fulfilled.

        :param category_num:
        :return:

        """

        category_map = {
            1: 'DOID:0014667',      # Inborn error of metabolism
            2: 'MESH:D004392',      # Dwarfism
            3: 'DOID:1682',         # congenital heart disease
            4: 'DOID:74',           # blood system disease
            5: 'DOID:3211',         # lysosomal storage disease
            6: 'DOID:16',           # integumentary system disease
            # --> retinal degeneration ==> OMIA:000830
            7: 'DOID:8466',         # progressive retinal atrophy
            8: 'DOID:0050572',      # Cone–rod dystrophy
            9: 'MESH:C536122',      # stationary night blindness
            10: 'Orphanet:98553',   # developmental retinal disorder
            11: 'DOID:5679',        # retinal disorder
            12: 'Orphanet:90771',   # Disorder of Sex Development
            #  - what to do about this one?
            13: 'MP:0008762',       # embryonic lethal
            # - not sure what to do with this
            14: None,               # blood group
            # FIXME make me more specific
            15: 'DOID:2218',        # intrinsic platelet disorder
            # FIXME make me more specific
            16: 'DOID:2218',        # extrinsic platelet disorder
            17: None  # transgenic ???
        }

        disease_id = None
        if category_num is not None and int(category_num) in category_map:
            disease_id = category_map.get(int(category_num))
            logger.info(
                "Found %s for category %s", str(disease_id), str(category_num))
        else:
            logger.info(
                "There's a group category I don't know anything about: %s",
                str(category_num))

        return disease_id

    def _process_group_mpo_row(self, row):
        """
        Make OMIA to MP associations
        :param row:
        :return:
        """
        omia_id = 'OMIA:'+row['omia_id']
        mpo_num = int(row['MPO_no'])
        mpo_id = 'MP:'+str(mpo_num).zfill(7)

        assoc = D2PAssoc(self.name, omia_id, mpo_id)
        assoc.add_association_to_graph(self.g)

        return

    def clean_up_omim_genes(self):
        omim = OMIM()
        # get all the omim ids
        allomimids = set()
        for omia in self.omia_omim_map:
            allomimids.update(self.omia_omim_map[omia])

        entries_that_are_phenotypes = omim.process_entries(
            list(allomimids), filter_keep_phenotype_entry_ids, None, None)
        logger.info(
            "Filtered out %d/%d entries that are genes or features",
            len(allomimids)-len(entries_that_are_phenotypes), len(allomimids))

        # now iterate again and remove those non-phenotype ids
        removed_count = 0
        for omia in self.omia_omim_map:
            ids = self.omia_omim_map[omia]
            cleanids = set()
            for i in ids:
                if i in entries_that_are_phenotypes:
                    cleanids.add(i)
                else:
                    removed_count += 1  # keep track of how many we've removed
            self.omia_omim_map[omia] = cleanids

        logger.info(
            "Removed %d omim ids from the omia-to-omim map", removed_count)

        return

    def _make_internal_id(self, prefix, key):

        iid = '_'+''.join(('omia', prefix, 'key', str(key)))
        if self.nobnodes:
            iid = ':'+iid

        return iid

    def make_breed_id(self, key):
        breed_id = 'OMIA-breed:'+str(key)

        return breed_id

    @staticmethod
    def _get_omia_id_from_phene_id(phene_id):
        omia_id = None
        if phene_id is not None:
            m = re.match(r'OMIA:\d+', str(phene_id))
            if m:
                omia_id = m.group(0)

        return omia_id

    @staticmethod
    def _map_inheritance_term_id(inheritance_symbol):

        inherit_map = {
            'A':  None,  # Autosomal
            'ACD': 'GENO:0000143',  # Autosomal co-dominant
            'ADV': None,  # autosomal dominant with variable expressivity
            'AID': 'GENO:0000259',  # autosomal incompletely dominant
            'ASD': 'GENO:0000145',  # autosomal semi-dominant
            # autosomal recessive, semi-lethal
            # using generic autosomal recessive
            'ASL': 'GENO:0000150',
            'D': 'GENO:0000147',  # autosomal dominant
            'M': None,  # multifactorial
            'MAT': None,  # Maternal
            # probably autosomal recessive
            # using generic autosomal recessive
            'PR':  'GENO:0000150',
            'R': 'GENO:0000150',  # Autosomal Recessive
            # Recessive Embryonic Lethal
            # using plain recessive
            'REL': 'GENO:0000148',
            # Autosomal Recessive Lethal
            # using plain autosomal recessive
            'RL': 'GENO:0000150',
            'S': 'GENO:0000146',  # Sex-linked   <--using allosomal dominant
            'SLi': None,  # Sex-limited
            'UD': 'GENO:0000144',  # Dominant
            'X': None,  # x-linked    # HP:0001417 ?
            # X-linked Dominant     <-- temp using allosomal dominant  FIXME
            'XLD': 'GENO:0000146',
            # X-linked Recessive    <-- temp using allosomal recessive  FIXME
            'XLR': 'GENO:0000149',
            'Y': None,  # Y-linked
            'Z': None,  # Z-linked
            # Z-linked recessive    <-- temp using allosomal recessive  FIXME
            'ZR': 'GENO:0000149',
            '999': None,  # Z-linked incompletely dominant
        }

        inheritance_id = inherit_map.get(inheritance_symbol)
        if inheritance_id is None and inheritance_symbol is not None:
            logger.warning(
                "No inheritance id is mapped for %s", inheritance_symbol)

        return inheritance_id

    def getTestSuite(self):
        import unittest
        from tests.test_omia import OMIATestCase

        test_suite = unittest.TestLoader().loadTestsFromTestCase(OMIATestCase)

        return test_suite
Пример #11
0
    def process_gaf(self, file, limit, id_map=None):

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        gu = GraphUtils(curie_map.get())
        geno = Genotype(g)
        logger.info("Processing Gene Associations from %s", file)
        line_counter = 0

        zfin = wbase = None
        if 7955 in self.tax_ids:
            zfin = ZFIN()
        elif 6239 in self.tax_ids:
            wbase = WormBase()

        with gzip.open(file, 'rb') as csvfile:
            filereader = csv.reader(io.TextIOWrapper(csvfile, newline=""),
                                    delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                # comments start with exclamation
                if re.match(r'!', ''.join(row)):
                    continue
                (db, gene_num, gene_symbol, qualifier, go_id, ref, eco_symbol,
                 with_or_from, aspect, gene_name, gene_synonym, object_type,
                 taxon, date, assigned_by, annotation_extension,
                 gene_product_form_id) = row

                # test for required fields
                if (db == '' or gene_num == '' or gene_symbol == '' or
                        go_id == '' or ref == '' or eco_symbol == '' or
                        aspect == '' or object_type == '' or taxon == '' or
                        date == '' or assigned_by == ''):
                    logger.error(
                        "Missing required part of annotation " +
                        "on row %d:\n"+'\t'.join(row),
                        line_counter)
                    continue

                # deal with qualifier NOT, contributes_to, colocalizes_with
                if re.search(r'NOT', qualifier):
                    continue

                db = self.clean_db_prefix(db)
                uniprotid = None
                gene_id = None
                if db == 'UniProtKB':
                    mapped_ids = id_map.get(gene_num)
                    if id_map is not None and mapped_ids is not None:
                        if len(mapped_ids) == 1:
                            gene_id = mapped_ids[0]
                            uniprotid = ':'.join((db, gene_num))
                            gene_num = re.sub(r'\w+\:', '', gene_id)
                        elif len(mapped_ids) > 1:
                            # logger.warning(
                            #   "Skipping gene id mapped for >1 gene %s -> %s",
                            #    gene_num, str(mapped_ids))
                            continue
                    else:
                        continue
                elif db == 'MGI':
                    gene_num = re.sub(r'MGI:', '', gene_num)
                    gene_id = ':'.join((db, gene_num))
                    gene_id = re.sub(r'MGI\:MGI\:', 'MGI:', gene_id)
                else:
                    gene_id = ':'.join((db, gene_num))

                if self.testMode \
                        and not(
                            re.match(r'NCBIGene', gene_id) and
                            int(gene_num) in self.test_ids):
                    continue

                gu.addClassToGraph(g, gene_id, gene_symbol)
                if gene_name != '':
                    gu.addDescription(g, gene_id, gene_name)
                if gene_synonym != '':
                    for s in re.split(r'\|', gene_synonym):
                        gu.addSynonym(g, gene_id, s.strip())
                if re.search(r'\|', taxon):
                    # TODO add annotations with >1 taxon
                    logger.info(">1 taxon (%s) on line %d.  skipping", taxon,
                                line_counter)
                else:
                    tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
                    geno.addTaxon(tax_id, gene_id)

                assoc = Assoc(self.name)

                assoc.set_subject(gene_id)
                assoc.set_object(go_id)

                eco_id = self.map_go_evidence_code_to_eco(eco_symbol)
                if eco_id is not None:
                    assoc.add_evidence(eco_id)

                refs = re.split(r'\|', ref)
                for r in refs:
                    r = r.strip()
                    if r != '':
                        prefix = re.split(r':', r)[0]
                        r = re.sub(prefix, self.clean_db_prefix(prefix), r)
                        r = re.sub(r'MGI\:MGI\:', 'MGI:', r)
                        ref = Reference(r)
                        if re.match(r'PMID', r):
                            ref_type = Reference.ref_types['journal_article']
                            ref.setType(ref_type)
                        ref.addRefToGraph(g)
                        assoc.add_source(r)

                # TODO add the source of the annotations from assigned by?

                aspect_rel_map = {
                    'P': gu.object_properties['involved_in'],  # involved in
                    'F': gu.object_properties['enables'],  # enables
                    'C': gu.object_properties['part_of']  # part of
                }

                if aspect not in aspect_rel_map:
                    logger.error("Aspect not recognized: %s", aspect)

                rel = aspect_rel_map.get(aspect)
                if aspect == 'F' and re.search(r'contributes_to', qualifier):
                    rel = gu.object_properties['contributes_to']
                assoc.set_relationship(rel)
                if uniprotid is not None:
                    assoc.set_description('Mapped from '+uniprotid)
                # object_type should be one of:
                # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
                # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
                # If the precise product type is unknown,
                # gene_product should be used

                assoc.add_association_to_graph(g)

                # Derive G2P Associations from IMP annotations
                # in version 2.1 Pipe will indicate 'OR'
                # and Comma will indicate 'AND'.
                # in version 2.0, multiple values are separated by pipes
                # where the pipe has been used to mean 'AND'
                if eco_symbol == 'IMP' and with_or_from != '':
                    withitems = re.split(r'\|', with_or_from)
                    phenotypeid = go_id+'PHENOTYPE'
                    # create phenotype associations
                    for i in withitems:
                        if i == '' or \
                                re.match(
                                    r'(UniProtKB|WBPhenotype|InterPro|HGNC)',
                                    i):
                            logger.warning(
                                "Don't know what having a uniprot id " +
                                "in the 'with' column means of %s",
                                uniprotid)
                            continue
                        i = re.sub(r'MGI\:MGI\:', 'MGI:', i)
                        i = re.sub(r'WB:', 'WormBase:', i)

                        # for worms and fish, they might give a RNAi or MORPH
                        # in these cases make a reagent-targeted gene
                        if re.search('MRPHLNO|CRISPR|TALEN', i):
                            targeted_gene_id = zfin.make_targeted_gene_id(
                                gene_id, i, self.nobnodes)
                            geno.addReagentTargetedGene(i, gene_id,
                                                        targeted_gene_id)
                            # TODO PYLINT why is this:
                            # Redefinition of assoc type from
                            # dipper.models.assoc.Association.Assoc to
                            # dipper.models.assoc.G2PAssoc.G2PAssoc
                            assoc = G2PAssoc(self.name, targeted_gene_id,
                                             phenotypeid)
                        elif re.search(r'WBRNAi', i):
                            targeted_gene_id = \
                                wbase.make_reagent_targeted_gene_id(
                                    gene_id, i, self.nobnodes)
                            geno.addReagentTargetedGene(
                                i, gene_id, targeted_gene_id)
                            assoc = G2PAssoc(
                                self.name, targeted_gene_id, phenotypeid)
                        else:
                            assoc = G2PAssoc(self.name, i, phenotypeid)
                        for r in refs:
                            r = r.strip()
                            if r != '':
                                prefix = re.split(r':', r)[0]
                                r = re.sub(
                                    prefix, self.clean_db_prefix(prefix), r)
                                r = re.sub(r'MGI\:MGI\:', 'MGI:', r)
                                assoc.add_source(r)
                                # experimental phenotypic evidence
                                assoc.add_evidence("ECO:0000059")
                        assoc.add_association_to_graph(g, self.nobnodes)
                        # TODO should the G2PAssoc be
                        # the evidence for the GO assoc?

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

        return
Пример #12
0
class Dataset:
    """
     this will produce the metadata about a dataset
     following the example laid out here:
     http://htmlpreview.github.io/?
     https://github.com/joejimbo/HCLSDatasetDescriptions/blob/master/Overview.html#appendix_1
     (mind the wrap)

    """

    namespaces = {
        'dctypes': 'http://purl.org/dc/dcmitype/',
        'pav': 'http://purl.org/pav/',
        'dcat': 'http://www.w3.org/ns/dcat#'
    }

    core_bindings = {'rdf': RDF, 'foaf': FOAF, 'xsd': XSD, 'dct': DCTERMS}

    def __init__(self, identifier, title, url, description=None,
                 license_url=None, data_rights=None):
        DCTYPES = Namespace(self.namespaces['dctypes'])
        self.gu = GraphUtils(curie_map.get())
        self.identifier = URIRef(':'+identifier)
        self.version = None
        self.date_issued = None
        self.date_accessed = None
        self.citation = set()
        self.set_access_date()
        self.license = license_url
        self.graph = Graph()
        self.load_bindings()
        self.graph.add((self.identifier, RDF['type'], DCTYPES['Dataset']))
        self.graph.add((self.identifier, DCTERMS['title'], Literal(title)))
        self.graph.add(
            (self.identifier, DCTERMS['identifier'], Literal(identifier)))
        self.graph.add((self.identifier, FOAF['page'], URIRef(url)))
        self.dipperized_version = URIRef('monarch'+str(self.date_accessed))
        # maybe in the future add the logo here:
        # schemaorg:logo <http://www.ebi.ac.uk/rdf/sites/ebi.ac.uk.rdf/files/resize/images/rdf/chembl_service_logo-146x48.gif> .

        # TODO add the licence info
        # FIXME:Temporarily making this in IF statement,
        #  can revert after all current resources are updated.
        if license_url is not None:
            self.graph.add(
                (self.identifier, DCTERMS['license'], URIRef(license_url)))
        else:
            logger.debug('No license provided.')
        if data_rights is not None:
            self.graph.add(
                (self.identifier, DCTERMS['rights'], Literal(data_rights)))
        else:
            logger.debug('No rights provided.')

        if description is not None:
            self.gu.addDescription(self.graph, self.identifier, description)
        return

    def load_bindings(self):
        for k in self.core_bindings:
            v = self.core_bindings[k]
            self.graph.bind(k, v)

        for k in self.namespaces.keys():
            v = self.namespaces[k]
            self.graph.bind(k, Namespace(v))

        return

    def setVersion(self, date_issued, version_id=None):
        """
        Legacy function...  should use the other set_* for version and date

        # TODO set as deprecated
        :param date_issued:
        :param version_id:
        :return:

        """

        if date_issued is not None:
            self.set_date_issued(date_issued)
        elif version_id is not None:
            # this shouldn't happen
            self.set_version_by_num(version_id)
        else:
            logger.error("No date or version set!")
            # TODO throw error
            return

        if version_id is not None:
            self.set_version_by_num(version_id)
        else:
            self.set_version_by_date(date_issued)

        logger.info("set version to %s", self.version)

        return

    def set_date_issued(self, date_issued):

        self.date_issued = date_issued
        self.graph.add(
            (self.identifier, DCTERMS['issued'], Literal(date_issued)))
        logger.info("setting date to %s", date_issued)

        return

    def set_version_by_date(self, date_issued=None):
        """
        This will set the version by the date supplied,
        the date already stored in the dataset description,
        or by the download date (today)
        :param date_issued:
        :return:
        """

        if date_issued is not None:
            d = date_issued
        elif self.date_issued is not None:
            d = self.date_issued
        else:
            d = self.date_accessed
            logger.info(
                "No date supplied for setting version; "
                "using download timestamp for date_issued")

        logger.info("setting version by date")
        self.set_version_by_num(d)

        return

    def set_version_by_num(self, version_num):
        PAV = Namespace(self.namespaces['pav'])

        self.version = URIRef(self.identifier+version_num)
        self.graph.add((self.version, DCTERMS['isVersionOf'], self.identifier))
        self.graph.add((self.version, PAV['version'], Literal(version_num)))

        logger.info("setting version to %s", self.version)

        # set the monarch-generated-version of the resource-version
        # TODO sync this up with the ontology version
        if version_num != self.date_accessed:
            self.dipperized_version = URIRef('monarch'+str(self.date_accessed))
            self.graph.add(
                (self.dipperized_version, DCTERMS['isVersionOf'],
                 self.version))
            self.graph.add(
                (self.dipperized_version, PAV['version'],
                 Literal(self.date_accessed)))
            self.graph.add(
                (self.dipperized_version, DCTERMS['issued'],
                 Literal(self.date_accessed, datatype=XSD.dateTime)))

        return

    def set_access_date(self):

        t = datetime.now()
        t_string = t.strftime("%Y-%m-%d-%H-%M")
        d = t_string
        self.date_accessed = d
        logger.info("Setting date of access to %s", self.date_accessed)

        return

    def setFileAccessUrl(self, url):
        DCAT = Namespace(self.namespaces['dcat'])
        self.graph.add((self.identifier, DCAT['accessURL'], URIRef(url)))
        return

    def getGraph(self):
        return self.graph

    def set_license(self, license):
        self.license = license
        return

    def get_license(self):

        return self.license

    def set_citation(self, citation_id):

        self.citation.add(citation_id)
        # TODO
        # gu.addTriple(self.identifier, 'cito:citeAsAuthority', citation_id)

        return