Пример #1
0
    def _process_genes(self, taxid, limit=None):
        """
        Read the tab-delimited Ensembl gene file for one taxon and add each
        gene to the graph as a class, with equivalences to NCBIGene (and
        HGNC when that column is present) and a taxon assertion.

        Expected columns: ensembl_gene_id, external_gene_name, description,
        gene_biotype, entrezgene and, for human (taxid '9606'), a trailing
        hgnc_id column.

        :param taxid: NCBI taxon number as a string; also the key into
            self.files for locating the raw file.
        :param limit: maximum number of rows to process when not in test
            mode; None means no limit.
        :return: None.  Returns early, before the property loading at the
            end, if a row has fewer than the five required columns.
        """
        gu = GraphUtils(curie_map.get())

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        geno = Genotype(g)

        raw = '/'.join((self.rawdir, self.files[taxid]['file']))
        line_counter = 0
        logger.info("Processing Ensembl genes for tax %s", taxid)
        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t')
            for row in filereader:
                # five columns are unpacked below, so five are required
                if len(row) < 5:
                    logger.error("Data error for file %s", raw)
                    return
                (ensembl_gene_id, external_gene_name, description,
                 gene_biotype, entrezgene) = row[0:5]

                # in the case of human genes, we also get the hgnc id,
                # and it is the last col; tolerate rows that lack it
                if taxid == '9606' and len(row) > 5:
                    hgnc_id = row[5]
                else:
                    hgnc_id = None

                # in test mode only keep rows whose entrez id is a test gene;
                # rows without an entrez id are always kept
                if self.testMode and entrezgene != '' \
                        and int(entrezgene) not in self.gene_ids:
                    continue

                line_counter += 1
                gene_id = 'ENSEMBL:'+ensembl_gene_id
                if description == '':
                    description = None
                gene_type_id = self._get_gene_type(gene_biotype)
                # NOTE(review): the mapped type is discarded right away, so
                # genes are added untyped.  Kept as in the original because
                # it looks deliberate - confirm before removing.
                gene_type_id = None
                gu.addClassToGraph(
                    g, gene_id, external_gene_name, gene_type_id, description)

                if entrezgene != '':
                    gu.addEquivalentClass(g, gene_id, 'NCBIGene:'+entrezgene)
                if hgnc_id is not None and hgnc_id != '':
                    gu.addEquivalentClass(g, gene_id, hgnc_id)
                geno.addTaxon('NCBITaxon:'+taxid, gene_id)

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        gu.loadProperties(g, Feature.object_properties, gu.OBJPROP)
        gu.loadProperties(g, Feature.data_properties, gu.DATAPROP)
        gu.loadProperties(g, Genotype.object_properties, gu.OBJPROP)
        gu.loadAllProperties(g)

        return
Пример #2
0
 def _process_gene_row(self, row):
     """
     Record a single gene row: remember its curie and label in the
     id/label lookups, add it to the graph as a class when a gene type
     is given, and attach its taxon.

     :param row: mapping with the keys 'gene_id', 'symbol',
         'gb_species_id' and 'gene_type'.
     :return: None
     """
     model = Model(self.graph)
     geno = Genotype(self.graph)

     gene_num = row['gene_id']
     # in test mode, ignore everything that is not a designated test gene
     if self.test_mode and gene_num not in self.test_ids['gene']:
         return

     gene_curie = 'NCBIGene:' + str(gene_num)
     self.id_hash['gene'][gene_num] = gene_curie

     symbol = row['symbol']
     self.label_hash[gene_curie] = symbol

     taxon_curie = 'NCBITaxon:' + str(row['gb_species_id'])

     gene_type = row['gene_type']
     # the class itself is only added when a gene type is available
     if gene_type is not None:
         model.addClassToGraph(gene_curie, symbol, self.resolve(gene_type))

     geno.addTaxon(taxon_curie, gene_curie)
Пример #3
0
    def process_gene_ids(self, limit):
        """
        Walk the gzipped, comma-separated 'gene_ids' file and add every
        gene as a class with its taxon; genes flagged 'Dead' are deprecated
        and a non-empty synonym column is added as a synonym.

        Example row:
        6239,WBGene00000001,aap-1,Y110A7A.10,Live,protein_coding_gene

        :param limit: row cap applied when not in test mode; None disables it.
        :return: None
        """
        raw = '/'.join((self.rawdir, self.files['gene_ids']['file']))

        g = self.testgraph if self.testMode else self.graph

        model = Model(g)
        logger.info("Processing: %s", self.files['gene_ids']['file'])
        geno = Genotype(g)
        with gzip.open(raw, 'rb') as gzfile:
            text = io.TextIOWrapper(gzfile, newline="")
            rows = csv.reader(text, delimiter=',', quotechar='\"')
            for line_counter, fields in enumerate(rows, 1):
                # exactly six columns are expected; the gene type is unused
                (taxon_num, gene_num, symbol, synonym, status,
                 _gene_type) = fields

                if self.testMode and gene_num not in self.test_ids['gene']:
                    continue

                taxon_curie = 'NCBITaxon:'+taxon_num
                gene_curie = 'WormBase:'+gene_num

                # label preference: symbol, then synonym, else no label
                label = symbol if symbol != '' else synonym
                if label == '':
                    label = None

                model.addClassToGraph(
                    gene_curie, label, Genotype.genoparts['gene'])
                if status == 'Dead':
                    model.addDeprecatedClass(gene_curie)
                geno.addTaxon(taxon_curie, gene_curie)
                if synonym:
                    model.addSynonym(gene_curie, synonym)

                # the row cap never applies in test mode
                if self.testMode or limit is None:
                    continue
                if line_counter > limit:
                    break
Пример #4
0
    def process_gene_ids(self, limit):
        """
        Read the gzipped, comma-separated 'gene_ids' file and add each gene
        to the graph as a class with its taxon.  Genes marked 'Dead' are
        deprecated, and a non-empty synonym is added as a synonym.

        Column positions are taken from self.files['gene_ids']['columns'];
        the file has no header row.  Rows whose column count differs from
        that list are logged and skipped.

        Example row:
        6239,WBGene00000001,aap-1,Y110A7A.10,Live,protein_coding_gene

        :param limit: stop once the reader's line number exceeds this value;
            None means no limit.
        :return: None
        """
        src_key = 'gene_ids'
        raw = '/'.join((self.rawdir, self.files[src_key]['file']))
        graph = self.graph
        model = Model(graph)
        geno = Genotype(graph)
        col = self.files[src_key]['columns']
        LOG.info("Processing: %s", self.files[src_key]['file'])

        # resolve the column positions once rather than for every row
        collen = len(col)
        taxon_idx = col.index('taxon_num')
        gene_idx = col.index('gene_num')
        symbol_idx = col.index('gene_symbol')
        synonym_idx = col.index('gene_synonym')
        live_idx = col.index('live')
        # the 'gene_type' column is currently not used

        with gzip.open(raw, 'rb') as csvfile:
            reader = csv.reader(io.TextIOWrapper(csvfile, newline=""),
                                delimiter=',',
                                quotechar='\"')
            # no header row to check
            for row in reader:
                if len(row) != collen:
                    LOG.error('In %s line %i expected %i colums but got %s.',
                              self.files[src_key]['file'], reader.line_num,
                              collen, row)
                    # a malformed row cannot be indexed safely; skip it
                    continue
                taxon_num = row[taxon_idx]
                gene_num = row[gene_idx]
                gene_symbol = row[symbol_idx]
                gene_synonym = row[synonym_idx]
                live = row[live_idx]

                taxon_curie = 'NCBITaxon:' + taxon_num
                gene_curie = 'WormBase:' + gene_num

                # fall back to the synonym as label when there is no symbol
                if gene_symbol == '':
                    gene_symbol = gene_synonym  # these are not the same in my book tec.
                if gene_symbol == '':
                    gene_symbol = None
                model.addClassToGraph(gene_curie, gene_symbol,
                                      self.globaltt['gene'])
                if live == 'Dead':
                    model.addDeprecatedClass(gene_curie,
                                             old_id_category=blv.terms['Gene'])
                geno.addTaxon(taxon_curie, gene_curie)
                if gene_synonym is not None and gene_synonym != '':
                    model.addSynonym(gene_curie, gene_synonym)

                if limit is not None and reader.line_num > limit:
                    break
Пример #5
0
    def process_gene_ids(self, limit):
        """
        Load genes from the gzipped 'gene_ids' CSV into the graph.

        Each row yields a gene class labelled with its symbol (or, failing
        that, its synonym), a taxon assertion, a deprecation when the gene
        is 'Dead', and a synonym when the synonym column is non-empty.

        Example row:
        6239,WBGene00000001,aap-1,Y110A7A.10,Live,protein_coding_gene

        :param limit: maximum row count outside of test mode; None for all.
        :return: None
        """
        file_name = self.files['gene_ids']['file']
        raw = '/'.join((self.rawdir, file_name))

        if self.testMode:
            graph = self.testgraph
        else:
            graph = self.graph

        model = Model(graph)
        logger.info("Processing: %s", self.files['gene_ids']['file'])
        rows_seen = 0
        geno = Genotype(graph)
        with gzip.open(raw, 'rb') as handle:
            wrapped = io.TextIOWrapper(handle, newline="")
            for record in csv.reader(wrapped, delimiter=',', quotechar='\"'):
                rows_seen += 1
                # six columns per row; the last (gene type) is not used
                taxon_num, gene_num, symbol, synonym, live, _unused = record

                if self.testMode and gene_num not in self.test_ids['gene']:
                    continue

                gene_curie = 'WormBase:' + gene_num
                taxon_curie = 'NCBITaxon:' + taxon_num

                # choose the label: symbol first, synonym second, else None
                if symbol != '':
                    label = symbol
                elif synonym != '':
                    label = synonym
                else:
                    label = None

                model.addClassToGraph(gene_curie, label,
                                      self.globaltt['gene'])
                if live == 'Dead':
                    model.addDeprecatedClass(gene_curie)
                geno.addTaxon(taxon_curie, gene_curie)
                if synonym is not None and synonym != '':
                    model.addSynonym(gene_curie, synonym)

                limit_reached = limit is not None and rows_seen > limit
                if limit_reached and not self.testMode:
                    break
Пример #6
0
    def _get_gene_info(self, limit):
        """
        Loop through the gzipped gene_info file and create the genes as
        classes (or as individuals when they are plain sequence features),
        typed with SO.  Adds their label, the name and alternate labels as
        synonyms, and hands the xrefs to self._add_gene_equivalencies.
        The chromosome and chr band get added as features, and the gene is
        made a subsequence of the band (or of the chromosome when the
        map location is not a simple band).

        Every filtered row is recorded in self.class_or_indiv, even past
        the limit; only the emission of triples stops once the limit is
        exceeded.

        :param limit: maximum number of filtered rows for which triples are
            added when not in test mode; None means no limit.
        :return: None

        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        geno = Genotype(g)
        model = Model(g)

        # not unzipping the file
        logger.info("Processing 'Gene Info' records")
        line_counter = 0
        gene_info = '/'.join((self.rawdir, self.files['gene_info']['file']))
        logger.info("FILE: %s", gene_info)
        # Add taxa and genome classes for those in our filter
        for tax_num in self.tax_ids:
            tax_id = ':'.join(('NCBITaxon', str(tax_num)))
            # tax label can get added elsewhere
            geno.addGenome(tax_id, str(tax_num))
            # label added elsewhere
            model.addClassToGraph(tax_id, None)
        with gzip.open(gene_info, 'rb') as f:
            # the first line is consumed as the header
            row = f.readline().decode().strip().split('\t')
            logger.info("Header has %i columns", len(row))
            for line in f:
                # skip comments
                line = line.decode().strip()
                if line.startswith('#'):
                    continue
                (tax_num, gene_num, symbol, locustag, synonyms, xrefs, chrom,
                 map_loc, desc, gtype, authority_symbol, name,
                 nomenclature_status, other_designations, modification_date,
                 feature_type) = line.split('\t')

                # test mode filters on gene ids, normal mode on taxon ids
                if self.testMode and int(gene_num) not in self.gene_ids:
                    continue

                if not self.testMode and int(tax_num) not in self.tax_ids:
                    continue

                line_counter += 1

                gene_id = ':'.join(('NCBIGene', gene_num))
                tax_id = ':'.join(('NCBITaxon', tax_num))
                gene_type_id = self.map_type_of_gene(gtype.strip())

                if symbol == 'NEWENTRY':
                    label = None
                else:
                    label = symbol
                # sequence feature, not a gene
                if gene_type_id == 'SO:0000110':
                    self.class_or_indiv[gene_id] = 'I'
                else:
                    self.class_or_indiv[gene_id] = 'C'

                # past the limit we keep reading (to fill class_or_indiv)
                # but add nothing more to the graph
                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    continue

                if self.class_or_indiv[gene_id] == 'C':
                    model.addClassToGraph(gene_id, label, gene_type_id, desc)
                    # NCBI will be the default leader,
                    # so we will not add the leader designation here.
                else:
                    model.addIndividualToGraph(gene_id, label, gene_type_id,
                                               desc)
                    # in this case, they aren't genes.
                    # so we want someone else to be the leader.

                if name != '-':
                    model.addSynonym(gene_id, name)
                if synonyms.strip() != '-':
                    for s in synonyms.split('|'):
                        model.addSynonym(
                            gene_id, s.strip(),
                            Assoc.annotation_properties['hasRelatedSynonym'])
                if other_designations.strip() != '-':
                    for s in other_designations.split('|'):
                        model.addSynonym(
                            gene_id, s.strip(),
                            Assoc.annotation_properties['hasRelatedSynonym'])
                if xrefs.strip() != '-':
                    self._add_gene_equivalencies(xrefs, gene_id, tax_num)

                # edge cases of id | symbol | chr | map_loc:
                # 263     AMD1P2    X|Y  with   Xq28 and Yq12
                # 438     ASMT      X|Y  with   Xp22.3 or Yp11.3    # in PAR
                # no idea why there's two bands listed - possibly 2 assemblies
                # 419     ART3      4    with   4q21.1|4p15.1-p14
                # 28227   PPP2R3B   X|Y  Xp22.33; Yp11.3            # in PAR
                # this is of "unknown" type == susceptibility
                # 619538  OMS     10|19|3 10q26.3;19q13.42-q13.43;3p25.3
                # unlocated scaffold
                # 101928066       LOC101928066    1|Un    -\
                # mouse --> 2C3
                # 11435   Chrna1  2       2 C3|2 43.76 cM
                # mouse --> 11B1.1
                # 11548   Adra1b  11      11 B1.1|11 25.81 cM
                # 11717   Ampd3   7       7 57.85 cM|7 E2-E3        # mouse
                # 14421   B4galnt1        10      10 D3|10 74.5 cM  # mouse
                # 323212  wu:fb92e12      19|20   -                 # fish
                # 323368  ints10  6|18    -                         # fish
                # 323666  wu:fc06e02      11|23   -                 # fish

                # feel that the chr placement can't be trusted in this table
                # when there is > 1 listed
                # with the exception of human X|Y,
                # we will only take those that align to one chr

                # FIXME remove the chr mapping below
                # when we pull in the genomic coords
                if chrom not in ('-', ''):
                    if '|' in chrom and chrom not in ['X|Y', 'X; Y']:
                        # means that there's uncertainty in the mapping.
                        # so skip it
                        # TODO we'll need to figure out how to deal with
                        # >1 loc mapping
                        # NOTE: this skips the rest of the row, including
                        # the taxon assertion at the bottom of the loop
                        logger.info(
                            '%s is non-uniquely mapped to %s.' +
                            ' Skipping for now.', gene_id, chrom)
                        continue
                        # X|Y	Xp22.33;Yp11.3

                    if chrom == 'X; Y':
                        chrom = 'X|Y'  # rewrite the PAR regions for processing

                    # the band pattern only depends on map_loc,
                    # so match it once per row
                    # TODO we probably need a different regex per organism
                    band_match = re.match(r'[0-9A-Z]+[pq](\d+)?(\.\d+)?$',
                                          map_loc)

                    # do this in a loop to allow PAR regions like X|Y
                    for c in chrom.split('|'):
                        # assume that the chromosome label is added elsewhere
                        geno.addChromosomeClass(c, tax_id, None)
                        mychrom = makeChromID(c, tax_num, 'CHR')
                        # temporarily use taxnum for the disambiguating label
                        mychrom_syn = makeChromLabel(c, tax_num)
                        model.addSynonym(mychrom, mychrom_syn)
                        if band_match is not None:
                            # this matches the regular kind of chrs,
                            # so make that kind of band
                            # the map_loc already has the chromosome
                            # in it, strip it first (escaped, since the
                            # chromosome name is data, not a pattern)
                            bid = re.sub(r'^' + re.escape(c), '', map_loc)
                            # the generic location (no coordinates)
                            maploc_id = makeChromID(c + bid, tax_num, 'CHR')
                            # Assume it's type will be added elsewhere
                            band = Feature(g, maploc_id, None, None)
                            band.addFeatureToGraph()
                            # add the band as the containing feature
                            g.addTriple(
                                gene_id,
                                Feature.object_properties['is_subsequence_of'],
                                maploc_id)
                        else:
                            # TODO handle these cases: examples are:
                            # 15q11-q22,Xp21.2-p11.23,15q22-qter,10q11.1-q24,
                            # 12p13.3-p13.2|12p13-p12,1p13.3|1p21.3-p13.1,
                            # 12cen-q21,22q13.3|22q13.3
                            logger.debug('not regular band pattern for %s: %s',
                                         gene_id, map_loc)
                            # add the gene as a subsequence of the chromosome
                            g.addTriple(
                                gene_id,
                                Feature.object_properties['is_subsequence_of'],
                                mychrom)

                geno.addTaxon(tax_id, gene_id)

        return
Пример #7
0
    def process_gaf(self, file, limit, id_map=None):
        """
        Read a gzipped, tab-delimited GAF gene-association file and add a
        gene-to-GO association for every usable row.  For rows with IMP
        evidence and a non-empty 'with/from' column, gene-to-phenotype
        associations are derived as well.

        Rows are skipped when a required field is empty, when the qualifier
        contains NOT, or when a UniProtKB id cannot be mapped to exactly one
        gene through id_map.

        :param file: path of the gzipped GAF file.
        :param limit: row cap applied when not in test mode; None disables it.
        :param id_map: optional dict from UniProtKB accession to a list of
            gene ids; without it, all UniProtKB rows are skipped.
        :return: None
        """

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        model = Model(g)
        geno = Genotype(g)
        logger.info("Processing Gene Associations from %s", file)
        line_counter = 0

        # helper sources used to mint reagent-targeted gene ids; they are
        # created up front for the matching taxon, and otherwise lazily on
        # first use so that they are never referenced while unbound
        zfin = None
        wbase = None
        if 7955 in self.tax_ids:
            zfin = ZFIN(self.graph_type, self.are_bnodes_skized)
        elif 6239 in self.tax_ids:
            wbase = WormBase(self.graph_type, self.are_bnodes_skized)

        # GO aspect letter -> relationship; the same for every row
        aspect_rel_map = {
            'P': model.object_properties['involved_in'],  # involved in
            'F': model.object_properties['enables'],  # enables
            'C': model.object_properties['part_of']  # part of
        }

        with gzip.open(file, 'rb') as csvfile:
            filereader = csv.reader(io.TextIOWrapper(csvfile, newline=""),
                                    delimiter='\t',
                                    quotechar='\"')
            for row in filereader:
                line_counter += 1
                # comments start with exclamation
                if re.match(r'!', ''.join(row)):
                    continue
                (db, gene_num, gene_symbol, qualifier, go_id, ref, eco_symbol,
                 with_or_from, aspect, gene_name, gene_synonym, object_type,
                 taxon, date, assigned_by, annotation_extension,
                 gene_product_form_id) = row

                # test for required fields
                if (db == '' or gene_num == '' or gene_symbol == ''
                        or go_id == '' or ref == '' or eco_symbol == ''
                        or aspect == '' or object_type == '' or taxon == ''
                        or date == '' or assigned_by == ''):
                    # the row text is passed as an argument so that a '%'
                    # in the data cannot break the log formatting
                    logger.error(
                        "Missing required part of annotation on row %d:\n%s",
                        line_counter, '\t'.join(row))
                    continue

                # deal with qualifier NOT, contributes_to, colocalizes_with
                if re.search(r'NOT', qualifier):
                    continue

                db = self.clean_db_prefix(db)
                uniprotid = None
                gene_id = None
                if db == 'UniProtKB':
                    # only usable when the accession maps to exactly one gene
                    mapped_ids = None
                    if id_map is not None:
                        mapped_ids = id_map.get(gene_num)
                    if mapped_ids is None or len(mapped_ids) != 1:
                        continue
                    gene_id = mapped_ids[0]
                    uniprotid = ':'.join((db, gene_num))
                    gene_num = re.sub(r'\w+\:', '', gene_id)
                elif db == 'MGI':
                    gene_num = re.sub(r'MGI:', '', gene_num)
                    gene_id = ':'.join((db, gene_num))
                    gene_id = re.sub(r'MGI\:MGI\:', 'MGI:', gene_id)
                else:
                    gene_id = ':'.join((db, gene_num))

                # in test mode keep only NCBIGene ids that are test ids
                if self.testMode \
                        and not(
                            re.match(r'NCBIGene', gene_id) and
                            int(gene_num) in self.test_ids):
                    continue

                model.addClassToGraph(gene_id, gene_symbol)
                if gene_name != '':
                    model.addDescription(gene_id, gene_name)
                if gene_synonym != '':
                    for s in re.split(r'\|', gene_synonym):
                        model.addSynonym(gene_id, s.strip())
                if re.search(r'\|', taxon):
                    # TODO add annotations with >1 taxon
                    logger.info(">1 taxon (%s) on line %d.  skipping", taxon,
                                line_counter)
                else:
                    tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
                    geno.addTaxon(tax_id, gene_id)

                assoc = Assoc(g, self.name)

                assoc.set_subject(gene_id)
                assoc.set_object(go_id)

                eco_id = self.map_go_evidence_code_to_eco(eco_symbol)
                if eco_id is not None:
                    assoc.add_evidence(eco_id)

                refs = re.split(r'\|', ref)
                for r in refs:
                    r = r.strip()
                    if r != '':
                        prefix = re.split(r':', r)[0]
                        r = re.sub(prefix, self.clean_db_prefix(prefix), r)
                        r = re.sub(r'MGI\:MGI\:', 'MGI:', r)
                        reference = Reference(g, r)
                        if re.match(r'PMID', r):
                            ref_type = Reference.ref_types['journal_article']
                            reference.setType(ref_type)
                        reference.addRefToGraph()
                        assoc.add_source(r)

                # TODO add the source of the annotations from assigned by?

                if aspect not in aspect_rel_map:
                    logger.error("Aspect not recognized: %s", aspect)

                rel = aspect_rel_map.get(aspect)
                if aspect == 'F' and re.search(r'contributes_to', qualifier):
                    rel = model.object_properties['contributes_to']
                assoc.set_relationship(rel)
                if uniprotid is not None:
                    assoc.set_description('Mapped from ' + uniprotid)
                # object_type should be one of:
                # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
                # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
                # If the precise product type is unknown,
                # gene_product should be used

                assoc.add_association_to_graph()

                # Derive G2P Associations from IMP annotations
                # in version 2.1 Pipe will indicate 'OR'
                # and Comma will indicate 'AND'.
                # in version 2.0, multiple values are separated by pipes
                # where the pipe has been used to mean 'AND'
                if eco_symbol == 'IMP' and with_or_from != '':
                    withitems = re.split(r'\|', with_or_from)
                    phenotypeid = go_id + 'PHENOTYPE'
                    # create phenotype associations
                    for i in withitems:
                        if i == '' or \
                                re.match(
                                    r'(UniProtKB|WBPhenotype|InterPro|HGNC)',
                                    i):
                            logger.warning(
                                "Don't know what having a uniprot id " +
                                "in the 'with' column means of %s", uniprotid)
                            continue
                        i = re.sub(r'MGI\:MGI\:', 'MGI:', i)
                        i = re.sub(r'WB:', 'WormBase:', i)

                        # for worms and fish, they might give a RNAi or MORPH
                        # in these cases make a reagent-targeted gene
                        if re.search('MRPHLNO|CRISPR|TALEN', i):
                            if zfin is None:
                                zfin = ZFIN(
                                    self.graph_type, self.are_bnodes_skized)
                            targeted_gene_id = zfin.make_targeted_gene_id(
                                gene_id, i)
                            geno.addReagentTargetedGene(
                                i, gene_id, targeted_gene_id)
                            # TODO PYLINT why is this:
                            # Redefinition of assoc type from
                            # dipper.models.assoc.Association.Assoc to
                            # dipper.models.assoc.G2PAssoc.G2PAssoc
                            assoc = G2PAssoc(g, self.name, targeted_gene_id,
                                             phenotypeid)
                        elif re.search(r'WBRNAi', i):
                            if wbase is None:
                                wbase = WormBase(
                                    self.graph_type, self.are_bnodes_skized)
                            targeted_gene_id = \
                                wbase.make_reagent_targeted_gene_id(
                                    gene_id, i)
                            geno.addReagentTargetedGene(
                                i, gene_id, targeted_gene_id)
                            assoc = G2PAssoc(g, self.name, targeted_gene_id,
                                             phenotypeid)
                        else:
                            assoc = G2PAssoc(g, self.name, i, phenotypeid)
                        for r in refs:
                            r = r.strip()
                            if r != '':
                                prefix = re.split(r':', r)[0]
                                r = re.sub(prefix,
                                           self.clean_db_prefix(prefix), r)
                                r = re.sub(r'MGI\:MGI\:', 'MGI:', r)
                                assoc.add_source(r)
                                # experimental phenotypic evidence
                                assoc.add_evidence("ECO:0000059")
                        assoc.add_association_to_graph()
                        # TODO should the G2PAssoc be
                        # the evidence for the GO assoc?

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

        return
Пример #8
0
    def _process_qtls_genomic_location(
            self, raw, txid, build_id, build_label, common_name, limit=None):
        """
        Read a gzipped, tab-delimited QTL location file (nine columns, the
        last being a ';'-separated key=value attribute string) and, for
        every QTL row:

        * add the QTL as an individual with its taxon,
        * add the publication, when a PUBMED_ID attribute is present,
        * add a QTL-to-trait association ('is marker for'), scored with
          the P-value attribute when that is parseable as a number,
        * add the QTL as a feature with fuzzy start/end positions on the
          chromosome of the given build.

        Lines starting with '#' are skipped, as are rows that lack a
        QTL_ID or trait_ID attribute.

        Example row (attribute column abbreviated):
        Chr.Z   Animal QTLdb    Production_QTL  33954873      34023581...
        QTL_ID=2242;Name="Spleen percentage";Abbrev="SPLP";
        PUBMED_ID=17012160;trait_ID=2234;"FlankMarkers=ADL0022";
        Significance="Significant";P-value="<0.05";F-Stat="5.52"

        :param raw: path of the gzipped input file.
        :param txid: NCBI taxon number as a string.
        :param build_id: identifier of the genome build.
        :param build_label: label of the genome build.
        :param common_name: species name used as prefix of the QTL curie.
        :param limit: row cap applied when not in test mode; None disables it.
        :return: None
        """
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        line_counter = 0
        geno = Genotype(graph)
        # assume that chrs get added to the genome elsewhere

        taxon_curie = 'NCBITaxon:' + txid
        eco_id = self.globaltt['quantitative trait analysis evidence']
        LOG.info("Processing QTL locations for %s from %s", taxon_curie, raw)
        with gzip.open(raw, 'rt', encoding='ISO-8859-1') as tsvfile:
            reader = csv.reader(tsvfile, delimiter="\t")
            for row in reader:
                line_counter += 1
                if re.match(r'^#', ' '.join(row)):
                    continue

                (chromosome, qtl_source, qtl_type, start_bp, stop_bp, frame, strand,
                 score, attr) = row

                # make dictionary of attributes
                # keys are:
                # QTL_ID,Name,Abbrev,PUBMED_ID,trait_ID,trait,FlankMarkers,
                # VTO_name,Map_Type,Significance,P-value,Model,
                # Test_Base,Variance, Bayes-value,PTO_name,gene_IDsrc,peak_cM,
                # CMO_name,gene_ID,F-Stat,LOD-score,Additive_Effect,
                # Dominance_Effect,Likelihood_Ratio,LS-means,Breed,
                # trait (duplicate with Name)

                # deal with poorly formed attributes: items without an '='
                # (such as a bare "FlankMarkers") are dropped, and only the
                # first '=' separates key from value
                attr_items = re.sub(r'"', '', attr).split(";")
                attribute_dict = dict(
                    item.split("=", 1) for item in attr_items if '=' in item)

                qtl_num = attribute_dict.get('QTL_ID')
                trait_num = attribute_dict.get('trait_ID')
                if qtl_num is None or trait_num is None:
                    LOG.warning(
                        "Skipping line %d of %s: no QTL_ID or trait_ID",
                        line_counter, raw)
                    continue

                if self.test_mode and int(qtl_num) not in self.test_ids:
                    continue
                # make association between QTL and trait based on taxon

                qtl_id = common_name + 'QTL:' + str(qtl_num)
                model.addIndividualToGraph(qtl_id, None, self.globaltt['QTL'])
                geno.addTaxon(taxon_curie, qtl_id)

                trait_id = 'AQTLTrait:' + trait_num

                # if pub is in attributes, add it to the association
                pub_id = None
                if 'PUBMED_ID' in attribute_dict:
                    pub_id = attribute_dict.get('PUBMED_ID')
                    if re.match(r'ISU.*', pub_id):
                        pub_id = 'AQTLPub:' + pub_id.strip()
                        reference = Reference(graph, pub_id)
                    else:
                        pub_id = 'PMID:' + pub_id.strip()
                        reference = Reference(
                            graph, pub_id, self.globaltt['journal article'])
                    reference.addRefToGraph()

                # Add QTL to graph
                assoc = G2PAssoc(
                    graph, self.name, qtl_id, trait_id,
                    self.globaltt['is marker for'])
                assoc.add_evidence(eco_id)
                # NOTE(review): pub_id is None when the row has no
                # PUBMED_ID - confirm add_source copes with that
                assoc.add_source(pub_id)
                if 'P-value' in attribute_dict:
                    # values look like '<0.05', possibly with a decimal
                    # comma; str.isnumeric() is False for any such decimal,
                    # so parse with float() instead
                    scr = attribute_dict.get('P-value')
                    scr = scr.replace('<', '').replace(',', '.').strip()
                    try:
                        p_value = float(scr)
                    except ValueError:
                        p_value = None
                    if p_value is not None:
                        assoc.set_score(p_value)

                assoc.add_association_to_graph()
                # TODO make association to breed
                # (which means making QTL feature in Breed background)

                # get location of QTL
                chromosome = re.sub(r'Chr\.', '', chromosome)
                chrom_id = makeChromID(chromosome, taxon_curie, 'CHR')

                chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
                geno.addChromosomeInstance(
                    chromosome, build_id, build_label, chrom_id)
                qtl_feature = Feature(graph, qtl_id, None, self.globaltt['QTL'])
                if start_bp == '':
                    start_bp = None
                qtl_feature.addFeatureStartLocation(
                    start_bp, chrom_in_build_id, strand,
                    [self.globaltt['FuzzyPosition']])
                if stop_bp == '':
                    stop_bp = None
                qtl_feature.addFeatureEndLocation(
                    stop_bp, chrom_in_build_id, strand,
                    [self.globaltt['FuzzyPosition']])
                qtl_feature.addTaxonToFeature(taxon_curie)
                qtl_feature.addFeatureToGraph()

                if not self.test_mode and limit is not None and line_counter > limit:
                    break

        LOG.info("Done with QTL genomic mappings for %s", taxon_curie)
        return
Пример #9
0
    def _parse_g2p_file(self, limit=None):
        """
        Parse gene to XPO file, currently custom for Monarch.

        For every data row the gene (prefixed with ``Xenbase:``) is added
        together with its label and taxon, and a gene-to-phenotype
        association is built from the OBJECT and RELATION columns.
        Evidence and source are attached only when their columns are
        non-empty; a source is also typed as a journal article.

        :param limit: when not in test mode, stop after the row on which the
            csv reader's line number exceeds this value; ``None`` reads
            the whole file
        :return: None
        """
        src_key = 'g2p_assertions'
        geno = Genotype(self.graph)
        model = Model(self.graph)

        columns = self.files[src_key]['columns']
        raw = '/'.join((self.rawdir, self.files[src_key]['file']))

        # The column layout is fixed for the whole file, so resolve each
        # field's position once instead of once per field per row.
        subject_idx = columns.index('SUBJECT')
        subject_label_idx = columns.index('SUBJECT_LABEL')
        subject_taxon_idx = columns.index('SUBJECT_TAXON')
        object_idx = columns.index('OBJECT')
        relation_idx = columns.index('RELATION')
        evidence_idx = columns.index('EVIDENCE')
        source_idx = columns.index('SOURCE')

        LOG.info("Processing Gene to XPO associations")

        with open(raw, 'r', encoding="utf8") as csvfile:
            reader = csv.reader(csvfile)

            # File has headers
            row = next(reader)
            if not self.check_fileheader(columns, row):
                # best effort: keep parsing, but leave a trace of the mismatch
                LOG.warning("Unexpected header in %s, parsing anyway", raw)

            for row in reader:
                # csv.reader yields an empty list for a blank line
                if not row:
                    continue

                gene = row[subject_idx]
                gene_label = row[subject_label_idx]
                gene_taxon = row[subject_taxon_idx]
                phenotype_curie = row[object_idx]
                relation = row[relation_idx]
                evidence = row[evidence_idx]
                source = row[source_idx]

                gene_curie = 'Xenbase:' + gene
                # the relation column uses '_' where a curie uses ':'
                relation_curie = relation.replace('_', ':')

                geno.addGene(gene_curie, gene_label)
                geno.addTaxon(gene_taxon, gene_curie)

                assoc = G2PAssoc(
                    self.graph,
                    self.name,
                    entity_id=gene_curie,
                    phenotype_id=phenotype_curie,
                    rel=relation_curie
                )

                if evidence:
                    assoc.add_evidence(evidence)

                if source:
                    model.addType(source, self.globaltt['journal article'])
                    assoc.add_source(source)

                assoc.add_association_to_graph()

                if not self.test_mode and limit is not None \
                        and reader.line_num > limit:
                    break
Пример #10
0
    def _get_gene_info(self, limit):
        """
        Read the gzipped gene_info file and add its genes to the graph.

        A gene is added as a class, unless its resolved type is
        ``sequence_feature``, in which case it is added as an individual;
        the choice is recorded in ``self.class_or_indiv``.  The full name,
        the synonyms and the other designations are added as synonyms, and
        cross references are handed to ``self._add_gene_equivalencies``.
        A gene mapped to a single chromosome (or to the X|Y pair) is made
        a subsequence of its chromosome band when ``map_location`` looks
        like a regular band, otherwise of the chromosome itself.

        In test mode only genes listed in ``self.gene_ids`` are kept;
        otherwise only genes whose taxon is in ``self.tax_ids`` are kept.

        :param limit: number of input lines after which nothing more is
            added to the graph (ignored in test mode); ``None`` = no limit
        :return: None

        """
        src_key = 'gene_info'
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph

        geno = Genotype(graph)
        model = Model(graph)

        # not unzipping the file
        LOG.info("Processing 'Gene Info' records")
        line_counter = 0
        gene_info = '/'.join((self.rawdir, self.files[src_key]['file']))
        LOG.info("FILE: %s", gene_info)
        LOG.info('Add taxa and genome classes for those in our filter')

        band_regex = re.compile(r'[0-9A-Z]+[pq](\d+)?(\.\d+)?$')
        for tax_num in self.tax_ids:
            tax_curie = ':'.join(('NCBITaxon', tax_num))
            # tax label can get added elsewhere
            geno.addGenome(tax_curie, tax_num)
            # label added elsewhere
            model.addClassToGraph(tax_curie, None)

        col = self.files[src_key]['columns']
        # The column layout is fixed for the whole file, so resolve each
        # field's position once rather than once per row.
        tax_idx = col.index('tax_id')
        gene_idx = col.index('GeneID')
        symbol_idx = col.index('Symbol')
        synonyms_idx = col.index('Synonyms')
        dbxrefs_idx = col.index('dbXrefs')
        chrom_idx = col.index('chromosome')
        map_loc_idx = col.index('map_location')
        desc_idx = col.index('description')
        gtype_idx = col.index('type_of_gene')
        name_idx = col.index('Full_name_from_nomenclature_authority')
        other_idx = col.index('Other_designations')
        LOG.info('Begin reading & parsing')

        with gzip.open(gene_info, 'rb') as tsv:
            row = tsv.readline().decode().strip().split('\t')
            row[0] = row[0][1:]  # strip comment char
            if not self.check_fileheader(col, row):
                # best effort: keep parsing, but leave a trace of the mismatch
                LOG.warning(
                    "Unexpected header in %s, parsing anyway", gene_info)

            for line in tsv:
                line_counter += 1
                # The file is opened in binary mode.  Decode before looking
                # at the first character: indexing bytes yields an int,
                # which can never compare equal to the string '#'.
                line = line.decode().strip()
                if not line or line[0] == '#':  # skip blanks and comments
                    continue
                row = line.split('\t')

                tax_num = row[tax_idx]
                gene_num = row[gene_idx]
                symbol = row[symbol_idx]
                synonyms = row[synonyms_idx].strip()
                dbxrefs = row[dbxrefs_idx].strip()
                chrom = row[chrom_idx].strip()
                map_loc = row[map_loc_idx].strip()
                desc = row[desc_idx]
                gtype = row[gtype_idx].strip()
                name = row[name_idx]
                other_designations = row[other_idx].strip()

                if self.test_mode and int(gene_num) not in self.gene_ids:
                    continue
                if not self.test_mode and tax_num not in self.tax_ids:
                    continue
                tax_curie = ':'.join(('NCBITaxon', tax_num))
                gene_id = ':'.join(('NCBIGene', gene_num))

                gene_type_id = self.resolve(gtype)

                if symbol == 'NEWENTRY':
                    label = None
                else:
                    label = symbol
                # sequence feature, not a gene
                if gene_type_id == self.globaltt['sequence_feature']:
                    self.class_or_indiv[gene_id] = 'I'
                else:
                    self.class_or_indiv[gene_id] = 'C'

                # Past the limit we keep scanning, so class_or_indiv above
                # is still filled in for every retained gene, but nothing
                # further is added to the graph.
                if not self.test_mode and limit is not None \
                        and line_counter > limit:
                    continue

                if self.class_or_indiv[gene_id] == 'C':
                    model.addClassToGraph(gene_id, label, gene_type_id, desc)
                    # NCBI will be the default leader (for non mods),
                    # so we will not add the leader designation here.
                else:
                    model.addIndividualToGraph(gene_id, label, gene_type_id,
                                               desc)
                    # in this case, they aren't genes.
                    # so we want someone else to be the leader

                if name != '-':
                    model.addSynonym(gene_id, name)

                if synonyms != '-':
                    for syn in synonyms.split('|'):
                        syn = syn.strip()
                        # unknown curies may occur here
                        if syn[:12] == 'AnimalQTLdb:' and \
                                tax_curie in self.informal_species:
                            syn = self.informal_species[
                                tax_curie] + 'QTL:' + syn[12:]
                            LOG.info('AnimalQTLdb: CHANGED to: %s', syn)
                        model.addSynonym(gene_id, syn,
                                         model.globaltt['has_related_synonym'])
                if other_designations != '-':
                    for syn in other_designations.split('|'):
                        model.addSynonym(gene_id, syn.strip(),
                                         model.globaltt['has_related_synonym'])

                if dbxrefs != '-':
                    self._add_gene_equivalencies(dbxrefs, gene_id, tax_curie)

                # edge cases of id | symbol | chr | map_loc:
                # 263     AMD1P2    X|Y  with   Xq28 and Yq12
                # 438     ASMT      X|Y  with   Xp22.3 or Yp11.3    # in PAR
                # no idea why there's two bands listed - possibly 2 assemblies
                # 419     ART3      4    with   4q21.1|4p15.1-p14
                # 28227   PPP2R3B   X|Y  Xp22.33; Yp11.3            # in PAR
                # this is of "unknown" type == susceptibility
                # 619538  OMS     10|19|3 10q26.3;19q13.42-q13.43;3p25.3
                # unlocated scaffold
                # 101928066       LOC101928066    1|Un    -\
                # mouse --> 2C3
                # 11435   Chrna1  2       2 C3|2 43.76 cM
                # mouse --> 11B1.1
                # 11548   Adra1b  11      11 B1.1|11 25.81 cM
                # 11717   Ampd3   7       7 57.85 cM|7 E2-E3        # mouse
                # 14421   B4galnt1        10      10 D3|10 74.5 cM  # mouse
                # 323212  wu:fb92e12      19|20   -                 # fish
                # 323368  ints10  6|18    -                         # fish
                # 323666  wu:fc06e02      11|23   -                 # fish

                # feel that the chr placement can't be trusted in this table
                # when there is > 1 listed
                # with the exception of human X|Y,
                # we will only take those that align to one chr

                # FIXME remove the chr mapping below
                # when we pull in the genomic coords
                if chrom != '-' and chrom != '':
                    if re.search(r'\|',
                                 chrom) and chrom not in ['X|Y', 'X; Y']:
                        # means that there's uncertainty in the mapping.
                        # so skip it
                        # TODO we'll need to figure out how to deal with
                        # >1 loc mapping
                        LOG.info(
                            '%s is non-uniquely mapped to %s. Skipping for now.',
                            gene_id, chrom)
                        # NOTE(review): this also skips the addTaxon call at
                        # the end of the loop body - confirm that is intended
                        continue

                    if chrom == 'X; Y':
                        chrom = 'X|Y'  # rewrite the PAR regions for processing
                    # do this in a loop to allow PAR regions like X|Y
                    for chromosome in re.split(r'\|', chrom):
                        # assume that the chromosome label is added elsewhere
                        geno.addChromosomeClass(chromosome, tax_curie, None)
                        mychrom = makeChromID(chromosome, tax_num, 'CHR')
                        # temporarily use taxnum for the disambiguating label
                        mychrom_syn = makeChromLabel(chromosome, tax_num)
                        model.addSynonym(mychrom, mychrom_syn)

                        band_match = re.match(band_regex, map_loc)
                        if band_match is not None and len(
                                band_match.groups()) > 0:
                            # this matches the regular kind of chrs,
                            # so make that kind of band
                            # not sure why this matches?
                            #   chrX|Y or 10090chr12|Un"
                            # TODO we probably need a different regex
                            # per organism
                            # the maploc_id already has the numeric chromosome
                            # in it, strip it first; the name is escaped so
                            # that it is matched literally
                            bid = re.sub(
                                r'^' + re.escape(chromosome), '', map_loc)
                            # the generic location (no coordinates)
                            maploc_id = makeChromID(chromosome + bid, tax_num,
                                                    'CHR')
                            # Assume it's type will be added elsewhere
                            band = Feature(graph, maploc_id, None, None)
                            band.addFeatureToGraph()
                            # add the band as the containing feature
                            graph.addTriple(gene_id,
                                            self.globaltt['is subsequence of'],
                                            maploc_id)
                        else:
                            # TODO handle these cases: examples are:
                            # 15q11-q22,Xp21.2-p11.23,15q22-qter,10q11.1-q24,
                            # 12p13.3-p13.2|12p13-p12,1p13.3|1p21.3-p13.1,
                            # 12cen-q21,22q13.3|22q13.3
                            LOG.debug('not regular band pattern for %s: %s',
                                      gene_id, map_loc)
                            # add the gene as a subsequence of the chromosome
                            graph.addTriple(gene_id,
                                            self.globaltt['is subsequence of'],
                                            mychrom)

                geno.addTaxon(tax_curie, gene_id)
Пример #11
0
    def parse(self, limit=None):
        """
        Query MouseMine for MP-term annotations and add them to the graph.

        The MGI identifier space is walked in 90 slices, one per two-digit
        prefix ``MGI:10`` .. ``MGI:99``.  Every returned row types the
        subject as a gene, assigns it the taxon ``self.txid`` and adds a
        gene-to-phenotype association carrying experimental phenotypic
        evidence; a PubMed reference is attached when the row has one.

        :param limit: stop after this many id prefixes have been queried;
            ``None`` (or 0) queries all of them
        :return: None
        """
        model = Model(self.graph)
        geno = Genotype(self.graph)

        # Neither the service handle nor the logger levels depend on the
        # prefix being queried, so set them up once for all queries.
        service = Service("http://www.mousemine.org/mousemine/service")
        logging.getLogger('Model').setLevel(logging.ERROR)
        logging.getLogger('JSONIterator').setLevel(logging.ERROR)

        count = 0
        for num in range(10, 100):
            fuzzy_gene = "MGI:{0}*".format(num)
            gene = "MGI:{0}".format(num)
            query = service.new_query("OntologyAnnotation")
            query.add_constraint("subject", "SequenceFeature")
            query.add_constraint("ontologyTerm", "MPTerm")
            query.add_view("subject.primaryIdentifier", "subject.symbol",
                           "subject.sequenceOntologyTerm.name",
                           "ontologyTerm.identifier", "ontologyTerm.name",
                           "evidence.publications.pubMedId",
                           "evidence.comments.type",
                           "evidence.comments.description")
            query.add_constraint("subject.organism.taxonId",
                                 "=",
                                 self.txid,
                                 code="A")
            query.add_constraint("subject", "LOOKUP", fuzzy_gene, code="B")
            query.add_constraint("subject.primaryIdentifier",
                                 "CONTAINS",
                                 gene,
                                 code="C")
            query.outerjoin("evidence.comments")

            for row in query.rows():
                mgi_curie = row["subject.primaryIdentifier"]
                mp_curie = row["ontologyTerm.identifier"]
                pubmed_id = row["evidence.publications.pubMedId"]

                model.addType(mgi_curie, self.globaltt['gene'])
                geno.addTaxon('NCBITaxon:' + self.txid, mgi_curie)

                assoc = G2PAssoc(self.graph, self.name, mgi_curie, mp_curie)
                if pubmed_id:
                    pub_curie = "PMID:{0}".format(pubmed_id)
                    reference = Reference(self.graph, pub_curie,
                                          self.globaltt['journal article'])
                    reference.addRefToGraph()
                    assoc.add_source(pub_curie)

                assoc.add_evidence(
                    self.globaltt['experimental phenotypic evidence'])
                assoc.add_association_to_graph()

            count += 1
            # Report progress every ten prefixes, naming the prefixes that
            # were actually queried rather than the loop counter.
            if not count % 10:
                LOG.info("%s processed ids from MGI:%i* to MGI:%i*",
                         datetime.datetime.now(), num - 9, num)

            if limit and count >= limit:
                break
Пример #12
0
    def _process_haplotype(self, hap_id, hap_label, chrom_num, chrom_pos,
                           context, risk_allele_frequency, mapped_gene,
                           so_ontology):
        """
        Add a haplotype, its component SNPs and their genes to the graph.

        ``hap_label``, ``chrom_num``, ``chrom_pos``, ``context`` and
        ``mapped_gene`` are ';'-separated lists.  Every SNP label becomes a
        variant part of the haplotype.  SNP details are only added when the
        chromosome, position and context lists have one entry per SNP, and
        SNPs are only tied to genes when there is one mapped gene per SNP.

        :param hap_id: identifier of the haplotype
        :param hap_label: ';'-separated SNP labels; also the haplotype label
        :param chrom_num: ';'-separated chromosomes, one per SNP
        :param chrom_pos: ';'-separated positions, one per SNP
        :param context: ';'-separated context terms, one per SNP
        :param risk_allele_frequency: used as the description unless it
            is '' or 'NR'
        :param mapped_gene: ';'-separated gene symbols
        :param so_ontology: object whose ``query`` method accepts the
            SPARQL string built here
        :return: None
        """
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        geno = Genotype(graph)
        model = Model(graph)
        # add the feature to the graph
        hap_description = None
        if risk_allele_frequency not in ['', 'NR']:
            hap_description = str(
                risk_allele_frequency) + ' [risk allele frequency]'

        model.addIndividualToGraph(hap_id, hap_label.strip(),
                                   self.globaltt['haplotype'], hap_description)
        geno.addTaxon(self.globaltt["Homo sapiens"], hap_id)

        snp_labels = re.split(r';\s?', hap_label)
        chrom_nums = re.split(r';\s?', chrom_num)
        chrom_positions = re.split(r';\s?', chrom_pos)
        context_list = re.split(r';\s?', context)
        mapped_genes = re.split(r';\s?', mapped_gene)

        # Not having four "PAX5" as a list might be better,
        # but making the lists unique breaks unit tests

        snp_curies = list()

        for snp in snp_labels:
            snp_curie, snp_type = self._get_curie_and_type_from_id(snp)
            if snp_type is None:
                LOG.info('cant find type for SNP in %s', snp)
                # make blank node
                snp_curie = self.make_id(snp, "_")
                model.addLabel(snp_curie, snp)
            elif snp_curie[0] == '_':  # arrived an unlabeled blanknode
                model.addLabel(snp_curie, snp)

            graph.addTriple(hap_id, self.globaltt['has_variant_part'],
                            snp_curie)
            snp_curies.append(snp_curie)

        # courtesy http://stackoverflow.com/a/16720915
        # the per-SNP lists must line up one-to-one with the SNPs
        length = len(snp_curies)
        if not all(
                len(lst) == length for lst in
                [snp_labels, chrom_nums, chrom_positions, context_list]):
            LOG.warning(
                "Incongruous data field(s) for haplotype %s \n "
                "will not add snp details", hap_label)
            return

        # Whether genes pair up with SNPs is the same for every SNP,
        # so decide (and warn) once instead of once per SNP.
        genes_misaligned = len(mapped_genes) != len(snp_labels)
        if genes_misaligned:
            LOG.warning("More mapped genes than snps,"
                        " cannot disambiguate for\n%s\n%s",
                        mapped_genes, snp_labels)

        flanking_contexts = (
            'upstream_gene_variant', 'downstream_gene_variant')
        variant_in_gene_count = 0
        for index, snp_curie in enumerate(snp_curies):
            snp_context = context_list[index]
            self._add_snp_to_graph(snp_curie, snp_labels[index],
                                   chrom_nums[index],
                                   chrom_positions[index],
                                   snp_context)

            if genes_misaligned:
                continue

            so_class = self.resolve(snp_context)
            so_query = """
        SELECT ?variant_label
        WHERE {{
            {0} rdfs:subClassOf+ {1} ;
            rdfs:label ?variant_label .
        }}
            """.format(so_class, self.globaltt['gene_variant'])

            query_result = so_ontology.query(so_query)

            gene_id = DipperUtil.get_hgnc_id_from_symbol(mapped_genes[index])

            # need both a known gene and exactly one row from the query
            if gene_id is None or len(list(query_result)) != 1:
                continue

            if snp_context in flanking_contexts:
                # relate the SNP to the gene with the context itself
                graph.addTriple(snp_curie, so_class, gene_id)
            else:
                geno.addAffectedLocus(snp_curie, gene_id)
                variant_in_gene_count += 1

        # Seperate in case we want to apply a different relation
        # If not this is redundant with triples added above
        if len(mapped_genes) == variant_in_gene_count and \
                len(set(mapped_genes)) == 1:
            gene_id = DipperUtil.get_hgnc_id_from_symbol(mapped_genes[0])
            geno.addAffectedLocus(hap_id, gene_id)
Пример #13
0
    def _process_genes(self, taxid, limit=None):
        """
        Add the Ensembl genes of one taxon, and their products, to the graph.

        Each tab-separated row must hold, in order: ensembl gene id,
        external gene name, description, gene biotype, entrez gene id,
        peptide id and uniprot/swissprot id.  For taxon '9606' an eighth
        column holds the HGNC id.  Genes are added as classes, made
        equivalent to their NCBIGene (and HGNC) ids, and linked to their
        peptide and UniProtKB products.

        In test mode, rows whose entrez gene id is present but not in
        ``self.gene_ids`` are skipped.

        :param taxid: taxon number as a string; key into ``self.files``
        :param limit: when not in test mode, stop once more than this many
            rows have been processed; ``None`` reads the whole file
        :raises ValueError: when a row has too few columns
        :return: None
        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        model = Model(g)
        geno = Genotype(g)

        raw = '/'.join((self.rawdir, self.files[taxid]['file']))
        line_counter = 0
        # in the case of human genes, we also get the hgnc id,
        # and is the last col
        is_human = taxid == '9606'
        min_columns = 8 if is_human else 7
        logger.info("Processing Ensembl genes for tax %s", taxid)
        # the same type is used for every product; look it up only once
        polypeptide_type = self._get_gene_type("polypeptide")
        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t')
            for row in filereader:
                # every column unpacked below has to be there
                if len(row) < min_columns:
                    raise ValueError("Data error for file %s" % raw)
                (ensembl_gene_id, external_gene_name,
                 description, gene_biotype, entrezgene,
                 peptide_id, uniprot_swissprot) = row[0:7]

                hgnc_id = row[7] if is_human else None

                if self.testMode and entrezgene != '' \
                        and int(entrezgene) not in self.gene_ids:
                    continue

                line_counter += 1
                gene_id = 'ENSEMBL:' + ensembl_gene_id
                peptide_curie = 'ENSEMBL:{}'.format(peptide_id)
                uniprot_curie = 'UniProtKB:{}'.format(uniprot_swissprot)
                entrez_curie = 'NCBIGene:{}'.format(entrezgene)

                if description == '':
                    description = None
                # the gene biotype is deliberately not used for typing
                gene_type_id = None
                model.addClassToGraph(
                    gene_id, external_gene_name, gene_type_id, description)
                # only create product nodes for ids that are really there;
                # an empty id would yield a bare 'ENSEMBL:'/'UniProtKB:' node
                if peptide_id != '':
                    model.addIndividualToGraph(
                        peptide_curie, None, polypeptide_type)
                if uniprot_swissprot != '':
                    model.addIndividualToGraph(
                        uniprot_curie, None, polypeptide_type)

                if entrezgene != '':
                    model.addEquivalentClass(gene_id, entrez_curie)
                if hgnc_id is not None and hgnc_id != '':
                    model.addEquivalentClass(gene_id, hgnc_id)
                geno.addTaxon('NCBITaxon:'+taxid, gene_id)
                if peptide_id != '':
                    geno.addGeneProduct(gene_id, peptide_curie)
                    if uniprot_swissprot != '':
                        geno.addGeneProduct(gene_id, uniprot_curie)
                        model.addXref(peptide_curie, uniprot_curie)

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break
Пример #14
0
    def _process_haplotype(self, hap_id, hap_label, chrom_num, chrom_pos,
                           context, risk_allele_frequency, mapped_gene,
                           so_ontology):

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        geno = Genotype(graph)
        model = Model(graph)
        # add the feature to the graph
        hap_description = None
        if risk_allele_frequency != '' and risk_allele_frequency != 'NR':
            hap_description = str(
                risk_allele_frequency) + ' [risk allele frequency]'

        model.addIndividualToGraph(hap_id, hap_label.strip(),
                                   self.globaltt['haplotype'], hap_description)
        geno.addTaxon(self.globaltt["H**o sapiens"], hap_id)

        snp_labels = re.split(r';\s?', hap_label)
        chrom_nums = re.split(r';\s?', chrom_num)
        chrom_positions = re.split(r';\s?', chrom_pos)
        context_list = re.split(r';\s?', context)
        mapped_genes = re.split(r';\s?', mapped_gene)
        snp_curies = list()

        for index, snp in enumerate(snp_labels):
            snp_curie, snp_type = self._get_curie_and_type_from_id(snp)
            if snp_type is None:
                # make blank node
                snp_curie = self.make_id(snp, "_")

            graph.addTriple(hap_id, self.globaltt['has_variant_part'],
                            snp_curie)
            snp_curies.append(snp_curie)

        # courtesy http://stackoverflow.com/a/16720915
        length = len(snp_labels)
        if not all(
                len(lst) == length
                for lst in [chrom_nums, chrom_positions, context_list]):
            LOG.warning(
                "Unexpected data field for haplotype %s \n "
                "will not add snp details", hap_label)
            return

        variant_in_gene_count = 0
        for index, snp_curie in enumerate(snp_curies):
            self._add_snp_to_graph(snp_curie, snp_labels[index],
                                   chrom_nums[index], chrom_positions[index],
                                   context_list[index])

            if len(mapped_genes) == len(snp_labels):
                so_class = self.resolve(context_list[index])
                # removed the '+' for recursive  one-or-more rdfs:subClassOf  paths
                # just so it did not return an empty graph
                so_query = """
SELECT ?variant_label
    WHERE {{
        {0} rdfs:subClassOf {1} ;
        rdfs:label ?variant_label .
    }}
                """.format(so_class, self.globaltt['gene_variant'])

                query_result = so_ontology.query(so_query)

                if len(list(query_result)) == 1:
                    gene_id = DipperUtil.get_ncbi_id_from_symbol(
                        mapped_genes[index])

                    if gene_id is not None:
                        geno.addAffectedLocus(snp_curie, gene_id)
                        geno.addAffectedLocus(hap_id, gene_id)
                        variant_in_gene_count += 1

                gene_id = DipperUtil.get_ncbi_id_from_symbol(
                    mapped_genes[index])
                if gene_id is not None:
                    graph.addTriple(snp_curie,
                                    self.resolve(context_list[index]), gene_id)

            else:
                LOG.warning(
                    "More mapped genes than snps, cannot disambiguate for %s",
                    hap_label)

        # Seperate in case we want to apply a different relation
        # If not this is redundant with triples added above
        if len(mapped_genes) == variant_in_gene_count and len(
                set(mapped_genes)) == 1:
            gene_id = DipperUtil.get_ncbi_id_from_symbol(mapped_genes[0])
            geno.addAffectedLocus(hap_id, gene_id)

        return
Пример #15
0
    def process_gaf(self, file, limit, id_map=None):

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        model = Model(g)
        geno = Genotype(g)
        logger.info("Processing Gene Associations from %s", file)
        line_counter = 0

        if 7955 in self.tax_ids:
            zfin = ZFIN(self.graph_type, self.are_bnodes_skized)
        elif 6239 in self.tax_ids:
            wbase = WormBase(self.graph_type, self.are_bnodes_skized)

        with gzip.open(file, 'rb') as csvfile:
            filereader = csv.reader(io.TextIOWrapper(csvfile, newline=""),
                                    delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                # comments start with exclamation
                if re.match(r'!', ''.join(row)):
                    continue
                (db, gene_num, gene_symbol, qualifier, go_id, ref, eco_symbol,
                 with_or_from, aspect, gene_name, gene_synonym, object_type,
                 taxon, date, assigned_by, annotation_extension,
                 gene_product_form_id) = row

                # test for required fields
                if (db == '' or gene_num == '' or gene_symbol == '' or
                        go_id == '' or ref == '' or eco_symbol == '' or
                        aspect == '' or object_type == '' or taxon == '' or
                        date == '' or assigned_by == ''):
                    logger.error(
                        "Missing required part of annotation " +
                        "on row %d:\n"+'\t'.join(row),
                        line_counter)
                    continue

                # deal with qualifier NOT, contributes_to, colocalizes_with
                if re.search(r'NOT', qualifier):
                    continue

                db = self.clean_db_prefix(db)
                uniprotid = None
                gene_id = None
                if db == 'UniProtKB':
                    mapped_ids = id_map.get(gene_num)
                    if id_map is not None and mapped_ids is not None:
                        if len(mapped_ids) == 1:
                            gene_id = mapped_ids[0]
                            uniprotid = ':'.join((db, gene_num))
                            gene_num = re.sub(r'\w+\:', '', gene_id)
                        elif len(mapped_ids) > 1:
                            # logger.warning(
                            #   "Skipping gene id mapped for >1 gene %s -> %s",
                            #    gene_num, str(mapped_ids))
                            continue
                    else:
                        continue
                elif db == 'MGI':
                    gene_num = re.sub(r'MGI:', '', gene_num)
                    gene_id = ':'.join((db, gene_num))
                    gene_id = re.sub(r'MGI\:MGI\:', 'MGI:', gene_id)
                else:
                    gene_id = ':'.join((db, gene_num))

                if self.testMode \
                        and not(
                            re.match(r'NCBIGene', gene_id) and
                            int(gene_num) in self.test_ids):
                    continue

                model.addClassToGraph(gene_id, gene_symbol)
                if gene_name != '':
                    model.addDescription(gene_id, gene_name)
                if gene_synonym != '':
                    for s in re.split(r'\|', gene_synonym):
                        model.addSynonym(gene_id, s.strip())
                if re.search(r'\|', taxon):
                    # TODO add annotations with >1 taxon
                    logger.info(">1 taxon (%s) on line %d.  skipping", taxon,
                                line_counter)
                else:
                    tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
                    geno.addTaxon(tax_id, gene_id)

                assoc = Assoc(g, self.name)

                assoc.set_subject(gene_id)
                assoc.set_object(go_id)

                eco_id = self.map_go_evidence_code_to_eco(eco_symbol)
                if eco_id is not None:
                    assoc.add_evidence(eco_id)

                refs = re.split(r'\|', ref)
                for r in refs:
                    r = r.strip()
                    if r != '':
                        prefix = re.split(r':', r)[0]
                        r = re.sub(prefix, self.clean_db_prefix(prefix), r)
                        r = re.sub(r'MGI\:MGI\:', 'MGI:', r)
                        ref = Reference(g, r)
                        if re.match(r'PMID', r):
                            ref_type = Reference.ref_types['journal_article']
                            ref.setType(ref_type)
                        ref.addRefToGraph()
                        assoc.add_source(r)

                # TODO add the source of the annotations from assigned by?

                aspect_rel_map = {
                    'P': model.object_properties['involved_in'],  # involved in
                    'F': model.object_properties['enables'],  # enables
                    'C': model.object_properties['part_of']  # part of
                }

                if aspect not in aspect_rel_map:
                    logger.error("Aspect not recognized: %s", aspect)

                rel = aspect_rel_map.get(aspect)
                if aspect == 'F' and re.search(r'contributes_to', qualifier):
                    rel = model.object_properties['contributes_to']
                assoc.set_relationship(rel)
                if uniprotid is not None:
                    assoc.set_description('Mapped from '+uniprotid)
                # object_type should be one of:
                # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
                # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
                # If the precise product type is unknown,
                # gene_product should be used

                assoc.add_association_to_graph()

                # Derive G2P Associations from IMP annotations
                # in version 2.1 Pipe will indicate 'OR'
                # and Comma will indicate 'AND'.
                # in version 2.0, multiple values are separated by pipes
                # where the pipe has been used to mean 'AND'
                if eco_symbol == 'IMP' and with_or_from != '':
                    withitems = re.split(r'\|', with_or_from)
                    phenotypeid = go_id+'PHENOTYPE'
                    # create phenotype associations
                    for i in withitems:
                        if i == '' or \
                                re.match(
                                    r'(UniProtKB|WBPhenotype|InterPro|HGNC)',
                                    i):
                            logger.warning(
                                "Don't know what having a uniprot id " +
                                "in the 'with' column means of %s",
                                uniprotid)
                            continue
                        i = re.sub(r'MGI\:MGI\:', 'MGI:', i)
                        i = re.sub(r'WB:', 'WormBase:', i)

                        # for worms and fish, they might give a RNAi or MORPH
                        # in these cases make a reagent-targeted gene
                        if re.search('MRPHLNO|CRISPR|TALEN', i):
                            targeted_gene_id = zfin.make_targeted_gene_id(
                                gene_id, i)
                            geno.addReagentTargetedGene(i, gene_id,
                                                        targeted_gene_id)
                            # TODO PYLINT why is this:
                            # Redefinition of assoc type from
                            # dipper.models.assoc.Association.Assoc to
                            # dipper.models.assoc.G2PAssoc.G2PAssoc
                            assoc = G2PAssoc(g, self.name, targeted_gene_id,
                                             phenotypeid)
                        elif re.search(r'WBRNAi', i):
                            targeted_gene_id = \
                                wbase.make_reagent_targeted_gene_id(
                                    gene_id, i)
                            geno.addReagentTargetedGene(
                                i, gene_id, targeted_gene_id)
                            assoc = G2PAssoc(
                                g, self.name, targeted_gene_id, phenotypeid)
                        else:
                            assoc = G2PAssoc(g, self.name, i, phenotypeid)
                        for r in refs:
                            r = r.strip()
                            if r != '':
                                prefix = re.split(r':', r)[0]
                                r = re.sub(
                                    prefix, self.clean_db_prefix(prefix), r)
                                r = re.sub(r'MGI\:MGI\:', 'MGI:', r)
                                assoc.add_source(r)
                                # experimental phenotypic evidence
                                assoc.add_evidence("ECO:0000059")
                        assoc.add_association_to_graph()
                        # TODO should the G2PAssoc be
                        # the evidence for the GO assoc?

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

        return
Пример #16
0
    def _process_data(self, raw, limit=None):
        """
        Parse the gzipped IMPC genotype-phenotype CSV and add it to the graph.

        For every data row this adds the marker and allele, the colony with
        its indeterminate-zygosity genotype, the sex-agnostic and
        sex-qualified genotypes (plus a genomic background when a strain
        accession is present), and finally a G2P association together with
        its evidence and provenance nodes.

        Rows without a marker accession are still processed: the allele is
        then used as the sequence alteration itself, and the marker is left
        out of the VSLC identifier.

        In test mode, rows whose marker accession is not in ``self.test_ids``
        are skipped and ``limit`` is ignored.

        :param raw: path of the gzip-compressed, comma-separated input file;
            its first row is treated as a header and skipped
        :param limit: when not None (and not in test mode), stop once the
            row counter exceeds this value
        :return: None

        """
        logger.info("Processing Data from %s", raw)

        g = self.testgraph if self.testMode else self.graph
        model = Model(g)
        geno = Genotype(g)
        line_counter = 0

        impc_map = self.open_and_parse_yaml(self.map_files['impc_map'])
        impress_map = json.loads(
            self.fetch_from_url(
                self.map_files['impress_map']).read().decode('utf-8'))

        # Add the taxon as a class
        taxon_id = 'NCBITaxon:10090'  # map to Mus musculus
        model.addClassToGraph(taxon_id, None)

        with gzip.open(raw, 'rt') as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            next(filereader, None)  # skip the header row
            for row in filereader:
                line_counter += 1

                # the row must have exactly these 28 columns,
                # otherwise the unpacking raises ValueError
                (marker_accession_id, marker_symbol, phenotyping_center,
                 colony, sex, zygosity, allele_accession_id, allele_symbol,
                 allele_name, strain_accession_id, strain_name, project_name,
                 project_fullname, pipeline_name, pipeline_stable_id,
                 procedure_stable_id, procedure_name, parameter_stable_id,
                 parameter_name, top_level_mp_term_id, top_level_mp_term_name,
                 mp_term_id, mp_term_name, p_value, percentage_change,
                 effect_size, statistical_method, resource_name) = row

                if self.testMode and marker_accession_id not in self.test_ids:
                    continue

                # ##### cleanup some of the identifiers ######
                zygosity_id = self._map_zygosity(zygosity)

                # colony ids sometimes have <> in them, spaces,
                # or other non-alphanumerics and break our system;
                # replace these with underscores
                colony_id = '_:' + re.sub(r'\W+', '_', colony)

                if not re.match(r'MGI', allele_accession_id):
                    allele_accession_id = \
                        '_:IMPC-' + re.sub(r':', '', allele_accession_id)

                if re.search(r'EUROCURATE', strain_accession_id):
                    # the eurocurate links don't resolve at IMPC
                    strain_accession_id = '_:' + strain_accession_id
                elif not re.match(r'MGI', strain_accession_id):
                    logger.info("Found a strange strain accession...%s",
                                strain_accession_id)
                    strain_accession_id = 'IMPC:' + strain_accession_id

                ######################
                # first, add the marker and variant to the graph as with MGI,
                # the allele is the variant locus.  IF the marker is not known,
                # we will call it a sequence alteration.  otherwise,
                # we will create a BNode for the sequence alteration.

                # extract out what's within the <> to get the symbol
                bracketed = re.match(r'.*<(.*)>', allele_symbol)
                if bracketed:
                    sequence_alteration_name = bracketed.group(1)
                else:
                    sequence_alteration_name = allele_symbol

                if marker_accession_id == '':
                    logger.warning("Marker unspecified on row %d",
                                   line_counter)
                    marker_accession_id = None

                if marker_accession_id is not None:
                    variant_locus_id = allele_accession_id
                    variant_locus_name = allele_symbol
                    variant_locus_type = geno.genoparts['variant_locus']
                    geno.addGene(marker_accession_id, marker_symbol,
                                 geno.genoparts['gene'])
                    geno.addAllele(variant_locus_id, variant_locus_name,
                                   variant_locus_type, None)
                    geno.addAlleleOfGene(variant_locus_id, marker_accession_id)

                    sequence_alteration_id = \
                        '_:seqalt' + re.sub(r':', '', allele_accession_id)
                    geno.addSequenceAlterationToVariantLocus(
                        sequence_alteration_id, variant_locus_id)
                else:
                    sequence_alteration_id = allele_accession_id

                # IMPC contains targeted mutations with either gene traps,
                # knockouts, insertion/intragenic deletions.
                # but I don't really know what the SeqAlt is here,
                # so I don't add it.
                geno.addSequenceAlteration(sequence_alteration_id,
                                           sequence_alteration_name)

                # #############    BUILD THE COLONY    #############
                # First, let's describe the colony that the animals come from
                # The Colony ID refers to the ES cell clone
                #   used to generate a mouse strain.
                # Terry sez: we use this clone ID to track
                #   ES cell -> mouse strain -> mouse phenotyping.
                # The same ES clone maybe used at multiple centers,
                # so we have to concatenate the two to have a unique ID.
                # some useful reading about generating mice from ES cells:
                # http://ki.mit.edu/sbc/escell/services/details

                # here, we'll make a genotype
                # that derives from an ES cell with a given allele.
                # the strain is not really attached to the colony.

                # the colony/clone is reflective of the allele,
                # with unknown zygosity
                stem_cell_class = 'ERO:0002002'
                model.addIndividualToGraph(colony_id, colony, stem_cell_class)

                # vslc of the colony has unknown zygosity
                # note that we will define the allele
                # (and it's relationship to the marker, etc.) later
                # FIXME is it really necessary to create this vslc
                # when we always know it's unknown zygosity?
                vslc_colony = '_:' + re.sub(
                    r':', '',
                    allele_accession_id + geno.zygosity['indeterminate'])
                vslc_colony_label = allele_symbol + '/<?>'
                # for ease of reading, we make the colony genotype variables.
                # in the future, it might be desired to keep the vslcs
                colony_genotype_id = vslc_colony
                colony_genotype_label = vslc_colony_label
                geno.addGenotype(colony_genotype_id, colony_genotype_label)
                geno.addParts(allele_accession_id, colony_genotype_id,
                              geno.object_properties['has_alternate_part'])
                geno.addPartsToVSLC(
                    vslc_colony, allele_accession_id, None,
                    geno.zygosity['indeterminate'],
                    geno.object_properties['has_alternate_part'])
                g.addTriple(colony_id, geno.object_properties['has_genotype'],
                            colony_genotype_id)

                # ##########    BUILD THE ANNOTATED GENOTYPE    ##########
                # now, we'll build the genotype of the individual that derives
                # from the colony/clone genotype that is attached to
                # phenotype = colony_id + strain + zygosity + sex
                # (and is derived from a colony)

                # this is a sex-agnostic genotype
                genotype_id = self.make_id(
                    colony_id + phenotyping_center + zygosity +
                    strain_accession_id)
                geno.addSequenceDerivesFrom(genotype_id, colony_id)

                # build the VSLC of the sex-agnostic genotype
                # based on the zygosity
                allele1_id = allele_accession_id
                allele2_id = allele2_rel = None
                allele1_label = allele_symbol
                # Making VSLC labels from the various parts,
                # can change later if desired.
                if zygosity == 'heterozygote':
                    allele2_label = re.sub(r'<.*', '<+>', allele1_label)
                elif zygosity == 'homozygote':
                    allele2_label = allele1_label
                    allele2_id = allele1_id
                    allele2_rel = geno.object_properties['has_alternate_part']
                elif zygosity == 'hemizygote':
                    allele2_label = re.sub(r'<.*', '<0>', allele1_label)
                elif zygosity == 'not_applicable':
                    allele2_label = re.sub(r'<.*', '<?>', allele1_label)
                else:
                    logger.warning("found unknown zygosity %s", zygosity)
                    # NOTE(review): this abandons the remainder of the file,
                    # not just this row - confirm that `break` (rather than
                    # `continue`) is intended.
                    break
                vslc_name = '/'.join((allele1_label, allele2_label))

                # Add the VSLC
                # The marker is None when the row did not specify one;
                # str.join() rejects None, so only the known parts are used.
                vslc_id = '-'.join(
                    part for part in
                    (marker_accession_id, allele_accession_id, zygosity)
                    if part is not None)
                vslc_id = '_:' + re.sub(r':', '', vslc_id)
                model.addIndividualToGraph(
                    vslc_id, vslc_name,
                    geno.genoparts['variant_single_locus_complement'])
                geno.addPartsToVSLC(
                    vslc_id, allele1_id, allele2_id, zygosity_id,
                    geno.object_properties['has_alternate_part'], allele2_rel)

                # add vslc to genotype
                geno.addVSLCtoParent(vslc_id, genotype_id)

                # note that the vslc is also the gvc
                model.addType(
                    vslc_id,
                    Genotype.genoparts['genomic_variation_complement'])

                # Add the genomic background
                # create the genomic background id and name
                if strain_accession_id != '':
                    genomic_background_id = strain_accession_id
                else:
                    genomic_background_id = None

                genotype_name = vslc_name
                if genomic_background_id is not None:
                    geno.addGenotype(genomic_background_id, strain_name,
                                     geno.genoparts['genomic_background'])

                    # make a phenotyping-center-specific strain
                    # to use as the background
                    pheno_center_strain_label = \
                        strain_name + '-' + phenotyping_center + '-' + colony
                    pheno_center_strain_id = '-'.join((
                        re.sub(r':', '', genomic_background_id),
                        re.sub(r'\s', '_', phenotyping_center),
                        re.sub(r'\W+', '', colony)))
                    if not re.match(r'^_', pheno_center_strain_id):
                        pheno_center_strain_id = '_:' + pheno_center_strain_id

                    geno.addGenotype(pheno_center_strain_id,
                                     pheno_center_strain_label,
                                     geno.genoparts['genomic_background'])
                    geno.addSequenceDerivesFrom(pheno_center_strain_id,
                                                genomic_background_id)

                    # Making genotype labels from the various parts,
                    # can change later if desired.
                    # since the genotype is reflective of the place
                    # it got made, should put that in to disambiguate
                    genotype_name = \
                        genotype_name + ' [' + pheno_center_strain_label + ']'
                    geno.addGenomicBackgroundToGenotype(
                        pheno_center_strain_id, genotype_id)
                    geno.addTaxon(taxon_id, pheno_center_strain_id)
                # this is redundant, but i'll keep in in for now
                geno.addSequenceDerivesFrom(genotype_id, colony_id)
                geno.addGenotype(genotype_id, genotype_name)

                # Make the sex-qualified genotype,
                # which is what the phenotype is associated with
                sex_qualified_genotype_id = self.make_id(
                    colony_id + phenotyping_center + zygosity +
                    strain_accession_id + sex)
                sex_qualified_genotype_label = genotype_name + ' (' + sex + ')'
                if sex == 'male':
                    sq_type_id = geno.genoparts['male_genotype']
                elif sex == 'female':
                    sq_type_id = geno.genoparts['female_genotype']
                else:
                    sq_type_id = geno.genoparts['sex_qualified_genotype']

                geno.addGenotype(sex_qualified_genotype_id,
                                 sex_qualified_genotype_label, sq_type_id)
                geno.addParts(genotype_id, sex_qualified_genotype_id,
                              geno.object_properties['has_alternate_part'])

                # genomic_background_id is either a non-empty string or None
                if genomic_background_id is not None:
                    # Add the taxon to the genomic_background_id
                    geno.addTaxon(taxon_id, genomic_background_id)
                else:
                    # add it as the genomic background
                    geno.addTaxon(taxon_id, genotype_id)

                # #############    BUILD THE G2P ASSOC    #############
                # from an old email dated July 23 2014:
                # Phenotypes associations are made to
                # imits colony_id+center+zygosity+gender

                phenotype_id = mp_term_id

                # it seems that sometimes phenotype ids are missing.
                # indicate here
                if phenotype_id is None or phenotype_id == '':
                    logger.warning("No phenotype id specified for row %d: %s",
                                   line_counter, str(row))
                    continue
                # hard coded ECO code
                eco_id = "ECO:0000015"

                # the association comes as a result of a g2p from
                # a procedure in a pipeline at a center and parameter tested

                assoc = G2PAssoc(g, self.name, sex_qualified_genotype_id,
                                 phenotype_id)
                assoc.add_evidence(eco_id)
                # assoc.set_score(float(p_value))

                # TODO add evidence instance using
                # pipeline_stable_id +
                # procedure_stable_id +
                # parameter_stable_id

                assoc.add_association_to_graph()
                assoc_id = assoc.get_association_id()

                # add a free-text description; if either number fails to
                # parse, both are reported verbatim as read from the file
                try:
                    effect_text = str(round(float(effect_size), 5))
                    p_text = "{:.4e}".format(float(p_value))
                except ValueError:
                    effect_text = str(effect_size)
                    p_text = "{0}".format(p_value)
                description = ' '.join((
                    mp_term_name, 'phenotype determined by',
                    phenotyping_center, 'in an',
                    procedure_name, 'assay where',
                    parameter_name.strip(),
                    'was measured with an effect_size of',
                    effect_text,
                    '(p =', p_text, ').'))

                study_bnode = self._add_study_provenance(
                    impc_map, impress_map, phenotyping_center, colony,
                    project_fullname, pipeline_name, pipeline_stable_id,
                    procedure_stable_id, procedure_name,
                    parameter_stable_id, parameter_name,
                    statistical_method, resource_name)

                evidence_line_bnode = self._add_evidence(
                    assoc_id, eco_id, impc_map, p_value, percentage_change,
                    effect_size, study_bnode)

                self._add_assertion_provenance(assoc_id, evidence_line_bnode,
                                               impc_map)

                model.addDescription(evidence_line_bnode, description)

                # resource_id = resource_name
                # assoc.addSource(g, assoc_id, resource_id)

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

        return
Пример #17
0
    def _process_phenotype_data(self, limit):
        """
        NOTE: If a Strain carries more than one mutation,
        then each Mutation description,
        i.e., the set: (
            Mutation Type - Chromosome - Gene Symbol -
            Gene Name - Allele Symbol - Allele Name)
        will require a separate line.

        Note that MMRRC curates phenotypes to alleles,
        even though they distribute only one file with the
        phenotypes appearing to be associated with a strain.

        So, here we process the allele-to-phenotype relationships separately
        from the strain-to-allele relationships.

        :param limit:
        :return:

        """

        src_key = 'catalog'
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        fname = '/'.join((self.rawdir, self.files[src_key]['file']))

        self.strain_hash = {}
        self.id_label_hash = {}
        genes_with_no_ids = set()
        stem_cell_class = self.globaltt['stem cell']
        mouse_taxon = self.globaltt['Mus musculus']
        geno = Genotype(graph)
        with open(fname, 'r', encoding="utf8") as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            # This MMRRC catalog data file was generated on YYYY-MM-DD
            # insert or check date w/dataset
            line = next(reader)
            # gen_date = line[-10:]
            line = next(reader)
            col = self.files['catalog']['columns']
            if col != line:
                LOG.error(
                    '%s\nExpected Headers:\t%s\nRecived Headers:\t%s\n',
                    src_key, col, line)
                LOG.info(set(col) - set(line))

            line = next(reader)
            if line != []:
                LOG.warning('Expected third line to be blank. got "%s" instead', line)

            for row in reader:
                strain_id = row[col.index('STRAIN/STOCK_ID')].strip()
                strain_label = row[col.index('STRAIN/STOCK_DESIGNATION')]
                # strain_type_symbol = row[col.index('STRAIN_TYPE')]
                strain_state = row[col.index('STATE')]
                mgi_allele_id = row[col.index('MGI_ALLELE_ACCESSION_ID')].strip()
                mgi_allele_symbol = row[col.index('ALLELE_SYMBOL')]
                # mgi_allele_name = row[col.index('ALLELE_NAME')]
                # mutation_type = row[col.index('MUTATION_TYPE')]
                # chrom = row[col.index('CHROMOSOME')]
                mgi_gene_id = row[col.index('MGI_GENE_ACCESSION_ID')].strip()
                mgi_gene_symbol = row[col.index('GENE_SYMBOL')].strip()
                mgi_gene_name = row[col.index('GENE_NAME')]
                # sds_url = row[col.index('SDS_URL')]
                # accepted_date = row[col.index('ACCEPTED_DATE')]
                mpt_ids = row[col.index('MPT_IDS')].strip()
                pubmed_nums = row[col.index('PUBMED_IDS')].strip()
                research_areas = row[col.index('RESEARCH_AREAS')].strip()

                if self.test_mode and (strain_id not in self.test_ids) \
                        or mgi_gene_name == 'withdrawn':
                    continue

                # strip off stuff after the dash -
                # is the holding center important?
                # MMRRC:00001-UNC --> MMRRC:00001
                strain_id = re.sub(r'-\w+$', '', strain_id)

                self.id_label_hash[strain_id] = strain_label

                # get the variant or gene to save for later building of
                # the genotype
                if strain_id not in self.strain_hash:
                    self.strain_hash[strain_id] = {
                        'variants': set(), 'genes': set()}

                # flag bad ones
                if mgi_allele_id[:4] != 'MGI:' and mgi_allele_id != '':
                    LOG.error("Erroneous MGI allele id: %s", mgi_allele_id)
                    if mgi_allele_id[:3] == 'MG:':
                        mgi_allele_id = 'MGI:' + mgi_allele_id[3:]
                    else:
                        mgi_allele_id = ''

                if mgi_allele_id != '':
                    self.strain_hash[strain_id]['variants'].add(mgi_allele_id)
                    self.id_label_hash[mgi_allele_id] = mgi_allele_symbol

                    # use the following if needing to add the sequence alteration types
                    # var_type = self.localtt[mutation_type]
                    # make a sequence alteration for this variant locus,
                    # and link the variation type to it
                    # sa_id = '_'+re.sub(r':','',mgi_allele_id)+'SA'
                    # if self.nobnodes:
                    #     sa_id = ':'+sa_id
                    # gu.addIndividualToGraph(g, sa_id, None, var_type)
                    # geno.addSequenceAlterationToVariantLocus(sa_id, mgi_allele_id)

                # scrub out any spaces, fix known issues
                mgi_gene_id = re.sub(r'\s+', '', mgi_gene_id)
                if mgi_gene_id == 'NULL':
                    mgi_gene_id = ''
                elif mgi_gene_id[:7] == 'GeneID:':
                    mgi_gene_id = 'NCBIGene:' + mgi_gene_id[7:]

                if mgi_gene_id != '':
                    [curie, localid] = mgi_gene_id.split(':')
                    if curie not in ['MGI', 'NCBIGene']:
                        LOG.info("MGI Gene id not recognized: %s", mgi_gene_id)
                    self.strain_hash[strain_id]['genes'].add(mgi_gene_id)
                    self.id_label_hash[mgi_gene_id] = mgi_gene_symbol

                # catch some errors - too many. report summary at the end
                # some things have gene labels, but no identifiers - report
                if mgi_gene_symbol != '' and mgi_gene_id == '':
                    # LOG.error(
                    #    "Gene label with no MGI identifier for strain %s: %s",
                    #    strain_id, mgi_gene_symbol)
                    genes_with_no_ids.add(mgi_gene_symbol)
                    # make a temp id for genes that aren't identified ... err wow.
                    # tmp_gene_id = '_' + mgi_gene_symbol
                    # self.id_label_hash[tmp_gene_id.strip()] = mgi_gene_symbol
                    # self.strain_hash[strain_id]['genes'].add(tmp_gene_id)

                # split apart the mp ids
                # ataxia [MP:0001393] ,hypoactivity [MP:0001402] ...
                # mpt_ids are a comma delimited list
                # labels with MP terms following in brackets
                phenotype_ids = []
                if mpt_ids != '':
                    for lb_mp in mpt_ids.split(r','):
                        lb_mp = lb_mp.strip()
                        if lb_mp[-1:] == ']' and lb_mp[-12:-8] == '[MP:':
                            phenotype_ids.append(lb_mp[-11:-2])

                # pubmed ids are space delimited
                pubmed_ids = []
                if pubmed_nums != '':
                    for pm_num in re.split(r'\s+', pubmed_nums):
                        pmid = 'PMID:' + pm_num.strip()
                        pubmed_ids.append(pmid)
                        ref = Reference(graph, pmid, self.globaltt['journal article'])
                        ref.addRefToGraph()

                # https://www.mmrrc.org/catalog/sds.php?mmrrc_id=00001
                # is a good example of 4 genotype parts

                model.addClassToGraph(mouse_taxon, None)
                if research_areas == '':
                    research_areas = None
                else:
                    research_areas = 'Research Areas: ' + research_areas
                strain_type = mouse_taxon
                if strain_state == 'ES':
                    strain_type = stem_cell_class
                model.addIndividualToGraph(   # an inst of mouse??
                    strain_id, strain_label, strain_type, research_areas)
                model.makeLeader(strain_id)

                # phenotypes are associated with the alleles
                for pid in phenotype_ids:
                    # assume the phenotype label is in some ontology
                    model.addClassToGraph(pid, None)
                    if mgi_allele_id is not None and mgi_allele_id != '':
                        assoc = G2PAssoc(
                            graph, self.name, mgi_allele_id, pid,
                            self.globaltt['has phenotype'])
                        for p in pubmed_ids:
                            assoc.add_source(p)
                        assoc.add_association_to_graph()
                    else:
                        LOG.info("Phenotypes and no allele for %s", strain_id)

                if not self.test_mode and (
                        limit is not None and reader.line_num > limit):
                    break

            # now that we've collected all of the variant information, build it
            # we don't know their zygosities
            for s in self.strain_hash:
                h = self.strain_hash.get(s)
                variants = h['variants']
                genes = h['genes']
                vl_set = set()
                # make variant loci for each gene
                if len(variants) > 0:
                    for var in variants:
                        vl_id = var.strip()
                        vl_symbol = self.id_label_hash[vl_id]
                        geno.addAllele(
                            vl_id, vl_symbol, self.globaltt['variant_locus'])
                        vl_set.add(vl_id)
                        if len(variants) == 1 and len(genes) == 1:
                            for gene in genes:
                                geno.addAlleleOfGene(vl_id, gene)
                        else:
                            geno.addAllele(vl_id, vl_symbol)
                else:  # len(vars) == 0
                    # it's just anonymous variants in some gene
                    for gene in genes:
                        vl_id = '_:' + re.sub(r':', '', gene) + '-VL'
                        vl_symbol = self.id_label_hash[gene]+'<?>'
                        self.id_label_hash[vl_id] = vl_symbol
                        geno.addAllele(
                            vl_id, vl_symbol, self.globaltt['variant_locus'])
                        geno.addGene(gene, self.id_label_hash[gene])
                        geno.addAlleleOfGene(vl_id, gene)
                        vl_set.add(vl_id)

                # make the vslcs
                vl_list = sorted(vl_set)
                vslc_list = []
                for vl in vl_list:
                    # for unknown zygosity
                    vslc_id = re.sub(r'^_', '', vl)+'U'
                    vslc_id = re.sub(r':', '', vslc_id)
                    vslc_id = '_:' + vslc_id
                    vslc_label = self.id_label_hash[vl] + '/?'
                    self.id_label_hash[vslc_id] = vslc_label
                    vslc_list.append(vslc_id)
                    geno.addPartsToVSLC(
                        vslc_id, vl, None, self.globaltt['indeterminate'],
                        self.globaltt['has_variant_part'], None)
                    model.addIndividualToGraph(
                        vslc_id, vslc_label,
                        self.globaltt['variant single locus complement'])
                if len(vslc_list) > 0:
                    if len(vslc_list) > 1:
                        gvc_id = '-'.join(vslc_list)
                        gvc_id = re.sub(r'_|:', '', gvc_id)
                        gvc_id = '_:'+gvc_id
                        gvc_label = '; '.join(self.id_label_hash[v] for v in vslc_list)
                        model.addIndividualToGraph(
                            gvc_id, gvc_label,
                            self.globaltt['genomic_variation_complement'])
                        for vslc_id in vslc_list:
                            geno.addVSLCtoParent(vslc_id, gvc_id)
                    else:
                        # the GVC == VSLC, so don't have to make an extra piece
                        gvc_id = vslc_list.pop()
                        gvc_label = self.id_label_hash[gvc_id]

                    genotype_label = gvc_label + ' [n.s.]'
                    bkgd_id = re.sub(
                        r':', '', '-'.join((
                            self.globaltt['unspecified_genomic_background'], s)))
                    genotype_id = '-'.join((gvc_id, bkgd_id))
                    bkgd_id = '_:' + bkgd_id
                    geno.addTaxon(mouse_taxon, bkgd_id)
                    geno.addGenomicBackground(
                        bkgd_id, 'unspecified (' + s + ')',
                        self.globaltt['unspecified_genomic_background'],
                        "A placeholder for the unspecified genetic background for " + s)
                    geno.addGenomicBackgroundToGenotype(
                        bkgd_id, genotype_id,
                        self.globaltt['unspecified_genomic_background'])
                    geno.addParts(
                        gvc_id, genotype_id, self.globaltt['has_variant_part'])
                    geno.addGenotype(genotype_id, genotype_label)
                    graph.addTriple(
                        s, self.globaltt['has_genotype'], genotype_id)
                else:
                    # LOG.debug(
                    #   "Strain %s is not making a proper genotype.", s)
                    pass

            LOG.warning(
                "The following gene symbols did not list identifiers: %s",
                str(sorted(list(genes_with_no_ids))))
            LOG.error(
                '%i symbols given are missing their gene identifiers',
                len(genes_with_no_ids))

        return
Пример #18
0
    def _process_QTLs_genomic_location(
            self, raw, taxon_id, build_id, build_label, limit=None):
        """
        Add QTLs, their trait associations and their locations to the graph.

        Reads the gzip-compressed, tab-delimited file ``raw``.  Every
        non-comment row must have nine columns; the ninth column holds
        ``key=value`` attributes separated by ``;``.  For each row this:

        * adds the QTL (``AQTL:<QTL_ID>``) as an individual with its taxon
        * adds the publication (``PMID:...`` or ``AQTLPub:...``) when a
          ``PUBMED_ID`` attribute is present
        * adds a QTL-to-trait (``AQTLTrait:<trait_ID>``) association with
          evidence ``ECO:0000061`` and, when it parses as a number,
          the ``P-value`` attribute as the association score
        * adds start/end locations of the QTL on the chromosome of the build

        Rows lacking a ``QTL_ID`` or ``trait_ID`` attribute are skipped
        with a warning.  In test mode only QTLs listed in ``self.test_ids``
        are processed.

        :param raw: path of the gzipped QTL location file
        :param taxon_id: taxon identifier handed to the graph helpers
        :param build_id: identifier of the genome build
        :param build_label: label of the genome build
        :param limit: when not in test mode, stop once more than this many
            lines have been read; ``None`` means no limit
        :return: None
        :raises ValueError: if a non-comment row does not have nine columns
        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)
        geno = Genotype(g)
        line_counter = 0
        bad_attr_count = 0  # attributes seen that were not key=value pairs
        # assume that chrs get added to the genome elsewhere

        eco_id = "ECO:0000061"  # Quantitative Trait Analysis Evidence
        logger.info("Processing QTL locations for %s", taxon_id)
        with gzip.open(raw, 'rt', encoding='ISO-8859-1') as tsvfile:
            reader = csv.reader(tsvfile, delimiter="\t")
            for row in reader:
                line_counter += 1
                # comment lines start with '#'
                if row and row[0].startswith('#'):
                    continue

                # only some of the nine columns are used below
                (chromosome, _qtl_source, _qtl_type, start_bp, stop_bp,
                 _frame, strand, _score, attr) = row

                # Chr.Z   Animal QTLdb    Production_QTL  33954873      34023581        .       .       .
                # QTL_ID=2242;Name="Spleen percentage";Abbrev="SPLP";PUBMED_ID=17012160;trait_ID=2234;
                # trait="Spleen percentage";breed="leghorn";"FlankMarkers=ADL0022";VTO_name="spleen mass";
                # CMO_name="spleen weight to body weight ratio";Map_Type="Linkage";Model="Mendelian";
                # Test_Base="Chromosome-wise";Significance="Significant";P-value="<0.05";F-Stat="5.52";
                # Variance="2.94";Dominance_Effect="-0.002";Additive_Effect="0.01"

                # make dictionary of attributes
                # keys are:
                # QTL_ID,Name,Abbrev,PUBMED_ID,trait_ID,trait,FlankMarkers,
                # VTO_name,Map_Type,Significance,P-value,Model,
                # Test_Base,Variance, Bayes-value,PTO_name,gene_IDsrc,peak_cM,
                # CMO_name,gene_ID,F-Stat,LOD-score,Additive_Effect,
                # Dominance_Effect,Likelihood_Ratio,LS-means,Breed,
                # trait (duplicate with Name),Variance,Bayes-value,
                # F-Stat,LOD-score,Additive_Effect,Dominance_Effect,
                # Likelihood_Ratio,LS-means

                # deal with poorly formed attributes
                if re.search(r'"FlankMarkers";', attr):
                    attr = re.sub(r'FlankMarkers;', '', attr)

                # Split on the FIRST '=' only, so a value that itself
                # contains '=' no longer raises.  Items are visited in file
                # order, so a repeated key deterministically keeps its last
                # value.
                attribute_dict = {}
                for item in attr.replace('"', '').split(';'):
                    key, sep, value = item.partition('=')
                    if sep == '':
                        # not a key=value pair; the empty string left by
                        # a trailing ';' is expected and not counted
                        if item != '':
                            bad_attr_count += 1
                        continue
                    attribute_dict[key] = value

                qtl_num = attribute_dict.get('QTL_ID')
                trait_num = attribute_dict.get('trait_ID')
                if qtl_num is None or trait_num is None:
                    # without both identifiers no association can be made
                    logger.warning(
                        "Skipping line %i of %s: missing QTL_ID or trait_ID",
                        line_counter, raw)
                    continue

                if self.testMode and int(qtl_num) not in self.test_ids:
                    continue

                # make association between QTL and trait
                qtl_id = 'AQTL:' + str(qtl_num)
                model.addIndividualToGraph(qtl_id, None, geno.genoparts['QTL'])
                geno.addTaxon(taxon_id, qtl_id)

                trait_id = 'AQTLTrait:' + trait_num

                # if pub is in attributes, add it to the association
                pub_id = attribute_dict.get('PUBMED_ID')
                if pub_id is not None:
                    if pub_id.startswith('ISU'):
                        pub_id = 'AQTLPub:' + pub_id.strip()
                        reference = Reference(g, pub_id)
                    else:
                        pub_id = 'PMID:' + pub_id.strip()
                        reference = Reference(
                            g, pub_id, Reference.ref_types['journal_article'])
                    reference.addRefToGraph()

                # Add QTL to graph
                assoc = G2PAssoc(
                    g, self.name, qtl_id, trait_id,
                    model.object_properties['is_marker_for'])
                assoc.add_evidence(eco_id)
                assoc.add_source(pub_id)

                p_text = attribute_dict.get('P-value')
                if p_text is not None:
                    # values look like "<0.05"; some use a decimal comma
                    p_text = p_text.replace('<', '').replace(',', '.')
                    # str.isnumeric() is False for anything containing '.',
                    # so let float() decide what is a number
                    try:
                        p_score = float(p_text)
                    except ValueError:
                        p_score = None
                    if p_score is not None:
                        assoc.set_score(p_score)

                assoc.add_association_to_graph()
                # TODO make association to breed
                # (which means making QTL feature in Breed background)

                # get location of QTL
                chromosome = re.sub(r'Chr\.', '', chromosome)
                chrom_id = makeChromID(chromosome, taxon_id, 'CHR')

                chrom_in_build_id = \
                    makeChromID(chromosome, build_id, 'MONARCH')
                geno.addChromosomeInstance(
                    chromosome, build_id, build_label, chrom_id)
                qtl_feature = Feature(g, qtl_id, None, geno.genoparts['QTL'])
                if start_bp == '':
                    start_bp = None
                qtl_feature.addFeatureStartLocation(
                    start_bp, chrom_in_build_id, strand,
                    [Feature.types['FuzzyPosition']])
                if stop_bp == '':
                    stop_bp = None
                qtl_feature.addFeatureEndLocation(
                    stop_bp, chrom_in_build_id, strand,
                    [Feature.types['FuzzyPosition']])
                qtl_feature.addTaxonToFeature(taxon_id)
                qtl_feature.addFeatureToGraph()

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

        # only complain when malformed attributes were actually seen
        if bad_attr_count > 0:
            logger.warning("Bad attribute flags in this file")
        logger.info("Done with QTL genomic mappings for %s", taxon_id)
Пример #19
0
    def _process_genes(self, taxid, limit=None):
        """
        Add the Ensembl genes of one taxon, and their peptides, to the graph.

        Reads the tab-delimited file configured in ``self.files[taxid]``.
        The first line is treated as a header and handed to
        ``self.check_fileheader``.  For each following row this:

        * adds the gene (``ENSEMBL:<id>``) as a class with its taxon
        * links the NCBIGene id: as an xref for human (taxid ``'9606'``),
          as an equivalent class otherwise
        * for human, adds the HGNC id as an equivalent class when present
        * adds the Ensembl peptide, and the UniProtKB entry when present,
          as gene products

        In test mode, rows whose non-empty entrez id is not in
        ``self.gene_ids`` are skipped.

        :param taxid: NCBI taxon number as a string, e.g. ``'9606'``
        :param limit: when not in test mode, stop once the reader has
            passed this line number; ``None`` means no limit
        :return: None
        :raises ValueError: if a required column name is not configured
        """
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph

        model = Model(graph)
        geno = Genotype(graph)

        raw = '/'.join((self.rawdir, self.files[taxid]['file']))
        col = list(self.columns['bmq_attributes'])
        if taxid != '9606' and 'hgnc_id' in col:
            col.remove('hgnc_id')
        col_exp = [
            self.columns['bmq_headers'][self.columns['bmq_attributes'].index(x)]
            for x in col]

        # Column positions do not change while reading,
        # so resolve them once instead of once per row.
        gene_col = col.index('ensembl_gene_id')
        name_col = col.index('external_gene_name')
        desc_col = col.index('description')
        biotype_col = col.index('gene_biotype')
        entrez_col = col.index('entrezgene_id')
        peptide_col = col.index('ensembl_peptide_id')
        uniprot_col = col.index('uniprotswissprot')
        # in the case of human genes, we also get the hgnc id
        hgnc_col = None
        if taxid == '9606' and 'hgnc_id' in col:
            hgnc_col = col.index('hgnc_id')

        LOG.info("Processing Ensembl genes for NCBITaxon:%s", taxid)
        with open(raw, 'r', encoding="utf8") as csvfile:
            reader = csv.reader(csvfile, delimiter='\t')
            header = next(reader, None)
            if header is None:
                # an empty file has nothing to process
                LOG.error("No header line found in %s", raw)
                return
            # A header mismatch does not stop processing; the result of
            # the check is intentionally not acted upon here.
            self.check_fileheader(col_exp, header)

            for row in reader:
                ensembl_gene_id = row[gene_col]
                external_gene_name = row[name_col]
                description = row[desc_col].strip()
                gene_biotype = row[biotype_col].strip()
                entrezgene = row[entrez_col].strip()
                ensembl_peptide_id = row[peptide_col].strip()
                uniprotswissprot = row[uniprot_col].strip()
                hgnc_curie = None
                if hgnc_col is not None:
                    hgnc_curie = row[hgnc_col].strip()

                if self.test_mode and entrezgene != '' and \
                        entrezgene not in self.gene_ids:
                    continue

                gene_id = 'ENSEMBL:' + ensembl_gene_id

                if description == '':
                    description = None

                gene_type_id = self.resolve(
                    gene_biotype, mandatory=False,
                    default=self.globaltt['polypeptide'])

                model.addClassToGraph(
                    gene_id, external_gene_name, gene_type_id, description)

                if entrezgene != '':
                    entrez_curie = 'NCBIGene:{}'.format(entrezgene)
                    if taxid == '9606':
                        # Use HGNC for eq in human data
                        model.addXref(gene_id, entrez_curie)
                    else:
                        model.addEquivalentClass(gene_id, entrez_curie)

                if hgnc_curie:
                    model.addEquivalentClass(gene_id, hgnc_curie)
                geno.addTaxon('NCBITaxon:' + taxid, gene_id)

                if ensembl_peptide_id != '':
                    peptide_curie = 'ENSEMBL:{}'.format(ensembl_peptide_id)
                    model.addIndividualToGraph(peptide_curie, None, gene_type_id)
                    geno.addGeneProduct(gene_id, peptide_curie)
                    if uniprotswissprot != '':
                        uniprot_curie = 'UniProtKB:{}'.format(uniprotswissprot)
                        model.addIndividualToGraph(uniprot_curie, None, gene_type_id)
                        geno.addGeneProduct(gene_id, uniprot_curie)
                        model.addXref(peptide_curie, uniprot_curie)

                if not self.test_mode and limit is not None and reader.line_num > limit:
                    break
Пример #20
0
    def _process_genes(self, limit=None):
        """
        Add the HGNC genes to the graph.

        Reads the tab-delimited file configured in ``self.files['genes']``;
        every line, including the header, must have exactly the 49 columns
        unpacked below.  For each data row this:

        * adds the gene (by its HGNC id) as a class; genes whose locus type
          is ``withdrawn`` are deprecated, all others are made clique leaders
        * adds NCBIGene and ENSEMBL equivalents, and a single OMIM
          equivalent when that OMIM id is not a disease
        * adds taxon ``NCBITaxon:9606``
        * adds ``PMID:<n> is_about <gene>`` for each listed publication
        * makes the gene a subsequence of its chromosome band or,
          when the location names a chromosome only, of that chromosome

        In test mode, rows whose non-empty entrez id is not in
        ``self.gene_ids`` are skipped.

        :param limit: when not in test mode, stop once more than this many
            lines have been read; ``None`` means no limit
        :return: None
        :raises ValueError: if a line does not have exactly 49 columns
        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        geno = Genotype(g)
        model = Model(g)
        raw = '/'.join((self.rawdir, self.files['genes']['file']))
        line_counter = 0
        logger.info("Processing HGNC genes")

        # A chromosome name followed by an arm (p|q) or by the end of the
        # location.  '$' must sit outside the character class: inside
        # '[pq$]' it is a literal dollar sign, so locations naming only a
        # chromosome (e.g. 'X', 'MT') could never match.
        chr_regex = re.compile(r'(\d+|X|Y|Z|W|MT)(?:[pq]|$)')
        band_regex = re.compile(r'([pq][A-H\d]?\d?(?:\.\d+)?)')

        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            # curl -s ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt | head -1 | tr '\t' '\n' | grep -n  .
            for row in filereader:
                # the full unpack doubles as a check on the column count
                (hgnc_id,
                 symbol,
                 name,
                 locus_group,
                 locus_type,
                 status,
                 location,
                 location_sortable,
                 alias_symbol,
                 alias_name,
                 prev_symbol,
                 prev_name,
                 gene_family,
                 gene_family_id,
                 date_approved_reserved,
                 date_symbol_changed,
                 date_name_changed,
                 date_modified,
                 entrez_id,
                 ensembl_gene_id,
                 vega_id,
                 ucsc_id,
                 ena,
                 refseq_accession,
                 ccds_id,
                 uniprot_ids,
                 pubmed_id,
                 mgd_id,
                 rgd_id,
                 lsdb,
                 cosmic,
                 omim_id,
                 mirbase,
                 homeodb,
                 snornabase,
                 bioparadigms_slc,
                 orphanet,
                 pseudogene_org,
                 horde_id,
                 merops,
                 imgt,
                 iuphar,
                 kznf_gene_catalog,
                 mamit_trnadb,
                 cd,
                 lncrnadb,
                 enzyme_id,
                 intermediate_filament_db,
                 rna_central_ids) = row

                line_counter += 1

                # skip header
                if line_counter <= 1:
                    continue

                if self.testMode and entrez_id != '' \
                        and int(entrez_id) not in self.gene_ids:
                    continue

                if name == '':
                    name = None
                gene_type_id = self._get_gene_type(locus_type)
                model.addClassToGraph(hgnc_id, symbol, gene_type_id, name)
                if locus_type == 'withdrawn':
                    model.addDeprecatedClass(hgnc_id)
                else:
                    model.makeLeader(hgnc_id)
                if entrez_id != '':
                    model.addEquivalentClass(
                        hgnc_id, 'NCBIGene:' + entrez_id)
                if ensembl_gene_id != '':
                    model.addEquivalentClass(
                        hgnc_id, 'ENSEMBL:' + ensembl_gene_id)
                # a '|' means several OMIM ids are listed; those are skipped
                if omim_id != '' and "|" not in omim_id:
                    omim_curie = 'OMIM:' + omim_id
                    if not DipperUtil.is_omim_disease(omim_curie):
                        model.addEquivalentClass(hgnc_id, omim_curie)

                geno.addTaxon('NCBITaxon:9606', hgnc_id)

                # add pubs as "is about"
                # pubmed ids are '|' delimited; strip BEFORE testing for
                # emptiness so a blank fragment never yields a bare 'PMID:'
                for pmid in pubmed_id.split('|'):
                    pmid = pmid.strip()
                    if pmid != '':
                        g.addTriple(
                            'PMID:' + pmid,
                            model.object_properties['is_about'], hgnc_id)

                # add chr location
                # sometimes two are listed, like: 10p11.2 or 17q25
                # -- there are only 2 of these FRA10A and MPFD
                # sometimes listed like "1 not on reference assembly"
                # sometimes listed like 10q24.1-q24.3
                # sometimes like 11q11 alternate reference locus
                chr_match = chr_regex.match(location)
                if chr_match is not None:
                    chrom = chr_match.group(1)
                    f = Feature(g, hgnc_id, None, None)
                    band_match = band_regex.search(location)
                    if band_match is not None:
                        band = chrom + band_match.group(1)
                        # add the chr band as the parent to this gene
                        # as a feature but assume that the band is created
                        # as a class with properties elsewhere in Monochrom
                        band_id = makeChromID(band, 'NCBITaxon:9606', 'CHR')
                        model.addClassToGraph(band_id, None)
                        f.addSubsequenceOfFeature(band_id)
                    else:
                        # no band given: attach the gene to the chromosome
                        chrom_id = makeChromID(chrom, 'NCBITaxon:9606', 'CHR')
                        model.addClassToGraph(chrom_id, None)
                        f.addSubsequenceOfFeature(chrom_id)

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

            # end loop through file
Пример #21
0
    def _process_data(self, source, limit=None):
        """
        This function will process the data files from Coriell.
        We make the assumption that any alleles listed are variants
        (alternates to w.t.)

        Triples: (examples)

        :NIGMSrepository a CLO_0000008 #repository
        label : NIGMS Human Genetic Cell Repository
        foaf:page
         https://catalog.coriell.org/0/sections/collections/NIGMS/?SsId=8

        line_id a CL_0000057,  #fibroblast line
            derives_from patient_id
            part_of :NIGMSrepository
            RO:model_of OMIM:disease_id

        patient id a foaf:person,
            label: "fibroblast from patient 12345 with disease X"
            member_of family_id  #what is the right thing here?
            SIO:race EFO:caucasian  #subclass of EFO:0001799
            in_taxon NCBITaxon:9606
            dc:description Literal(remark)
            RO:has_phenotype OMIM:disease_id
            GENO:has_genotype genotype_id

        family_id a owl:NamedIndividual
            foaf:page
             "https://catalog.coriell.org/0/Sections/BrowseCatalog/FamilyTypeSubDetail.aspx?PgId=402&fam=2104&coll=GM"

        genotype_id a intrinsic_genotype
            GENO:has_alternate_part allelic_variant_id
            we don't necessarily know much about the genotype,
            other than the allelic variant. also there's the sex here

        pub_id mentions cell_line_id

        :param raw:
        :param limit:
        :return:

        """
        raw = '/'.join((self.rawdir, self.files[source]['file']))

        LOG.info("Processing Data from %s", raw)

        if self.testMode:  # set the graph to build
            graph = self.testgraph
        else:
            graph = self.graph

        family = Family(graph)
        model = Model(graph)

        line_counter = 1
        geno = Genotype(graph)
        diputil = DipperUtil()
        col = self.files[source]['columns']
        # affords access with
        # x = row[col.index('x')].strip()

        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar=r'"')
            # we can keep a close watch on changing file formats
            fileheader = next(filereader, None)
            fileheader = [c.lower() for c in fileheader]
            if col != fileheader:  # assert
                LOG.error('Expected  %s to have columns: %s', raw, col)
                LOG.error('But Found %s to have columns: %s', raw, fileheader)
                raise AssertionError('Incomming data headers have changed.')

            for row in filereader:
                line_counter += 1
                if len(row) != len(col):
                    LOG.warning('Expected %i values but find %i in  row %i',
                                len(col), len(row), line_counter)
                    continue

                # (catalog_id, description, omim_number, sample_type,
                # cell_line_available, dna_in_stock, dna_ref, gender, age,
                # race, ethnicity, affected, karyotype, relprob, mutation,
                # gene, family_id, collection, url, cat_remark, pubmed_ids,
                # family_member, variant_id, dbsnp_id, species) = row

                # example:
                # GM00003,HURLER SYNDROME,607014,Fibroblast,Yes,No,
                #       ,Female,26 YR,Caucasian,,,,
                # parent,,,39,NIGMS Human Genetic Cell Repository,
                # http://ccr.coriell.org/Sections/Search/Sample_Detail.aspx?Ref=GM00003,
                # 46;XX; clinically normal mother of a child with Hurler syndrome;
                #       proband not in Repository,,
                # 2,,18343,H**o sapiens

                catalog_id = row[col.index('catalog_id')].strip()

                if self.testMode and catalog_id not in self.test_lines:
                    # skip rows not in our test lines, when in test mode
                    continue

                # ###########    BUILD REQUIRED VARIABLES    ###########

                # Make the cell line ID
                cell_line_id = 'Coriell:' + catalog_id
                # Map the cell/sample type
                cell_type = self.resolve(row[col.index('sample_type')].strip())
                # on fail cell_type = self.globaltt['cell'] ?

                # Make a cell line label
                collection = row[col.index('collection')].strip()
                line_label = collection.partition(' ')[0] + '-' + catalog_id

                # Map the repository/collection
                repository = self.localtt[collection]

                # patients are uniquely identified by one of:
                # dbsnp id (which is == an individual haplotype)
                # family id + family member (if present) OR
                # probands are usually family member zero
                # cell line id
                # since some patients have >1 cell line derived from them,
                # we must make sure that the genotype is attached to
                # the patient, and can be inferred to the cell line
                # examples of repeated patients are:
                #   famid=1159, member=1; fam=152,member=1

                # Make the patient ID

                # make an anonymous patient
                patient_id = '_:person'
                fam_id = row[col.index('fam')].strip()
                fammember = row[col.index('fammember')].strip()
                if fam_id != '':
                    patient_id = '-'.join((patient_id, fam_id, fammember))
                else:
                    # make an anonymous patient
                    patient_id = '-'.join((patient_id, catalog_id))

                # properties of the individual patients:  sex, family id,
                # member/relproband, description descriptions are
                # really long and ugly SCREAMING text, so need to clean up
                # the control cases are so odd with this labeling scheme;
                # but we'll deal with it as-is for now.
                description = row[col.index('description')].strip()
                short_desc = (description.split(';')[0]).capitalize()

                gender = row[col.index('gender')].strip().lower()
                affected = row[col.index('affected')].strip()
                relprob = row[col.index('relprob')].strip()

                if affected == '':
                    affected = 'unspecified'
                elif affected in self.localtt:
                    affected = self.localtt[affected]
                else:
                    LOG.warning('Novel Affected status  %s at row: %i of %s',
                                affected, line_counter, raw)
                patient_label = ' '.join((affected, gender, relprob))
                if relprob == 'proband':
                    patient_label = ' '.join(
                        (patient_label.strip(), 'with', short_desc))
                else:
                    patient_label = ' '.join(
                        (patient_label.strip(), 'of proband with', short_desc))

                # #############    BUILD THE CELL LINE    #############

                # Adding the cell line as a typed individual.
                cell_line_reagent_id = self.globaltt['cell line']

                model.addIndividualToGraph(cell_line_id, line_label,
                                           cell_line_reagent_id)

                # add the equivalent id == dna_ref
                dna_ref = row[col.index('dna_ref')].strip()
                if dna_ref != '' and dna_ref != catalog_id:
                    equiv_cell_line = 'Coriell:' + dna_ref
                    # some of the equivalent ids are not defined
                    # in the source data; so add them
                    model.addIndividualToGraph(equiv_cell_line, None,
                                               cell_line_reagent_id)
                    model.addSameIndividual(cell_line_id, equiv_cell_line)

                # Cell line derives from patient
                geno.addDerivesFrom(cell_line_id, patient_id)
                geno.addDerivesFrom(cell_line_id, cell_type)

                # Cell line a member of repository
                family.addMember(repository, cell_line_id)

                cat_remark = row[col.index('cat_remark')].strip()

                if cat_remark != '':
                    model.addDescription(cell_line_id, cat_remark)

                # Cell age_at_sampling
                # TODO add the age nodes when modeled properly in #78
                # if (age != ''):
                # this would give a BNode that is an instance of Age.
                # but i don't know how to connect
                # the age node to the cell line? we need to ask @mbrush
                # age_id = '_'+re.sub('\s+','_',age)
                # gu.addIndividualToGraph(
                #   graph,age_id,age,self.globaltt['age'])
                # gu.addTriple(
                #   graph,age_id,self.globaltt['has measurement value'],age,
                #   True)

                # #############    BUILD THE PATIENT    #############

                # Add the patient ID as an individual.
                model.addPerson(patient_id, patient_label)
                # TODO map relationship to proband as a class
                # (what ontology?)

                # Add race of patient
                # FIXME: Adjust for subcategories based on ethnicity field
                # EDIT: There are 743 different entries for ethnicity...
                # Too many to map?
                # Add ethnicity as literal in addition to the mapped race?
                # Adjust the ethnicity txt (if using)
                # to initial capitalization to remove ALLCAPS

                # TODO race should go into the individual's background
                # and abstracted out to the Genotype class punting for now.
                # if race != '':
                #    mapped_race = self.resolve(race)
                #    if mapped_race is not None:
                #        gu.addTriple(
                #           g,patient_id,self.globaltt['race'], mapped_race)
                #        model.addSubClass(
                #           mapped_race,self.globaltt['ethnic_group'])

                # #############    BUILD THE FAMILY    #############

                # Add triples for family_id, if present.
                if fam_id != '':
                    family_comp_id = 'CoriellFamily:' + fam_id

                    family_label = ' '.join(
                        ('Family of proband with', short_desc))

                    # Add the family ID as a named individual
                    model.addIndividualToGraph(family_comp_id, family_label,
                                               self.globaltt['family'])

                    # Add the patient as a member of the family
                    family.addMemberOf(patient_id, family_comp_id)

                # #############    BUILD THE GENOTYPE   #############

                # the important things to pay attention to here are:
                # karyotype = chr rearrangements  (somatic?)
                # mutation = protein-level mutation as a label,
                # often from omim
                # gene = gene symbol - TODO get id
                # variant_id = omim variant ids (; delimited)
                # dbsnp_id = snp individual ids = full genotype?

                # note GM00633 is a good example of chromosomal variation
                # - do we have enough to capture this?
                # GM00325 has both abnormal karyotype and variation

                # make an assumption that if the taxon is blank,
                # that it is human!
                species = row[col.index('species')].strip()
                if species is None or species == '':
                    species = 'H**o sapiens'
                taxon = self.resolve(species)

                # if there's a dbSNP id,
                # this is actually the individual's genotype
                genotype_id = None
                genotype_label = None

                dbsnp_id = row[col.index('dbsnp_id')].strip()
                if dbsnp_id != '':
                    genotype_id = 'dbSNPIndividual:' + dbsnp_id

                omim_map = {}
                gvc_id = None

                # some of the karyotypes are encoded
                # with terrible hidden codes. remove them here
                # i've seen a <98> character
                karyotype = row[col.index('karyotype')].strip()
                karyotype = diputil.remove_control_characters(karyotype)
                karyotype_id = None
                if karyotype.strip() != '':
                    karyotype_id = '_:' + re.sub('MONARCH:', '',
                                                 self.make_id(karyotype))
                    # add karyotype as karyotype_variation_complement
                    model.addIndividualToGraph(
                        karyotype_id, karyotype,
                        self.globaltt['karyotype_variation_complement'])
                    # TODO break down the karyotype into parts
                    # and map into GENO. depends on #77

                    # place the karyotype in a location(s).
                    karyo_chrs = self._get_affected_chromosomes_from_karyotype(
                        karyotype)
                    for chrom in karyo_chrs:
                        chr_id = makeChromID(chrom, taxon, 'CHR')
                        # add an anonymous sequence feature,
                        # each located on chr
                        karyotype_feature_id = '-'.join((karyotype_id, chrom))
                        karyotype_feature_label = \
                            'some karyotype alteration on chr' + str(chrom)
                        feat = Feature(graph, karyotype_feature_id,
                                       karyotype_feature_label,
                                       self.globaltt['sequence_alteration'])
                        feat.addFeatureStartLocation(None, chr_id)
                        feat.addFeatureToGraph()
                        geno.addParts(karyotype_feature_id, karyotype_id,
                                      self.globaltt['has_variant_part'])

                gene = row[col.index('gene')].strip()
                mutation = row[col.index('mutation')].strip()
                if gene != '':
                    vl = gene + '(' + mutation + ')'

                # fix the variant_id so it's always in the same order
                variant_id = row[col.index('variant_id')].strip()
                vids = variant_id.split(';')
                variant_id = ';'.join(sorted(list(set(vids))))

                if karyotype.strip() != '' and not self._is_normal_karyotype(
                        karyotype):

                    gvc_id = karyotype_id
                    if variant_id != '':
                        gvc_id = '_:' + variant_id.replace(';', '-') + '-' \
                            + re.sub(r'\w*:', '', karyotype_id)
                    if mutation.strip() != '':
                        gvc_label = '; '.join((vl, karyotype))
                    else:
                        gvc_label = karyotype
                elif variant_id.strip() != '':
                    gvc_id = '_:' + variant_id.replace(';', '-')
                    gvc_label = vl
                else:
                    # wildtype?
                    pass

                # add the karyotype to the gvc.
                # use reference if normal karyotype
                karyo_rel = self.globaltt['has_variant_part']
                if self._is_normal_karyotype(karyotype):
                    karyo_rel = self.globaltt['has_reference_part']
                if karyotype_id is not None \
                        and not self._is_normal_karyotype(karyotype) \
                        and gvc_id is not None and karyotype_id != gvc_id:
                    geno.addParts(karyotype_id, gvc_id, karyo_rel)

                if variant_id.strip() != '':
                    # split the variants & add them as part of the genotype
                    # we don't necessarily know their zygosity,
                    # just that they are part of the genotype variant ids
                    # are from OMIM, so prefix as such we assume that the
                    # sequence alts will be defined in OMIM not here
                    # TODO sort the variant_id list, if the omim prefix is
                    # the same, then assume it's the locus make a hashmap
                    # of the omim id to variant id list;
                    # then build the genotype hashmap is also useful for
                    # removing the "genes" from the list of "phenotypes"

                    # will hold gene/locus id to variant list
                    omim_map = {}

                    locus_num = None
                    for var in variant_id.split(';'):
                        # handle omim-style and odd var ids
                        # like 610661.p.R401X
                        mch = re.match(r'(\d+)\.+(.*)', var.strip())
                        if mch is not None and len(mch.groups()) == 2:
                            (locus_num, var_num) = mch.groups()

                        if locus_num is not None and locus_num not in omim_map:
                            omim_map[locus_num] = [var_num]
                        else:
                            omim_map[locus_num] += [var_num]

                    for omim in omim_map:
                        # gene_id = 'OMIM:' + omim  # TODO unused
                        vslc_id = '_:' + '-'.join(
                            [omim + '.' + a for a in omim_map.get(omim)])
                        vslc_label = vl
                        # we don't really know the zygosity of
                        # the alleles at all.
                        # so the vslcs are just a pot of them
                        model.addIndividualToGraph(
                            vslc_id, vslc_label,
                            self.globaltt['variant single locus complement'])
                        for var in omim_map.get(omim):
                            # this is actually a sequence alt
                            allele1_id = 'OMIM:' + omim + '.' + var
                            geno.addSequenceAlteration(allele1_id, None)

                            # assume that the sa -> var_loc -> gene
                            # is taken care of in OMIM
                            geno.addPartsToVSLC(
                                vslc_id, allele1_id, None,
                                self.globaltt['indeterminate'],
                                self.globaltt['has_variant_part'])

                        if vslc_id != gvc_id:
                            geno.addVSLCtoParent(vslc_id, gvc_id)

                if affected == 'unaffected':
                    # let's just say that this person is wildtype
                    model.addType(patient_id, self.globaltt['wildtype'])
                elif genotype_id is None:
                    # make an anonymous genotype id (aka blank node)
                    genotype_id = '_:geno' + catalog_id.strip()

                # add the gvc
                if gvc_id is not None:
                    model.addIndividualToGraph(
                        gvc_id, gvc_label,
                        self.globaltt['genomic_variation_complement'])

                    # add the gvc to the genotype
                    if genotype_id is not None:
                        if affected == 'unaffected':
                            rel = self.globaltt['has_reference_part']
                        else:
                            rel = self.globaltt['has_variant_part']
                        geno.addParts(gvc_id, genotype_id, rel)

                    if karyotype_id is not None \
                            and self._is_normal_karyotype(karyotype):
                        if gvc_label is not None and gvc_label != '':
                            genotype_label = '; '.join((gvc_label, karyotype))
                        elif karyotype is not None:
                            genotype_label = karyotype
                        if genotype_id is None:
                            genotype_id = karyotype_id
                        else:
                            geno.addParts(karyotype_id, genotype_id,
                                          self.globaltt['has_reference_part'])
                    else:
                        genotype_label = gvc_label
                        # use the catalog id as the background
                    genotype_label += ' [' + catalog_id.strip() + ']'

                if genotype_id is not None and gvc_id is not None:
                    # only add the genotype if it has some parts
                    geno.addGenotype(genotype_id, genotype_label,
                                     self.globaltt['intrinsic_genotype'])
                    geno.addTaxon(taxon, genotype_id)
                    # add that the patient has the genotype
                    # TODO check if the genotype belongs to
                    # the cell line or to the patient
                    graph.addTriple(patient_id, self.globaltt['has_genotype'],
                                    genotype_id)
                else:
                    geno.addTaxon(taxon, patient_id)

                # TODO: Add sex/gender  (as part of the karyotype?)
                # = row[col.index('')].strip()
                # #############    DEAL WITH THE DISEASES   #############
                omim_num = row[col.index('omim_num')].strip()

                # we associate the disease to the patient
                if affected == 'affected' and omim_num != '':
                    for d in omim_num.split(';'):
                        if d is not None and d != '':
                            # if the omim number is in omim_map,
                            # then it is a gene not a pheno

                            # TEC - another place to use the mimTitle omim
                            # classifier omia & genereviews are using

                            if d not in omim_map:
                                disease_id = 'OMIM:' + d.strip()
                                # assume the label is taken care of in OMIM
                                model.addClassToGraph(disease_id, None)

                                # add the association:
                                #   the patient has the disease
                                assoc = G2PAssoc(graph, self.name, patient_id,
                                                 disease_id)
                                assoc.add_association_to_graph()

                                # this line is a model of this disease
                                # TODO abstract out model into
                                # it's own association class?
                                graph.addTriple(cell_line_id,
                                                self.globaltt['is model of'],
                                                disease_id)
                            else:
                                LOG.info('drop gene %s from disease list', d)

                # #############    ADD PUBLICATIONS   #############
                pubmed_ids = row[col.index('pubmed_ids')].strip()
                if pubmed_ids != '':
                    for s in pubmed_ids.split(';'):
                        pubmed_id = 'PMID:' + s.strip()
                        ref = Reference(graph, pubmed_id)
                        ref.setType(self.globaltt['journal article'])
                        ref.addRefToGraph()
                        graph.addTriple(pubmed_id, self.globaltt['mentions'],
                                        cell_line_id)

                if not self.testMode and (limit is not None
                                          and line_counter > limit):
                    break
        return
Пример #22
0
    def process_gaf(self, file, limit, id_map=None, eco_map=None):
        """
        Read a gzip-compressed, tab-delimited gene association file and add
        its annotations to the graph.

        Rows are skipped when they are comments (start with '!'), do not
        have 15-17 columns, lack a required field, carry a 'NOT' qualifier,
        use a UniProtKB id that is absent from ``id_map``, or (in test mode)
        are not an NCBIGene id listed in ``self.test_ids``.

        For every remaining row the gene class (with description, synonyms
        and taxon) is added, and an association from the gene to the GO id
        is built with its evidence and references. Rows with evidence code
        'IMP' and a non-empty with/from column additionally produce
        genotype-to-phenotype associations.

        :param file: path of the gzipped association file to read
        :param limit: stop once more than this many lines have been read;
            only checked after a fully processed row and ignored in test
            mode. None means no limit.
        :param id_map: optional mapping of UniProtKB accession to gene curie
        :param eco_map: optional mapping of evidence code symbol to the
            evidence id; a missing map is treated as "nothing mapped"
        :return: None
        """

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph

        # Without a map no evidence code can be resolved; treat it as empty
        # so the lookup below reports "not mapped" instead of raising
        # TypeError on None.
        if eco_map is None:
            eco_map = {}

        model = Model(graph)
        geno = Genotype(graph)
        LOG.info("Processing Gene Associations from %s", file)
        line_counter = 0
        uniprot_hit = 0
        uniprot_miss = 0
        # These helpers are only used to mint reagent-targeted gene ids.
        # They are bound up front so that a reagent id met in a file whose
        # taxon was not registered cannot raise UnboundLocalError.
        zfin = None
        wbase = None
        if 7955 in self.tax_ids:
            zfin = ZFIN(self.graph_type, self.are_bnodes_skized)
        if 6239 in self.tax_ids:
            wbase = WormBase(self.graph_type, self.are_bnodes_skized)

        with gzip.open(file, 'rb') as csvfile:
            filereader = csv.reader(
                io.TextIOWrapper(csvfile, newline=""), delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                # comments start with exclamation
                if re.match(r'!', ''.join(row)):
                    continue

                if len(row) > 17 or len(row) < 15:
                    LOG.warning(
                        "Wrong number of columns %i, expected 15 or 17\n%s",
                        len(row), row)
                    continue

                # pad short (15/16 column) rows so the unpacking below works
                row += [""] * (17 - len(row))

                (dbase,
                 gene_num,
                 gene_symbol,
                 qualifier,
                 go_id,
                 ref,
                 eco_symbol,
                 with_or_from,
                 aspect,
                 gene_name,
                 gene_synonym,
                 object_type,
                 taxon,
                 date,
                 assigned_by,
                 annotation_extension,
                 gene_product_form_id) = row

                # test for required fields
                if (dbase == '' or gene_num == '' or gene_symbol == '' or
                        go_id == '' or ref == '' or eco_symbol == '' or
                        aspect == '' or object_type == '' or taxon == '' or
                        date == '' or assigned_by == ''):
                    # the row text is passed as an argument, never spliced
                    # into the format string: a '%' in the data would
                    # otherwise break the log formatting
                    LOG.error(
                        "Missing required part of annotation on row %d:\n%s",
                        line_counter, '\t'.join(row))
                    continue

                # deal with qualifier NOT, contributes_to, colocalizes_with
                # (negated annotations are dropped entirely)
                if re.search(r'NOT', qualifier):
                    continue

                if dbase in self.localtt:
                    dbase = self.localtt[dbase]
                uniprotid = None
                gene_id = None
                if dbase == 'UniProtKB':
                    if id_map is not None and gene_num in id_map:
                        gene_id = id_map[gene_num]
                        uniprotid = ':'.join((dbase, gene_num))
                        (dbase, gene_num) = gene_id.split(':')
                        uniprot_hit += 1
                    else:
                        # LOG.warning(
                        #   "UniProt id %s  is without a 1:1 mapping to entrez/ensembl",
                        #    gene_num)
                        uniprot_miss += 1
                        continue
                else:
                    gene_num = gene_num.split(':')[-1]  # last
                    gene_id = ':'.join((dbase, gene_num))

                if self.test_mode and not(
                        re.match(r'NCBIGene', gene_id) and
                        int(gene_num) in self.test_ids):
                    continue

                model.addClassToGraph(gene_id, gene_symbol)
                if gene_name != '':
                    model.addDescription(gene_id, gene_name)
                if gene_synonym != '':
                    for syn in re.split(r'\|', gene_synonym):
                        model.addSynonym(gene_id, syn.strip())
                if re.search(r'\|', taxon):
                    # TODO add annotations with >1 taxon
                    LOG.info(
                        ">1 taxon (%s) on line %d.  skipping", taxon, line_counter)
                else:
                    tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
                    geno.addTaxon(tax_id, gene_id)

                assoc = Assoc(graph, self.name)
                assoc.set_subject(gene_id)
                assoc.set_object(go_id)

                try:
                    eco_id = eco_map[eco_symbol]
                    assoc.add_evidence(eco_id)
                except KeyError:
                    LOG.error("Evidence code (%s) not mapped", eco_symbol)

                # Normalise the pipe-separated references once per row; the
                # (prefix, curie) pairs are reused for the phenotype
                # associations derived further down.
                sources = []
                for raw_ref in re.split(r'\|', ref):
                    raw_ref = raw_ref.strip()
                    if raw_ref == '':
                        continue
                    prefix = raw_ref.split(':')[0]  # sidestep 'MGI:MGI:'
                    if prefix in self.localtt:
                        prefix = self.localtt[prefix]
                    sources.append(
                        (prefix, ':'.join((prefix, raw_ref.split(':')[-1]))))

                for prefix, source in sources:
                    refg = Reference(graph, source)
                    if prefix == 'PMID':
                        ref_type = self.globaltt['journal article']
                        refg.setType(ref_type)
                    refg.addRefToGraph()
                    assoc.add_source(source)

                # TODO add the source of the annotations from assigned by?

                rel = self.resolve(aspect, mandatory=False)
                if rel is not None and aspect == rel:
                    # resolve() handed the aspect back unchanged
                    if aspect == 'F' and re.search(r'contributes_to', qualifier):
                        assoc.set_relationship(self.globaltt['contributes to'])
                    else:
                        LOG.error(
                            "Aspect: %s with qualifier: %s  is not recognized",
                            aspect, qualifier)
                elif rel is not None:
                    assoc.set_relationship(rel)
                    assoc.add_association_to_graph()
                else:
                    LOG.warning("No predicate for association \n%s\n", str(assoc))

                # NOTE(review): the description is set after
                # add_association_to_graph() above; confirm it is still
                # meant to reach the graph.
                if uniprotid is not None:
                    assoc.set_description('Mapped from ' + uniprotid)
                # object_type should be one of:
                # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
                # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
                # If the precise product type is unknown,
                # gene_product should be used
                #######################################################################

                # Derive G2P Associations from IMP annotations
                # in version 2.1 Pipe will indicate 'OR'
                # and Comma will indicate 'AND'.
                # in version 2.0, multiple values are separated by pipes
                # where the pipe has been used to mean 'AND'
                if eco_symbol == 'IMP' and with_or_from != '':
                    phenotypeid = go_id+'PHENOTYPE'
                    # create phenotype associations
                    for with_item in re.split(r'\|', with_or_from):
                        if with_item == '' or re.match(
                                r'(UniProtKB|WBPhenotype|InterPro|HGNC)',
                                with_item):
                            LOG.warning(
                                "Don't know what having a uniprot id " +
                                "in the 'with' column means of %s", uniprotid)
                            continue
                        with_item = re.sub(r'MGI\:MGI\:', 'MGI:', with_item)
                        with_item = re.sub(r'WB:', 'WormBase:', with_item)

                        # for worms and fish, they might give a RNAi or MORPH
                        # in these cases make a reagent-targeted gene
                        if re.search('MRPHLNO|CRISPR|TALEN', with_item):
                            if zfin is None:
                                # taxon was not registered; build on demand
                                zfin = ZFIN(
                                    self.graph_type, self.are_bnodes_skized)
                            targeted_gene_id = zfin.make_targeted_gene_id(
                                gene_id, with_item)
                            geno.addReagentTargetedGene(
                                with_item, gene_id, targeted_gene_id)
                            # TODO PYLINT why is this needed?
                            # Redefinition of assoc type from
                            # dipper.models.assoc.Association.Assoc to
                            # dipper.models.assoc.G2PAssoc.G2PAssoc
                            assoc = G2PAssoc(
                                graph, self.name, targeted_gene_id, phenotypeid)
                        elif re.search(r'WBRNAi', with_item):
                            if wbase is None:
                                # taxon was not registered; build on demand
                                wbase = WormBase(
                                    self.graph_type, self.are_bnodes_skized)
                            targeted_gene_id = wbase.make_reagent_targeted_gene_id(
                                gene_id, with_item)
                            geno.addReagentTargetedGene(
                                with_item, gene_id, targeted_gene_id)
                            assoc = G2PAssoc(
                                graph, self.name, targeted_gene_id, phenotypeid)
                        else:
                            assoc = G2PAssoc(
                                graph, self.name, with_item, phenotypeid)
                        for _, source in sources:
                            assoc.add_source(source)
                            # experimental phenotypic evidence
                            assoc.add_evidence(
                                self.globaltt['experimental phenotypic evidence'])
                        assoc.add_association_to_graph()
                        # TODO should the G2PAssoc be
                        # the evidence for the GO assoc?

                if not self.test_mode and limit is not None and line_counter > limit:
                    break
            uniprot_tot = (uniprot_hit + uniprot_miss)
            uniprot_per = 0.0
            if uniprot_tot != 0:
                uniprot_per = 100.0 * uniprot_hit / uniprot_tot
            # '%.2f' (not '%f.2') gives the percentage with two decimals
            LOG.info(
                "Uniprot: %.2f%% of %i benefited from the 1/4 day id mapping download",
                uniprot_per, uniprot_tot)
        return
Пример #23
0
    def process_gaf(self, file, limit, id_map=None, eco_map=None):
        """
        Read a gzip-compressed, tab-delimited gene association file and add
        its annotations to the graph.

        Rows are skipped when they are comments (start with '!'), do not
        have 15-17 columns, lack a required field, carry a 'NOT' qualifier,
        use a UniProtKB id that is absent from ``id_map``, or (in test mode)
        are not an NCBIGene id listed in ``self.test_ids``.

        For every remaining row the gene class (with description, synonyms
        and taxon) is added, and an association from the gene to the GO id
        is built with its evidence and references. Rows with evidence code
        'IMP' and a non-empty with/from column additionally produce
        genotype-to-phenotype associations.

        :param file: path of the gzipped association file to read
        :param limit: stop once more than this many lines have been read;
            only checked after a fully processed row and ignored in test
            mode. None means no limit.
        :param id_map: optional mapping of UniProtKB accession to gene curie
        :param eco_map: optional mapping of evidence code symbol to the
            evidence id; a missing map is treated as "nothing mapped"
        :return: None
        """

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph

        # Without a map no evidence code can be resolved; treat it as empty
        # so the lookup below reports "not mapped" instead of raising
        # TypeError on None.
        if eco_map is None:
            eco_map = {}

        model = Model(graph)
        geno = Genotype(graph)
        LOG.info("Processing Gene Associations from %s", file)
        line_counter = 0
        uniprot_hit = 0
        uniprot_miss = 0
        # These helpers are only used to mint reagent-targeted gene ids.
        # They are bound up front so that a reagent id met in a file whose
        # taxon was not registered cannot raise UnboundLocalError.
        zfin = None
        wbase = None
        if '7955' in self.tax_ids:
            zfin = ZFIN(self.graph_type, self.are_bnodes_skized)
        if '6239' in self.tax_ids:
            wbase = WormBase(self.graph_type, self.are_bnodes_skized)

        with gzip.open(file, 'rb') as csvfile:
            filereader = csv.reader(io.TextIOWrapper(csvfile, newline=""),
                                    delimiter='\t',
                                    quotechar='\"')
            for row in filereader:
                line_counter += 1
                # comments start with exclamation
                if re.match(r'!', ''.join(row)):
                    continue

                if len(row) > 17 or len(row) < 15:
                    LOG.warning(
                        "Wrong number of columns %i, expected 15 or 17\n%s",
                        len(row), row)
                    continue

                # pad short (15/16 column) rows so the unpacking below works
                row += [""] * (17 - len(row))

                (dbase, gene_num, gene_symbol, qualifier, go_id, ref,
                 eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
                 object_type, taxon, date, assigned_by, annotation_extension,
                 gene_product_form_id) = row

                # test for required fields
                if (dbase == '' or gene_num == '' or gene_symbol == ''
                        or go_id == '' or ref == '' or eco_symbol == ''
                        or aspect == '' or object_type == '' or taxon == ''
                        or date == '' or assigned_by == ''):
                    # the row text is passed as an argument, never spliced
                    # into the format string: a '%' in the data would
                    # otherwise break the log formatting
                    LOG.error(
                        "Missing required part of annotation on row %d:\n%s",
                        line_counter, '\t'.join(row))
                    continue

                # deal with qualifier NOT, contributes_to, colocalizes_with
                # (negated annotations are dropped entirely)
                if re.search(r'NOT', qualifier):
                    continue

                if dbase in self.localtt:
                    dbase = self.localtt[dbase]
                uniprotid = None
                gene_id = None
                if dbase == 'UniProtKB':
                    if id_map is not None and gene_num in id_map:
                        gene_id = id_map[gene_num]
                        uniprotid = ':'.join((dbase, gene_num))
                        (dbase, gene_num) = gene_id.split(':')
                        uniprot_hit += 1
                    else:
                        # LOG.warning(
                        #   "UniProt id %s  is without a 1:1 mapping to entrez/ensembl",
                        #    gene_num)
                        uniprot_miss += 1
                        continue
                else:
                    gene_num = gene_num.split(':')[-1]  # last
                    gene_id = ':'.join((dbase, gene_num))

                if self.test_mode and not (re.match(r'NCBIGene', gene_id)
                                           and int(gene_num) in self.test_ids):
                    continue

                model.addClassToGraph(gene_id, gene_symbol)
                if gene_name != '':
                    model.addDescription(gene_id, gene_name)
                if gene_synonym != '':
                    for syn in re.split(r'\|', gene_synonym):
                        model.addSynonym(gene_id, syn.strip())
                if re.search(r'\|', taxon):
                    # TODO add annotations with >1 taxon
                    LOG.info(">1 taxon (%s) on line %d.  skipping", taxon,
                             line_counter)
                else:
                    tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
                    geno.addTaxon(tax_id, gene_id)

                assoc = Assoc(graph, self.name)
                assoc.set_subject(gene_id)
                assoc.set_object(go_id)

                try:
                    eco_id = eco_map[eco_symbol]
                    assoc.add_evidence(eco_id)
                except KeyError:
                    LOG.error("Evidence code (%s) not mapped", eco_symbol)

                # Normalise the pipe-separated references once per row; the
                # (prefix, curie) pairs are reused for the phenotype
                # associations derived further down.
                sources = []
                for raw_ref in re.split(r'\|', ref):
                    raw_ref = raw_ref.strip()
                    if raw_ref == '':
                        continue
                    prefix = raw_ref.split(':')[0]  # sidestep 'MGI:MGI:'
                    if prefix in self.localtt:
                        prefix = self.localtt[prefix]
                    sources.append(
                        (prefix, ':'.join((prefix, raw_ref.split(':')[-1]))))

                for prefix, source in sources:
                    refg = Reference(graph, source)
                    if prefix == 'PMID':
                        ref_type = self.globaltt['journal article']
                        refg.setType(ref_type)
                    refg.addRefToGraph()
                    assoc.add_source(source)

                # TODO add the source of the annotations from assigned by?

                rel = self.resolve(aspect, mandatory=False)
                if rel is not None and aspect == rel:
                    # resolve() handed the aspect back unchanged
                    if aspect == 'F' and re.search(r'contributes_to',
                                                   qualifier):
                        assoc.set_relationship(self.globaltt['contributes to'])
                    else:
                        LOG.error(
                            "Aspect: %s with qualifier: %s  is not recognized",
                            aspect, qualifier)
                elif rel is not None:
                    assoc.set_relationship(rel)
                    assoc.add_association_to_graph()
                else:
                    LOG.warning("No predicate for association \n%s\n",
                                str(assoc))

                # NOTE(review): the description is set after
                # add_association_to_graph() above; confirm it is still
                # meant to reach the graph.
                if uniprotid is not None:
                    assoc.set_description('Mapped from ' + uniprotid)
                # object_type should be one of:
                # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
                # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
                # If the precise product type is unknown,
                # gene_product should be used
                #######################################################################

                # Derive G2P Associations from IMP annotations
                # in version 2.1 Pipe will indicate 'OR'
                # and Comma will indicate 'AND'.
                # in version 2.0, multiple values are separated by pipes
                # where the pipe has been used to mean 'AND'
                if eco_symbol == 'IMP' and with_or_from != '':
                    phenotypeid = go_id + 'PHENOTYPE'
                    # create phenotype associations
                    for with_item in re.split(r'\|', with_or_from):
                        if with_item == '' or re.match(
                                r'(UniProtKB|WBPhenotype|InterPro|HGNC)',
                                with_item):
                            LOG.warning(
                                "Don't know what having a uniprot id " +
                                "in the 'with' column means of %s", uniprotid)
                            continue
                        with_item = re.sub(r'MGI\:MGI\:', 'MGI:', with_item)
                        with_item = re.sub(r'WB:', 'WormBase:', with_item)

                        # for worms and fish, they might give a RNAi or MORPH
                        # in these cases make a reagent-targeted gene
                        if re.search('MRPHLNO|CRISPR|TALEN', with_item):
                            if zfin is None:
                                # taxon was not registered; build on demand
                                zfin = ZFIN(self.graph_type,
                                            self.are_bnodes_skized)
                            targeted_gene_id = zfin.make_targeted_gene_id(
                                gene_id, with_item)
                            geno.addReagentTargetedGene(
                                with_item, gene_id, targeted_gene_id)
                            # TODO PYLINT why is this needed?
                            # Redefinition of assoc type from
                            # dipper.models.assoc.Association.Assoc to
                            # dipper.models.assoc.G2PAssoc.G2PAssoc
                            assoc = G2PAssoc(graph, self.name,
                                             targeted_gene_id, phenotypeid)
                        elif re.search(r'WBRNAi', with_item):
                            if wbase is None:
                                # taxon was not registered; build on demand
                                wbase = WormBase(self.graph_type,
                                                 self.are_bnodes_skized)
                            targeted_gene_id = wbase.make_reagent_targeted_gene_id(
                                gene_id, with_item)
                            geno.addReagentTargetedGene(
                                with_item, gene_id, targeted_gene_id)
                            assoc = G2PAssoc(graph, self.name,
                                             targeted_gene_id, phenotypeid)
                        else:
                            assoc = G2PAssoc(graph, self.name, with_item,
                                             phenotypeid)
                        for _, source in sources:
                            assoc.add_source(source)
                            # experimental phenotypic evidence
                            assoc.add_evidence(self.globaltt[
                                'experimental phenotypic evidence'])
                        assoc.add_association_to_graph()
                        # TODO should the G2PAssoc be
                        # the evidence for the GO assoc?

                if not self.test_mode and limit is not None and line_counter > limit:
                    break
            uniprot_tot = (uniprot_hit + uniprot_miss)
            uniprot_per = 0.0
            if uniprot_tot != 0:
                uniprot_per = 100.0 * uniprot_hit / uniprot_tot
            LOG.info(
                "Uniprot: %.2f%% of %i benefited from the 1/4 day id mapping download",
                uniprot_per, uniprot_tot)
        return
Пример #24
0
    def _process_phenotype_data(self, limit):
        """
        Read the MMRRC catalog file and add strains, their alleles and genes,
        placeholder genotypes, and allele-to-phenotype associations
        to the graph.

        NOTE: If a Strain carries more than one mutation,
        then each Mutation description,
        i.e., the set: (
            Mutation Type - Chromosome - Gene Symbol -
            Gene Name - Allele Symbol - Allele Name)
        will require a separate line.

        Note that MMRRC curates phenotypes to alleles,
        even though they distribute only one file with the
        phenotypes appearing to be associated with a strain.

        So, here we process the allele-to-phenotype relationships separately
        from the strain-to-allele relationships.

        As a side effect ``self.strain_hash`` (strain id -> sets of variant
        and gene ids) and ``self.id_label_hash`` (id -> label) are reset
        and repopulated.

        :param limit: when not None and not in test mode, stop reading rows
            once the line counter (which also counts the 3 header lines)
            exceeds this value
        :return: None

        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)
        line_counter = 0
        fname = '/'.join((self.rawdir, self.files['catalog']['file']))

        self.strain_hash = {}
        self.id_label_hash = {}
        genes_with_no_ids = set()
        stem_cell_class = 'CL:0000034'
        mouse_taxon = 'NCBITaxon:10090'
        geno = Genotype(g)
        with open(fname, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            for row in filereader:
                line_counter += 1
                # skip the first 3 lines which are header, etc.
                if line_counter < 4:
                    continue

                # unpacking all 17 columns also enforces the expected
                # row width (a ValueError is raised otherwise)
                (strain_id, strain_label, strain_type_symbol, strain_state,
                 mgi_allele_id, mgi_allele_symbol, mgi_allele_name,
                 mutation_type, chrom, mgi_gene_id, mgi_gene_symbol,
                 mgi_gene_name, sds_url, accepted_date, mp_ids, pubmed_nums,
                 research_areas) = row

                if self.testMode and (strain_id not in self.test_ids) \
                        or mgi_gene_name == 'withdrawn':
                    continue

                # strip off stuff after the dash -
                # is the holding center important?
                # MMRRC:00001-UNC --> MMRRC:00001
                strain_id = re.sub(r'-\w+$', '', strain_id)

                self.id_label_hash[strain_id] = strain_label

                # get the variant or gene to save for later building of
                # the genotype
                if strain_id not in self.strain_hash:
                    self.strain_hash[strain_id] = {
                        'variants': set(),
                        'genes': set()
                    }

                # clean up the bad one
                if mgi_allele_id == 'multiple mutation':
                    logger.error("Erroneous gene id: %s", mgi_allele_id)
                    mgi_allele_id = ''

                if mgi_allele_id != '':
                    self.strain_hash[strain_id]['variants'].add(mgi_allele_id)
                    self.id_label_hash[mgi_allele_id] = mgi_allele_symbol
                    # sequence alteration types (from mutation_type) are
                    # deliberately not added for the variant locus here

                # scrub out any spaces
                mgi_gene_id = re.sub(r'\s+', '', mgi_gene_id)
                if mgi_gene_id.strip() != '':
                    if re.match(r'Gene\s*ID:', mgi_gene_id, re.I):
                        # substitute with the same case-insensitivity used
                        # for the match, otherwise a lower/mixed-case prefix
                        # is recognised but never rewritten
                        mgi_gene_id = re.sub(
                            r'Gene\s*ID:\s*', 'NCBIGene:', mgi_gene_id,
                            flags=re.I)
                    elif not re.match(r'MGI', mgi_gene_id):
                        logger.info("Gene id not recognized: %s", mgi_gene_id)
                        if re.match(r'\d+$', mgi_gene_id):
                            # assume that if it's all numbers, then it's MGI
                            mgi_gene_id = 'MGI:' + str(mgi_gene_id)
                            logger.info("Assuming numerics are MGI.")
                    self.strain_hash[strain_id]['genes'].add(mgi_gene_id)
                    self.id_label_hash[mgi_gene_id] = mgi_gene_symbol

                # catch some errors -
                # some things have gene labels, but no identifiers - report
                if mgi_gene_symbol.strip() != '' and mgi_gene_id == '':
                    logger.error(
                        "Gene label with no identifier for strain %s: %s",
                        strain_id, mgi_gene_symbol)
                    genes_with_no_ids.add(mgi_gene_symbol.strip())
                    # no temporary id is minted for unidentified genes

                # split apart the mp ids
                # ataxia [MP:0001393] ,hypoactivity [MP:0001402] ...
                # mp_ids are now a comma delimited list
                # with MP terms in brackets
                phenotype_ids = []
                if mp_ids != '':
                    for i in re.split(r',', mp_ids):
                        i = i.strip()
                        mps = re.search(r'\[(.*)\]', i)
                        if mps is not None:
                            mp_id = mps.group(1).strip()
                            phenotype_ids.append(mp_id)

                # pubmed ids are whitespace delimited; str.split() with no
                # argument drops leading/trailing whitespace so no empty
                # 'PMID:' reference is created
                pubmed_ids = []
                for i in pubmed_nums.split():
                    pmid = 'PMID:' + i
                    pubmed_ids.append(pmid)
                    r = Reference(g, pmid,
                                  Reference.ref_types['journal_article'])
                    r.addRefToGraph()

                # https://www.mmrrc.org/catalog/sds.php?mmrrc_id=00001
                # is a good example of 4 genotype parts

                model.addClassToGraph(mouse_taxon, None)
                if research_areas.strip() == '':
                    research_areas = None
                else:
                    research_areas = 'Research Areas: ' + research_areas
                strain_type = mouse_taxon
                if strain_state == 'ES':
                    strain_type = stem_cell_class
                model.addIndividualToGraph(
                    strain_id, strain_label, strain_type,
                    research_areas)  # an inst of mouse??
                model.makeLeader(strain_id)

                # phenotypes are associated with the alleles
                for pid in phenotype_ids:
                    # assume the phenotype label is in the ontology
                    model.addClassToGraph(pid, None)
                    if mgi_allele_id is not None and mgi_allele_id != '':
                        assoc = G2PAssoc(
                            g, self.name, mgi_allele_id, pid,
                            model.object_properties['has_phenotype'])
                        for p in pubmed_ids:
                            assoc.add_source(p)
                        assoc.add_association_to_graph()
                    else:
                        logger.info("Phenotypes and no allele for %s",
                                    strain_id)

                if not self.testMode and (limit is not None
                                          and line_counter > limit):
                    break

            # now that we've collected all of the variant information, build it
            # we don't know their zygosities
            for s, h in self.strain_hash.items():
                variants = h['variants']
                genes = h['genes']
                vl_set = set()
                # make variant loci for each gene
                if variants:
                    for v in variants:
                        vl_id = v
                        vl_symbol = self.id_label_hash[vl_id]
                        geno.addAllele(vl_id, vl_symbol,
                                       geno.genoparts['variant_locus'])
                        vl_set.add(vl_id)
                        # only link allele to gene when the pairing
                        # is unambiguous (exactly one of each)
                        if len(variants) == 1 and len(genes) == 1:
                            for gene in genes:
                                geno.addAlleleOfGene(vl_id, gene)
                        else:
                            geno.addAllele(vl_id, vl_symbol)
                else:
                    # no named variants:
                    # it's just anonymous variants in some gene
                    for gene in genes:
                        vl_id = '_:' + re.sub(r':', '', gene) + '-VL'
                        vl_symbol = self.id_label_hash[gene] + '<?>'
                        self.id_label_hash[vl_id] = vl_symbol
                        geno.addAllele(vl_id, vl_symbol,
                                       geno.genoparts['variant_locus'])
                        geno.addGene(gene, self.id_label_hash[gene])
                        geno.addAlleleOfGene(vl_id, gene)
                        vl_set.add(vl_id)

                # make the vslcs (sorted so the derived ids are stable)
                vl_list = sorted(vl_set)
                vslc_list = []
                for vl in vl_list:
                    # for unknown zygosity
                    vslc_id = re.sub(r'^_', '', vl) + 'U'
                    vslc_id = re.sub(r':', '', vslc_id)
                    vslc_id = '_:' + vslc_id
                    vslc_label = self.id_label_hash[vl] + '/?'
                    self.id_label_hash[vslc_id] = vslc_label
                    vslc_list.append(vslc_id)
                    geno.addPartsToVSLC(
                        vslc_id, vl, None, geno.zygosity['indeterminate'],
                        geno.object_properties['has_alternate_part'], None)
                    model.addIndividualToGraph(
                        vslc_id, vslc_label,
                        geno.genoparts['variant_single_locus_complement'])

                if not vslc_list:
                    # this strain does not make a proper genotype
                    continue

                if len(vslc_list) > 1:
                    gvc_id = '-'.join(vslc_list)
                    gvc_id = re.sub(r'_|:', '', gvc_id)
                    gvc_id = '_:' + gvc_id
                    gvc_label = \
                        '; '.join(self.id_label_hash[v] for v in vslc_list)
                    model.addIndividualToGraph(
                        gvc_id, gvc_label,
                        geno.genoparts['genomic_variation_complement'])
                    for vslc_id in vslc_list:
                        geno.addVSLCtoParent(vslc_id, gvc_id)
                else:
                    # the GVC == VSLC, so don't have to make an extra piece
                    gvc_id = vslc_list[0]
                    gvc_label = self.id_label_hash[gvc_id]

                genotype_label = gvc_label + ' [n.s.]'
                bkgd_id = \
                    re.sub(r':', '', '-'.join(
                        (geno.genoparts['unspecified_genomic_background'],
                         s)))
                # the genotype id is built from the un-prefixed background id
                genotype_id = '-'.join((gvc_id, bkgd_id))
                bkgd_id = '_:' + bkgd_id
                geno.addTaxon(mouse_taxon, bkgd_id)
                geno.addGenomicBackground(
                    bkgd_id, 'unspecified (' + s + ')',
                    geno.genoparts['unspecified_genomic_background'],
                    "A placeholder for the " +
                    "unspecified genetic background for " + s)
                geno.addGenomicBackgroundToGenotype(
                    bkgd_id, genotype_id,
                    geno.genoparts['unspecified_genomic_background'])
                geno.addParts(gvc_id, genotype_id,
                              geno.object_properties['has_alternate_part'])
                geno.addGenotype(genotype_id, genotype_label)
                g.addTriple(s, geno.object_properties['has_genotype'],
                            genotype_id)

            logger.warning(
                "The following gene symbols did not list identifiers: %s",
                str(sorted(genes_with_no_ids)))

        return
Пример #25
0
    def _process_haplotype(
            self, hap_id, hap_label, chrom_num, chrom_pos, context,
            risk_allele_frequency, mapped_gene, so_ontology):

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        geno = Genotype(graph)
        model = Model(graph)
        # add the feature to the graph
        hap_description = None
        if risk_allele_frequency != '' and risk_allele_frequency != 'NR':
            hap_description = str(risk_allele_frequency) + ' [risk allele frequency]'

        model.addIndividualToGraph(
            hap_id, hap_label.strip(), self.globaltt['haplotype'], hap_description)
        geno.addTaxon(self.globaltt["H**o sapiens"], hap_id)

        snp_labels = re.split(r';\s?', hap_label)
        chrom_nums = re.split(r';\s?', chrom_num)
        chrom_positions = re.split(r';\s?', chrom_pos)
        context_list = re.split(r';\s?', context)
        mapped_genes = re.split(r';\s?', mapped_gene)
        snp_curies = list()

        for index, snp in enumerate(snp_labels):
            snp_curie, snp_type = self._get_curie_and_type_from_id(snp)
            if snp_type is None:
                # make blank node
                snp_curie = self.make_id(snp, "_")

            graph.addTriple(hap_id, self.globaltt['has_variant_part'], snp_curie)
            snp_curies.append(snp_curie)

        # courtesy http://stackoverflow.com/a/16720915
        length = len(snp_labels)
        if not all(len(lst) == length
                   for lst in [chrom_nums, chrom_positions, context_list]):
            LOG.warning(
                "Unexpected data field for haplotype %s \n "
                "will not add snp details", hap_label)
            return

        variant_in_gene_count = 0
        for index, snp_curie in enumerate(snp_curies):
            self._add_snp_to_graph(
                snp_curie, snp_labels[index], chrom_nums[index],
                chrom_positions[index], context_list[index])

            if len(mapped_genes) == len(snp_labels):
                so_class = self.resolve(context_list[index])
                # removed the '+' for recursive  one-or-more rdfs:subClassOf  paths
                # just so it did not return an empty graph
                so_query = """
SELECT ?variant_label
    WHERE {{
        {0} rdfs:subClassOf {1} ;
        rdfs:label ?variant_label .
    }}
                """.format(so_class, self.globaltt['gene_variant'])

                query_result = so_ontology.query(so_query)

                if len(list(query_result)) == 1:
                    gene_id = DipperUtil.get_ncbi_id_from_symbol(mapped_genes[index])

                    if gene_id is not None:
                        geno.addAffectedLocus(snp_curie, gene_id)
                        geno.addAffectedLocus(hap_id, gene_id)
                        variant_in_gene_count += 1

                gene_id = DipperUtil.get_ncbi_id_from_symbol(mapped_genes[index])
                if gene_id is not None:
                    graph.addTriple(
                        snp_curie, self.resolve(context_list[index]), gene_id)

            else:
                LOG.warning(
                    "More mapped genes than snps, cannot disambiguate for %s",
                    hap_label)

        # Seperate in case we want to apply a different relation
        # If not this is redundant with triples added above
        if len(mapped_genes) == variant_in_gene_count and len(set(mapped_genes)) == 1:
            gene_id = DipperUtil.get_ncbi_id_from_symbol(mapped_genes[0])
            geno.addAffectedLocus(hap_id, gene_id)

        return
Пример #26
0
    def _process_genes(self, limit=None):
        """
        Parse the HGNC gene file and add each gene, its cross references,
        taxon, publications and chromosomal location to the graph.

        The file is located via ``self.rawdir`` and ``self.files['genes']``;
        its first row is validated against the expected column list with
        ``self.check_fileheader`` and the process exits on a mismatch.

        :param limit: when not None and not in test mode, stop once the
            reader's line number exceeds this value
        :return: None
        """

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph

        geno = Genotype(graph)
        model = Model(graph)

        raw = '/'.join((self.rawdir, self.files['genes']['file']))
        col = self.files['genes']['columns']
        LOG.info("Processing HGNC genes")

        # NOTE(review): '$' inside the character class is a literal dollar
        # sign, not end-of-string, so a bare chromosome such as "X"
        # never matches -- confirm whether that is intended
        chr_pattern = re.compile(r'(\d+|X|Y|Z|W|MT)[pq$]')
        band_pattern = re.compile(r'([pq][A-H\d]?\d?(?:\.\d+)?)')

        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')

            row = next(filereader)
            if not self.check_fileheader(col, row):
                exit(-1)

            # Resolve the positions of the columns we use once,
            # instead of searching the column list again for every row.
            # Other columns listed in `col` are not used here.
            # To generate per-column assignments:
            # head -1 hgnc_complete_set.txt.1 | tr '\t' '\n' |
            # sed "s/\(.*\)/\1 = row[col.index(\'\1\')]/g"
            hgnc_id_idx = col.index('hgnc_id')
            symbol_idx = col.index('symbol')
            name_idx = col.index('name')
            locus_type_idx = col.index('locus_type')
            location_idx = col.index('location')
            entrez_id_idx = col.index('entrez_id')
            ensembl_gene_id_idx = col.index('ensembl_gene_id')
            pubmed_id_idx = col.index('pubmed_id')
            omim_id_idx = col.index('omim_id')

            for row in filereader:
                hgnc_id = row[hgnc_id_idx].strip()
                symbol = row[symbol_idx].strip()
                name = row[name_idx].strip()
                locus_type = row[locus_type_idx].strip()
                location = row[location_idx].strip()
                entrez_id = row[entrez_id_idx].strip()
                ensembl_gene_id = row[ensembl_gene_id_idx].strip()
                pubmed_ids = row[pubmed_id_idx].strip()  # pipe seperated!
                omim_ids = row[omim_id_idx].strip()  # pipe seperated!

                if self.test_mode and entrez_id != '' and \
                        entrez_id not in self.gene_ids:
                    continue

                if name == '':
                    name = None

                if locus_type == 'withdrawn':
                    model.addDeprecatedClass(hgnc_id)
                else:
                    gene_type_id = self.resolve(locus_type, False)  # withdrawn -> None?
                    # the class is only added when the locus type
                    # resolved to something other than itself
                    if gene_type_id != locus_type:
                        model.addClassToGraph(hgnc_id, symbol, gene_type_id, name)
                    model.makeLeader(hgnc_id)

                if entrez_id != '':
                    model.addEquivalentClass(hgnc_id, 'NCBIGene:' + entrez_id)

                if ensembl_gene_id != '':
                    model.addEquivalentClass(hgnc_id, 'ENSEMBL:' + ensembl_gene_id)

                for omim_id in omim_ids.split('|'):
                    if omim_id in self.omim_replaced:
                        repl = self.omim_replaced[omim_id]
                        LOG.warning('%s is replaced with %s', omim_id, repl)
                        for omim in repl:
                            # a replacement id missing from the type table
                            # is skipped instead of raising KeyError
                            if omim in self.omim_type and \
                                    self.omim_type[omim] == self.globaltt['gene']:
                                omim_id = omim

                    if omim_id in self.omim_type and \
                            self.omim_type[omim_id] == self.globaltt['gene']:
                        model.addEquivalentClass(hgnc_id, 'OMIM:' + omim_id)

                geno.addTaxon(self.hs_txid, hgnc_id)

                # add pubs as "is about"
                for pubmed_id in pubmed_ids.split('|'):
                    pubmed_id = pubmed_id.strip()
                    if pubmed_id == '':
                        # ''.split('|') yields [''];
                        # do not emit a bare 'PMID:' subject
                        continue
                    graph.addTriple(
                        'PMID:' + pubmed_id, self.globaltt['is_about'], hgnc_id)

                # add chr location
                # sometimes two are listed, like: 10p11.2 or 17q25
                # -- there are only 2 of these FRA10A and MPFD
                # sometimes listed like "1 not on reference assembly"
                # sometimes listed like 10q24.1-q24.3
                # sometimes like 11q11 alternate reference locus
                chr_match = chr_pattern.match(location)
                if chr_match is not None:
                    chrom = chr_match.group(1)
                    chrom_id = makeChromID(chrom, self.hs_txid, 'CHR')
                    band_match = band_pattern.search(location)
                    feat = Feature(graph, hgnc_id, None, None)
                    if band_match is not None:
                        band = chrom + band_match.group(1)
                        # add the chr band as the parent to this gene
                        # as a feature but assume that the band is created
                        # as a class with properties elsewhere in Monochrom
                        band_id = makeChromID(band, self.hs_txid, 'CHR')
                        model.addClassToGraph(band_id, None)
                        feat.addSubsequenceOfFeature(band_id)
                    else:
                        model.addClassToGraph(chrom_id, None)
                        feat.addSubsequenceOfFeature(chrom_id)

                if not self.test_mode and limit is not None and \
                        filereader.line_num > limit:
                    break
Пример #27
0
    def add_orthologs_by_gene_group(self, graph, gene_ids):
        """
        This will get orthologies between human and other vertebrate genomes
        based on the gene_group annotation pipeline from NCBI.
        More information can be learned here:
        http://www.ncbi.nlm.nih.gov/news/03-13-2014-gene-provides-orthologs-regions/
        The method for associations is described in
        [PMCID:3882889](http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3882889/)
        == [PMID:24063302](http://www.ncbi.nlm.nih.gov/pubmed/24063302/).
        Because these are only between human and vertebrate genomes,
        they will certainly miss out on very distant orthologies,
        and should not be considered complete.

        We do not run this within the NCBI parser itself;
        rather it is a convenience function for others parsers to call.

        Only rows whose relationship is 'Ortholog' are used.
        A gene is looked up through the groups in which it appears in the
        'Other_GeneID' column; every member of such a group (including the
        group lead and the queried gene itself) gets an orthology
        association to the queried gene.

        :param graph: graph the classes, taxa and associations are added to
        :param gene_ids:  Gene ids to fetch the orthology
            (an 'NCBIGene:' prefix, if present, is removed for the lookup)
        :return: None

        """
        src_key = 'gene_group'
        LOG.info("getting gene groups")
        src_file = '/'.join((self.rawdir, self.files[src_key]['file']))
        found_counter = 0
        # because many of the orthologous groups are grouped by human gene,
        # we need to do this by generating two-way hash

        # group_id => orthologs
        # ortholog id => group
        # this will be the fastest approach, though not memory-efficient.
        geno = Genotype(graph)
        model = Model(graph)
        group_to_orthology = {}
        gene_to_group = {}
        gene_to_taxon = {}
        col = self.files[src_key]['columns']

        with gzip.open(src_file, 'rb') as tsv:
            row = tsv.readline().decode().strip().split('\t')
            row[0] = row[0][1:]  # strip octothorp
            if not self.check_fileheader(col, row):
                # keep going on a best-effort basis, but say so
                LOG.warning(
                    "Header of %s differs from the expected columns; "
                    "continuing with the expected column order", src_file)

            # resolve the column positions once rather than per row
            tax_a_idx = col.index('tax_id')
            gene_a_idx = col.index('GeneID')
            rel_idx = col.index('relationship')
            tax_b_idx = col.index('Other_tax_id')
            gene_b_idx = col.index('Other_GeneID')

            for row in tsv:
                row = row.decode().strip().split('\t')
                tax_a = row[tax_a_idx]
                gene_a = row[gene_a_idx].strip()
                rel = row[rel_idx]
                tax_b = row[tax_b_idx]
                gene_b = row[gene_b_idx].strip()

                if rel != 'Ortholog':
                    continue

                # gene_a is the group lead;
                # also add the lead itself as a member of its group
                members = group_to_orthology.setdefault(gene_a, set())
                members.add(gene_b)
                members.add(gene_a)

                gene_to_group.setdefault(gene_b, set()).add(gene_a)

                gene_to_taxon[gene_a] = tax_a
                gene_to_taxon[gene_b] = tax_b

            # end loop through gene_group file
        LOG.debug("Finished hashing gene groups")
        LOG.debug("Making orthology associations")
        for gid in gene_ids:
            gene_num = re.sub(r'NCBIGene:', '', gid)
            group_nums = gene_to_group.get(gene_num)
            if group_nums is None:
                continue
            for group_num in group_nums:
                orthologs = group_to_orthology.get(group_num)
                if orthologs is None:
                    continue
                for orth in orthologs:
                    # NOTE(review): orth may equal the queried gene,
                    # yielding a self-orthology association -- confirm
                    oid = 'NCBIGene:' + str(orth)
                    model.addClassToGraph(oid, None,
                                          self.globaltt['gene'])
                    otaxid = 'NCBITaxon:' + str(gene_to_taxon[orth])
                    geno.addTaxon(otaxid, oid)
                    assoc = OrthologyAssoc(graph, self.name, gid, oid)
                    assoc.add_source('PMID:24063302')
                    assoc.add_association_to_graph()
                    # todo get gene label for orthologs -
                    # this could get expensive
                    found_counter += 1

            # finish loop through annotated genes
        LOG.info("Made %d orthology relationships for %d genes", found_counter,
                 len(gene_ids))
Пример #28
0
    def _process_data(self, raw, limit=None):
        """
        This function will process the data files from Coriell.
        We make the assumption that any alleles listed are variants
        (alternates to w.t.)

        Triples: (examples)

        :NIGMSrepository a CLO_0000008 #repository
        label : NIGMS Human Genetic Cell Repository
        foaf:page https://catalog.coriell.org/0/sections/collections/NIGMS/?SsId=8

            line_id a CL_0000057,  #fibroblast line
                derives_from patient_id
                part_of :NIGMSrepository
                RO:model_of OMIM:disease_id

            patient id a foaf:person,
                label: "fibroblast from patient 12345 with disease X"
                member_of family_id  #what is the right thing here?
                SIO:race EFO:caucasian  #subclass of EFO:0001799
                in_taxon NCBITaxon:9606
                dc:description Literal(remark)
                RO:has_phenotype OMIM:disease_id
                GENO:has_genotype genotype_id

            family_id a owl:NamedIndividual
                foaf:page "https://catalog.coriell.org/0/Sections/BrowseCatalog/FamilyTypeSubDetail.aspx?PgId=402&fam=2104&coll=GM"

            genotype_id a intrinsic_genotype
                GENO:has_alternate_part allelic_variant_id
                we don't necessarily know much about the genotype,
                other than the allelic variant. also there's the sex here

            pub_id mentions cell_line_id

        :param raw:
        :param limit:
        :return:
        """
        logger.info("Processing Data from %s", raw)
        gu = GraphUtils(curie_map.get())

        if self.testMode:      # set the graph to build
            g = self.testgraph
        else:
            g = self.graph

        line_counter = 0
        geno = Genotype(g)
        du = DipperUtil()

        gu.loadProperties(g, geno.object_properties, gu.OBJPROP)
        gu.loadAllProperties(g)

        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            next(filereader, None)  # skip the header row
            for row in filereader:
                if not row:
                    pass
                else:
                    line_counter += 1

                    (catalog_id, description, omim_number, sample_type,
                     cell_line_available, dna_in_stock, dna_ref, gender, age,
                     race, ethnicity, affected, karyotype, relprob, mutation,
                     gene, family_id, collection, url, cat_remark, pubmed_ids,
                     family_member, variant_id, dbsnp_id, species) = row

                    # example:
                    # GM00003,HURLER SYNDROME,607014,Fibroblast,Yes,No,,Female,26 YR,Caucasian,,,,
                    # parent,,,39,NIGMS Human Genetic Cell Repository,
                    # http://ccr.coriell.org/Sections/Search/Sample_Detail.aspx?Ref=GM00003,
                    # 46;XX; clinically normal mother of a child with Hurler syndrome; proband not in Repository,,
                    # 2,,18343,H**o sapiens

                    if self.testMode and catalog_id not in self.test_lines:
                        # skip rows not in our test lines, when in test mode
                        continue

                    # ###########    BUILD REQUIRED VARIABLES    ###########

                    # Make the cell line ID
                    cell_line_id = 'Coriell:'+catalog_id.strip()

                    # Map the cell/sample type
                    cell_type = self._map_cell_type(sample_type)

                    # Make a cell line label
                    line_label = \
                        collection.partition(' ')[0]+'-'+catalog_id.strip()

                    # Map the repository/collection
                    repository = self._map_collection(collection)

                    # patients are uniquely identified by one of:
                    # dbsnp id (which is == an individual haplotype)
                    # family id + family member (if present) OR
                    # probands are usually family member zero
                    # cell line id
                    # since some patients have >1 cell line derived from them,
                    # we must make sure that the genotype is attached to
                    # the patient, and can be inferred to the cell line
                    # examples of repeated patients are:
                    #   famid=1159, member=1; fam=152,member=1

                    # Make the patient ID

                    # make an anonymous patient
                    patient_id = '_person'
                    if self.nobnodes:
                        patient_id = ':'+patient_id
                    if family_id != '':
                        patient_id = \
                            '-'.join((patient_id, family_id, family_member))
                    else:
                        # make an anonymous patient
                        patient_id = '-'.join((patient_id, catalog_id.strip()))

                    # properties of the individual patients:  sex, family id,
                    # member/relproband, description descriptions are
                    # really long and ugly SCREAMING text, so need to clean up
                    # the control cases are so odd with this labeling scheme;
                    # but we'll deal with it as-is for now.
                    short_desc = (description.split(';')[0]).capitalize()
                    if affected == 'Yes':
                        affected = 'affected'
                    elif affected == 'No':
                        affected = 'unaffected'
                    gender = gender.lower()
                    patient_label = ' '.join((affected, gender, relprob))
                    if relprob == 'proband':
                        patient_label = \
                            ' '.join(
                                (patient_label.strip(), 'with', short_desc))
                    else:
                        patient_label = \
                            ' '.join(
                                (patient_label.strip(), 'of proband with',
                                 short_desc))

                    # #############    BUILD THE CELL LINE    #############

                    # Adding the cell line as a typed individual.
                    cell_line_reagent_id = 'CLO:0000031'

                    gu.addIndividualToGraph(
                        g, cell_line_id, line_label, cell_line_reagent_id)

                    # add the equivalent id == dna_ref
                    if dna_ref != '' and dna_ref != catalog_id:
                        equiv_cell_line = 'Coriell:'+dna_ref
                        # some of the equivalent ids are not defined
                        # in the source data; so add them
                        gu.addIndividualToGraph(
                            g, equiv_cell_line, None, cell_line_reagent_id)
                        gu.addSameIndividual(g, cell_line_id, equiv_cell_line)

                    # Cell line derives from patient
                    geno.addDerivesFrom(cell_line_id, patient_id)
                    geno.addDerivesFrom(cell_line_id, cell_type)

                    # Cell line a member of repository
                    gu.addMember(g, repository, cell_line_id)

                    if cat_remark != '':
                        gu.addDescription(g, cell_line_id, cat_remark)

                    # Cell age_at_sampling
                    # TODO add the age nodes when modeled properly in #78
                    # if (age != ''):
                        # this would give a BNode that is an instance of Age.
                        # but i don't know how to connect
                        # the age node to the cell line? we need to ask @mbrush
                        # age_id = '_'+re.sub('\s+','_',age)
                        # gu.addIndividualToGraph(
                        #   g,age_id,age,self.terms['age'])
                        # gu.addTriple(
                        #   g,age_id,self.properties['has_measurement'],age,
                        #   True)

                    # #############    BUILD THE PATIENT    #############

                    # Add the patient ID as an individual.
                    gu.addPerson(g, patient_id, patient_label)
                    # TODO map relationship to proband as a class
                    # (what ontology?)

                    # Add race of patient
                    # FIXME: Adjust for subcategories based on ethnicity field
                    # EDIT: There are 743 different entries for ethnicity...
                    # Too many to map?
                    # Add ethnicity as literal in addition to the mapped race?
                    # Adjust the ethnicity txt (if using)
                    # to initial capitalization to remove ALLCAPS

                    # TODO race should go into the individual's background
                    # and abstracted out to the Genotype class punting for now.
                    # if race != '':
                    #    mapped_race = self._map_race(race)
                    #    if mapped_race is not None:
                    #        gu.addTriple(
                    #           g,patient_id,self.terms['race'],mapped_race)
                    #        gu.addSubclass(
                    #           g,self.terms['ethnic_group'],mapped_race)

                    # #############    BUILD THE FAMILY    #############

                    # Add triples for family_id, if present.
                    if family_id != '':
                        family_comp_id = 'CoriellFamily:'+family_id

                        family_label = \
                            ' '.join(('Family of proband with', short_desc))

                        # Add the family ID as a named individual
                        gu.addIndividualToGraph(
                            g, family_comp_id, family_label,
                            geno.genoparts['family'])

                        # Add the patient as a member of the family
                        gu.addMemberOf(g, patient_id, family_comp_id)

                    # #############    BUILD THE GENOTYPE   #############

                    # the important things to pay attention to here are:
                    # karyotype = chr rearrangements  (somatic?)
                    # mutation = protein-level mutation as a label,
                    # often from omim
                    # gene = gene symbol - TODO get id
                    # variant_id = omim variant ids (; delimited)
                    # dbsnp_id = snp individual ids = full genotype?

                    # note GM00633 is a good example of chromosomal variation
                    # - do we have enough to capture this?
                    # GM00325 has both abnormal karyotype and variation

                    # make an assumption that if the taxon is blank,
                    # that it is human!
                    if species is None or species == '':
                        species = 'H**o sapiens'
                    taxon = self._map_species(species)

                    # if there's a dbSNP id,
                    # this is actually the individual's genotype
                    genotype_id = None
                    genotype_label = None
                    if dbsnp_id != '':
                        genotype_id = 'dbSNPIndividual:'+dbsnp_id.strip()

                    omim_map = {}
                    gvc_id = None

                    # some of the karyotypes are encoded
                    # with terrible hidden codes. remove them here
                    # i've seen a <98> character
                    karyotype = du.remove_control_characters(karyotype)
                    karyotype_id = None
                    if karyotype.strip() != '':
                        karyotype_id = \
                            '_'+re.sub('MONARCH:', '', self.make_id(karyotype))
                        if self.nobnodes:
                            karyotype_id = ':'+karyotype_id
                        # add karyotype as karyotype_variation_complement
                        gu.addIndividualToGraph(
                            g, karyotype_id, karyotype,
                            geno.genoparts['karyotype_variation_complement'])
                        # TODO break down the karyotype into parts
                        # and map into GENO. depends on #77

                        # place the karyotype in a location(s).
                        karyo_chrs = \
                            self._get_affected_chromosomes_from_karyotype(
                                karyotype)
                        for c in karyo_chrs:
                            chr_id = makeChromID(c, taxon, 'CHR')
                            # add an anonymous sequence feature,
                            # each located on chr
                            karyotype_feature_id = '-'.join((karyotype_id, c))
                            karyotype_feature_label = \
                                'some karyotype alteration on chr'+str(c)
                            f = Feature(
                                karyotype_feature_id, karyotype_feature_label,
                                geno.genoparts['sequence_alteration'])
                            f.addFeatureStartLocation(None, chr_id)
                            f.addFeatureToGraph(g)
                            f.loadAllProperties(g)
                            geno.addParts(
                                karyotype_feature_id, karyotype_id,
                                geno.object_properties['has_alternate_part'])

                    if gene != '':
                        vl = gene+'('+mutation+')'

                    # fix the variant_id so it's always in the same order
                    vids = variant_id.split(';')
                    variant_id = ';'.join(sorted(list(set(vids))))

                    if karyotype.strip() != '' \
                            and not self._is_normal_karyotype(karyotype):
                        mutation = mutation.strip()
                        gvc_id = karyotype_id
                        if variant_id != '':
                            gvc_id = '_' + variant_id.replace(';', '-') + '-' \
                                    + re.sub(r'\w*:', '', karyotype_id)
                        if mutation.strip() != '':
                            gvc_label = '; '.join((vl, karyotype))
                        else:
                            gvc_label = karyotype
                    elif variant_id.strip() != '':
                        gvc_id = '_' + variant_id.replace(';', '-')
                        gvc_label = vl
                    else:
                        # wildtype?
                        pass

                    if gvc_id is not None and gvc_id != karyotype_id \
                            and self.nobnodes:
                        gvc_id = ':'+gvc_id

                    # add the karyotype to the gvc.
                    # use reference if normal karyotype
                    karyo_rel = geno.object_properties['has_alternate_part']
                    if self._is_normal_karyotype(karyotype):
                        karyo_rel = \
                            geno.object_properties['has_reference_part']
                    if karyotype_id is not None \
                            and not self._is_normal_karyotype(karyotype) \
                            and gvc_id is not None and karyotype_id != gvc_id:
                        geno.addParts(karyotype_id, gvc_id, karyo_rel)

                    if variant_id.strip() != '':
                        # split the variants & add them as part of the genotype
                        # we don't necessarily know their zygosity,
                        # just that they are part of the genotype variant ids
                        # are from OMIM, so prefix as such we assume that the
                        # sequence alts will be defined in OMIM not here
                        # TODO sort the variant_id list, if the omim prefix is
                        # the same, then assume it's the locus make a hashmap
                        # of the omim id to variant id list;
                        # then build the genotype hashmap is also useful for
                        # removing the "genes" from the list of "phenotypes"

                        # will hold gene/locus id to variant list
                        omim_map = {}

                        locus_num = None
                        for v in variant_id.split(';'):
                            # handle omim-style and odd var ids
                            # like 610661.p.R401X
                            m = re.match(r'(\d+)\.+(.*)', v.strip())
                            if m is not None and len(m.groups()) == 2:
                                (locus_num, var_num) = m.groups()

                            if locus_num is not None \
                                    and locus_num not in omim_map:
                                omim_map[locus_num] = [var_num]
                            else:
                                omim_map[locus_num] += [var_num]

                        for o in omim_map:
                            # gene_id = 'OMIM:' + o  # TODO unused
                            vslc_id = \
                                '_' + '-'.join(
                                    [o + '.' + a for a in omim_map.get(o)])
                            if self.nobnodes:
                                vslc_id = ':'+vslc_id
                            vslc_label = vl
                            # we don't really know the zygosity of
                            # the alleles at all.
                            # so the vslcs are just a pot of them
                            gu.addIndividualToGraph(
                                g, vslc_id, vslc_label,
                                geno.genoparts[
                                    'variant_single_locus_complement'])
                            for v in omim_map.get(o):
                                # this is actually a sequence alt
                                allele1_id = 'OMIM:'+o+'.'+v
                                geno.addSequenceAlteration(allele1_id, None)

                                # assume that the sa -> var_loc -> gene
                                # is taken care of in OMIM
                                geno.addPartsToVSLC(
                                    vslc_id, allele1_id, None,
                                    geno.zygosity['indeterminate'],
                                    geno.object_properties[
                                        'has_alternate_part'])

                            if vslc_id != gvc_id:
                                geno.addVSLCtoParent(vslc_id, gvc_id)

                    if affected == 'unaffected':
                        # let's just say that this person is wildtype
                        gu.addType(g, patient_id, geno.genoparts['wildtype'])
                    elif genotype_id is None:
                        # make an anonymous genotype id
                        genotype_id = '_geno'+catalog_id.strip()
                        if self.nobnodes:
                            genotype_id = ':'+genotype_id

                    # add the gvc
                    if gvc_id is not None:
                        gu.addIndividualToGraph(
                            g, gvc_id, gvc_label,
                            geno.genoparts['genomic_variation_complement'])

                        # add the gvc to the genotype
                        if genotype_id is not None:
                            if affected == 'unaffected':
                                rel = \
                                    geno.object_properties[
                                        'has_reference_part']
                            else:
                                rel = \
                                    geno.object_properties[
                                        'has_alternate_part']
                            geno.addParts(gvc_id, genotype_id, rel)
                        if karyotype_id is not None \
                                and self._is_normal_karyotype(karyotype):
                            if gvc_label is not None and gvc_label != '':
                                genotype_label = \
                                    '; '.join((gvc_label, karyotype))
                            else:
                                genotype_label = karyotype
                            if genotype_id is None:
                                genotype_id = karyotype_id
                            else:
                                geno.addParts(
                                    karyotype_id, genotype_id,
                                    geno.object_properties[
                                        'has_reference_part'])
                        else:
                            genotype_label = gvc_label
                            # use the catalog id as the background
                        genotype_label += ' ['+catalog_id.strip()+']'

                    if genotype_id is not None and gvc_id is not None:
                        # only add the genotype if it has some parts
                        geno.addGenotype(
                            genotype_id, genotype_label,
                            geno.genoparts['intrinsic_genotype'])
                        geno.addTaxon(taxon, genotype_id)
                        # add that the patient has the genotype
                        # TODO check if the genotype belongs to
                        # the cell line or to the patient
                        gu.addTriple(
                            g, patient_id,
                            geno.properties['has_genotype'], genotype_id)
                    else:
                        geno.addTaxon(taxon, patient_id)

                    # TODO: Add sex/gender  (as part of the karyotype?)

                    # #############    DEAL WITH THE DISEASES   #############

                    # we associate the disease to the patient
                    if affected == 'affected':
                        if omim_number != '':
                            for d in omim_number.split(';'):
                                if d is not None and d != '':
                                    # if the omim number is in omim_map,
                                    # then it is a gene not a pheno
                                    if d not in omim_map:
                                        disease_id = 'OMIM:'+d.strip()
                                        # assume the label is taken care of
                                        gu.addClassToGraph(g, disease_id, None)

                                        # add the association:
                                        #   the patient has the disease
                                        assoc = G2PAssoc(
                                            self.name, patient_id, disease_id)
                                        assoc.add_association_to_graph(g)

                                        # this line is a model of this disease
                                        # TODO abstract out model into
                                        # it's own association class?
                                        gu.addTriple(
                                            g, cell_line_id,
                                            gu.properties['model_of'],
                                            disease_id)
                                    else:
                                        logger.info(
                                            'removing %s from disease list ' +
                                            'since it is a gene', d)

                    # #############    ADD PUBLICATIONS   #############

                    if pubmed_ids != '':
                        for s in pubmed_ids.split(';'):
                            pubmed_id = 'PMID:'+s.strip()
                            ref = Reference(pubmed_id)
                            ref.setType(Reference.ref_types['journal_article'])
                            ref.addRefToGraph(g)
                            gu.addTriple(
                                g, pubmed_id, gu.properties['mentions'],
                                cell_line_id)

                    if not self.testMode \
                            and (limit is not None and line_counter > limit):
                        break

            Assoc(self.name).load_all_properties(g)

        return
Пример #29
0
    def _process_qtls_genomic_location(
            self, raw, src_key, txid, build_id, build_label, common_name, limit=None):
        """
        Add QTLs, their trait associations and their genomic locations
        from one gzip-compressed, tab-delimited GFF file.

        For every usable record the following is added to the graph:

        * the QTL as an individual of type ``QTL``, placed in the taxon;
        * the publication named by ``PUBMED_ID`` (when present);
        * a G2P association QTL -> trait using ``is marker for``, carrying
          the QTL-analysis evidence code, the publication as source and,
          when it parses as a number, the ``P-value`` as score;
        * the chromosome instance for the build, plus the QTL feature with
          fuzzy start and end positions on that chromosome.

        Blank lines and comment lines (starting with ``#``) are ignored.
        Rows with the wrong number of columns, and rows whose attributes
        lack ``QTL_ID`` or ``trait_ID``, are logged and skipped.

        :param raw: path of the gzipped GFF file to read
        :param src_key: key into ``self.files``; its ``columns`` entry lists
            the expected column names in file order
        :param txid: NCBI taxon number as a string (no curie prefix)
        :param build_id: build identifier handed to ``makeChromID`` and
            ``addChromosomeInstance``
        :param build_label: build label handed to ``addChromosomeInstance``
        :param common_name: prefix of the ``<common_name>QTL:`` curie
        :param limit: outside test mode, stop once the reader's line number
            exceeds this value; ``None`` processes the whole file
        :return: None
        """
        graph = self.testgraph if self.test_mode else self.graph
        model = Model(graph)
        geno = Genotype(graph)
        # assume that chrs get added to the genome elsewhere

        taxon_curie = 'NCBITaxon:' + txid
        eco_id = self.globaltt['quantitative trait analysis evidence']
        LOG.info("Processing QTL locations for %s from %s", taxon_curie, raw)
        with gzip.open(raw, 'rt', encoding='ISO-8859-1') as tsvfile:
            reader = csv.reader(tsvfile, delimiter="\t")
            # no header in GFF, so no header checking
            col = self.files[src_key]['columns']
            col_len = len(col)
            # column positions are the same for every row: resolve them once
            # (SOURCE, FEATURE, SCORE and FRAME are present but not used)
            seqname_idx = col.index('SEQNAME')
            start_idx = col.index('START')
            end_idx = col.index('END')
            strand_idx = col.index('STRAND')
            attr_idx = col.index('ATTRIBUTE')

            for row in reader:
                # csv yields [] for a blank line, and the first field may be
                # empty, so never index into it blindly
                if not row or row[0].startswith('#'):
                    # LOG.info(row)
                    continue

                # too few columns, or non-empty content beyond the expected
                # ones (empty trailing fields are tolerated)
                if len(row) < col_len or ''.join(row[col_len:]) != '':
                    LOG.warning(
                        "Problem parsing in %s row %s\n"
                        "got %s cols but expected %s",
                        raw, reader.line_num, len(row), col_len)
                    LOG.info(row)
                    continue

                chromosome = row[seqname_idx].strip()
                start_bp = row[start_idx].strip()
                stop_bp = row[end_idx].strip()
                strand = row[strand_idx].strip()
                attr = row[attr_idx].strip()

                # example record:
                # Chr.Z   Animal QTLdb    Production_QTL  33954873      34023581...
                # QTL_ID=2242;Name="Spleen percentage";Abbrev="SPLP";PUBMED_ID=17012160;trait_ID=2234;
                # trait="Spleen percentage";breed="leghorn";"FlankMarkers=ADL0022";VTO_name="spleen mass";
                # MO_name="spleen weight to body weight ratio";Map_Type="Linkage";Model="Mendelian";
                # Test_Base="Chromosome-wise";Significance="Significant";P-value="<0.05";F-Stat="5.52";
                # Variance="2.94";Dominance_Effect="-0.002";Additive_Effect="0.01

                # make dictionary of attributes
                # keys are:
                # QTL_ID,Name,Abbrev,PUBMED_ID,trait_ID,trait,FlankMarkers,
                # VTO_name,Map_Type,Significance,P-value,Model,
                # Test_Base,Variance, Bayes-value,PTO_name,gene_IDsrc,peak_cM,
                # CMO_name,gene_ID,F-Stat,LOD-score,Additive_Effect,
                # Dominance_Effect,Likelihood_Ratio,LS-means,Breed,
                # trait (duplicate with Name),Variance,Bayes-value,
                # F-Stat,LOD-score,Additive_Effect,Dominance_Effect,
                # Likelihood_Ratio,LS-means

                # deal with poorly formed attributes
                if re.search(r'"FlankMarkers";', attr):
                    attr = re.sub(r'FlankMarkers;', '', attr)
                # Items without '=' are malformed and dropped.  Split on the
                # first '=' only so a value containing '=' cannot break the
                # dict construction, and walk the items in file order so a
                # repeated key resolves the same way on every run
                # (last occurrence wins).
                attribute_dict = dict(
                    item.split('=', 1)
                    for item in attr.replace('"', '').split(';')
                    if '=' in item)

                qtl_num = attribute_dict.get('QTL_ID')
                trait_num = attribute_dict.get('trait_ID')
                if qtl_num is None or trait_num is None:
                    # without both ids neither the QTL nor its association
                    # can be identified
                    LOG.warning(
                        "Skipping row %s in %s: missing QTL_ID or trait_ID",
                        reader.line_num, raw)
                    continue
                if self.test_mode and int(qtl_num) not in self.test_ids:
                    continue
                # make association between QTL and trait based on taxon

                qtl_id = common_name + 'QTL:' + str(qtl_num)
                model.addIndividualToGraph(qtl_id, None, self.globaltt['QTL'])
                geno.addTaxon(taxon_curie, qtl_id)

                trait_id = 'AQTLTrait:' + trait_num

                # if pub is in attributes, add it to the association
                pub_id = attribute_dict.get('PUBMED_ID')
                if pub_id is not None:
                    if re.match(r'ISU.*', pub_id):
                        pub_id = 'AQTLPub:' + pub_id.strip()
                        reference = Reference(graph, pub_id)
                    else:
                        pub_id = 'PMID:' + pub_id.strip()
                        reference = Reference(
                            graph, pub_id, self.globaltt['journal article'])
                    reference.addRefToGraph()

                # Add QTL to graph
                assoc = G2PAssoc(
                    graph, self.name, qtl_id, trait_id,
                    self.globaltt['is marker for'])
                assoc.add_evidence(eco_id)
                assoc.add_source(pub_id)
                if 'P-value' in attribute_dict:
                    # values look like "<0.05"; a decimal comma is normalised.
                    # str.isnumeric() rejects any string containing '.', so
                    # let float() decide what counts as a number instead.
                    scr = attribute_dict['P-value'].replace(
                        '<', '').replace(',', '.')
                    try:
                        score = float(scr)
                    except ValueError:
                        score = None
                    if score is not None:
                        assoc.set_score(score)

                assoc.add_association_to_graph()
                # TODO make association to breed
                # (which means making QTL feature in Breed background)

                # get location of QTL
                chromosome = re.sub(r'Chr\.', '', chromosome)
                chrom_id = makeChromID(chromosome, taxon_curie, 'CHR')

                chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
                geno.addChromosomeInstance(
                    chromosome, build_id, build_label, chrom_id)
                qtl_feature = Feature(graph, qtl_id, None, self.globaltt['QTL'])
                if start_bp == '':
                    start_bp = None
                qtl_feature.addFeatureStartLocation(
                    start_bp, chrom_in_build_id, strand,
                    [self.globaltt['FuzzyPosition']])
                if stop_bp == '':
                    stop_bp = None
                qtl_feature.addFeatureEndLocation(
                    stop_bp, chrom_in_build_id, strand,
                    [self.globaltt['FuzzyPosition']])
                qtl_feature.addTaxonToFeature(taxon_curie)
                qtl_feature.addFeatureToGraph()

                if not self.test_mode and limit is not None and reader.line_num > limit:
                    break

        # LOG.warning("Bad attribute flags in this file")  # what does this even mean??
        LOG.info("Done with QTL genomic mappings for %s", taxon_curie)
Пример #30
0
    def add_orthologs_by_gene_group(self, graph, gene_ids):
        """
        Add orthology associations for the given genes, taken from the
        gene_group file produced by the NCBI annotation pipeline.

        Background on the pipeline:
        http://www.ncbi.nlm.nih.gov/news/03-13-2014-gene-provides-orthologs-regions/
        The method for associations is described in
        [PMCID:3882889](http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3882889/)
        == [PMID:24063302](http://www.ncbi.nlm.nih.gov/pubmed/24063302/).
        The groups only relate human and other vertebrate genomes, so very
        distant orthologies are missed and the result should not be
        considered complete.

        This is not run by the NCBI parser itself; it is a convenience
        function for other parsers to call.

        Only rows whose relationship is ``Ortholog`` are used.  For each
        requested gene, every member of every group the gene appears in
        (as the "other" gene of a row) becomes an ortholog: the member is
        added as a gene class with its taxon, and an orthology association
        sourced to PMID:24063302 is added between the two.

        :param graph: graph receiving the classes, taxa and associations
        :param gene_ids: gene curies (``NCBIGene:<number>``) to fetch the
            orthology for
        :return: None
        """
        src_key = 'gene_group'
        LOG.info("getting gene groups")
        src_file = '/'.join((self.rawdir, self.files[src_key]['file']))
        # because many of the orthologous groups are grouped by human gene,
        # a two-way hash is built: group lead => members, member => leads.
        # fastest approach, though not memory-efficient.
        geno = Genotype(graph)
        model = Model(graph)
        members_of_group = {}
        groups_of_gene = {}
        taxon_of_gene = {}
        col = self.files[src_key]['columns']
        wanted = (
            'tax_id', 'GeneID', 'relationship', 'Other_tax_id', 'Other_GeneID')

        with gzip.open(src_file, 'rb') as tsv:
            header = tsv.readline().decode().strip().split('\t')
            header[0] = header[0][1:]  # strip octothorp
            # the outcome of the header check is not acted upon here
            self.check_fileheader(col, header)

            for raw_line in tsv:
                fields = raw_line.decode().strip().split('\t')
                tax_a, gene_a, rel, tax_b, gene_b = [
                    fields[col.index(name)] for name in wanted]

                if rel != 'Ortholog':
                    continue

                members = members_of_group.setdefault(gene_a, set())
                members.add(gene_b)
                groups_of_gene.setdefault(gene_b, set()).add(gene_a)

                taxon_of_gene[gene_a] = tax_a
                taxon_of_gene[gene_b] = tax_b

                # the group lead is itself a member of its group
                members.add(gene_a)

        LOG.debug("Finished hashing gene groups")
        LOG.debug("Making orthology associations")
        found_counter = 0
        for gid in gene_ids:
            gene_num = re.sub(r'NCBIGene:', '', gid)
            # genes absent from the file simply contribute nothing
            for group_num in groups_of_gene.get(gene_num, ()):
                for orth in members_of_group.get(group_num, ()):
                    oid = 'NCBIGene:' + str(orth)
                    model.addClassToGraph(oid, None, self.globaltt['gene'])
                    otaxid = 'NCBITaxon:' + str(taxon_of_gene[orth])
                    geno.addTaxon(otaxid, oid)
                    assoc = OrthologyAssoc(graph, self.name, gid, oid)
                    assoc.add_source('PMID:24063302')
                    assoc.add_association_to_graph()
                    # todo get gene label for orthologs -
                    # this could get expensive
                    found_counter += 1

        LOG.info(
            "Made %d orthology relationships for %d genes",
            found_counter, len(gene_ids))
Пример #31
0
    def _process_phenotype_data(self, limit):
        """
        Parse the 'catalog' file and add strains, alleles, genes,
        allele-to-phenotype associations and per-strain genotypes to the graph.

        The file is read in two passes over the data:
        first every row is read, adding the strain individual, publication
        references and allele-to-phenotype associations, while the alleles and
        genes seen for each strain are collected in ``self.strain_hash``
        (labels in ``self.id_label_hash``); then one genotype is assembled
        per strain from what was collected.

        NOTE: If a Strain carries more than one mutation,
        then each Mutation description,
        i.e., the set: (
            Mutation Type - Chromosome - Gene Symbol -
            Gene Name - Allele Symbol - Allele Name)
        will require a separate line.

        Note that MMRRC curates phenotypes to alleles,
        even though they distribute only one file with the
        phenotypes appearing to be associated with a strain.

        So, here we process the allele-to-phenotype relationships separately
        from the strain-to-allele relationships.

        :param limit: when not None (and not in test mode), stop reading rows
            once the csv reader's line number exceeds this value
        :return: None

        """

        src_key = 'catalog'
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        fname = '/'.join((self.rawdir, self.files[src_key]['file']))

        self.strain_hash = {}
        self.id_label_hash = {}
        genes_with_no_ids = set()
        stem_cell_class = self.globaltt['stem cell']
        mouse_taxon = self.globaltt['Mus musculus']
        geno = Genotype(graph)
        with open(fname, 'r', encoding="utf8") as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            # First line is header not date/version info. This changed recently,
            # apparently as of Sep 2019. Also, 3rd line is no longer blank.
            row = [x.strip() for x in next(reader)]  # messy messy
            col = self.files[src_key]['columns']
            strain_missing_allele = []  # to count the ones w/insufficent info
            # The header check result is deliberately not acted upon here;
            # parsing continues regardless of what it returns.
            if not self.check_fileheader(col, row):
                pass

            for row in reader:
                strain_id = row[col.index('STRAIN/STOCK_ID')].strip()
                strain_label = row[col.index('STRAIN/STOCK_DESIGNATION')]
                # strain_type_symbol = row[col.index('STRAIN_TYPE')]
                strain_state = row[col.index('STATE')]
                mgi_allele_id = row[col.index(
                    'MGI_ALLELE_ACCESSION_ID')].strip()
                mgi_allele_symbol = row[col.index('ALLELE_SYMBOL')]
                # mgi_allele_name = row[col.index('ALLELE_NAME')]
                # mutation_type = row[col.index('MUTATION_TYPE')]
                # chrom = row[col.index('CHROMOSOME')]
                mgi_gene_id = row[col.index('MGI_GENE_ACCESSION_ID')].strip()
                mgi_gene_symbol = row[col.index('GENE_SYMBOL')].strip()
                mgi_gene_name = row[col.index('GENE_NAME')]
                # sds_url = row[col.index('SDS_URL')]
                # accepted_date = row[col.index('ACCEPTED_DATE')]
                mpt_ids = row[col.index('MPT_IDS')].strip()
                pubmed_nums = row[col.index('PUBMED_IDS')].strip()
                research_areas = row[col.index('RESEARCH_AREAS')].strip()

                # withdrawn genes are skipped in every mode; the test-id
                # filter only applies in test mode
                if (self.test_mode and strain_id not in self.test_ids) \
                        or mgi_gene_name == 'withdrawn':
                    continue

                # strip off stuff after the dash -
                # is the holding center important?
                # MMRRC:00001-UNC --> MMRRC:00001
                strain_id = re.sub(r'-\w+$', '', strain_id)

                self.id_label_hash[strain_id] = strain_label

                # get the variant or gene to save for later building of
                # the genotype
                if strain_id not in self.strain_hash:
                    self.strain_hash[strain_id] = {
                        'variants': set(),
                        'genes': set()
                    }

                # flag bad ones
                if mgi_allele_id[:4] != 'MGI:' and mgi_allele_id != '':
                    LOG.error("Erroneous MGI allele id: %s", mgi_allele_id)
                    if mgi_allele_id[:3] == 'MG:':
                        # repair the known 'MG:' typo, discard anything else
                        mgi_allele_id = 'MGI:' + mgi_allele_id[3:]
                    else:
                        mgi_allele_id = ''

                if mgi_allele_id != '':
                    self.strain_hash[strain_id]['variants'].add(mgi_allele_id)
                    self.id_label_hash[mgi_allele_id] = mgi_allele_symbol
                    # sequence alteration types (from MUTATION_TYPE) are
                    # intentionally not modelled here

                # scrub out any spaces, fix known issues
                mgi_gene_id = re.sub(r'\s+', '', mgi_gene_id)
                if mgi_gene_id == 'NULL':
                    mgi_gene_id = ''
                elif mgi_gene_id[:7] == 'GeneID:':
                    mgi_gene_id = 'NCBIGene:' + mgi_gene_id[7:]

                if mgi_gene_id != '':
                    try:
                        curie, _ = mgi_gene_id.split(':')
                    except ValueError as verror:
                        LOG.warning(
                            "Problem parsing mgi_gene_id %s from file %s: %s",
                            mgi_gene_id, fname, verror)
                        # make sure the prefix is defined (and not left over
                        # from a previous row) when the id has no colon or
                        # more than one
                        curie = None
                    if curie not in ['MGI', 'NCBIGene']:
                        LOG.info("MGI Gene id not recognized: %s", mgi_gene_id)
                    self.strain_hash[strain_id]['genes'].add(mgi_gene_id)
                    self.id_label_hash[mgi_gene_id] = mgi_gene_symbol

                # catch some errors - too many. report summary at the end
                # some things have gene labels, but no identifiers - report
                if mgi_gene_symbol != '' and mgi_gene_id == '':
                    genes_with_no_ids.add(mgi_gene_symbol)

                # split apart the mp ids
                # ataxia [MP:0001393] ,hypoactivity [MP:0001402] ...
                # mpt_ids are a comma delimited list
                # labels with MP terms following in brackets
                phenotype_ids = []
                if mpt_ids != '':
                    for lb_mp in mpt_ids.split(r','):
                        lb_mp = lb_mp.strip()
                        # the last 12 characters must be '[MP:' + 7 + ']'
                        if lb_mp[-1:] == ']' and lb_mp[-12:-8] == '[MP:':
                            # keep everything between the brackets; the end
                            # index is -1 so the final digit is not dropped
                            phenotype_ids.append(lb_mp[-11:-1])

                # pubmed ids are space delimited
                pubmed_ids = []
                if pubmed_nums != '':
                    for pm_num in re.split(r'\s+', pubmed_nums):
                        pmid = 'PMID:' + pm_num.strip()
                        pubmed_ids.append(pmid)
                        ref = Reference(graph, pmid,
                                        self.globaltt['journal article'])
                        ref.addRefToGraph()

                # https://www.mmrrc.org/catalog/sds.php?mmrrc_id=00001
                # is a good example of 4 genotype parts

                model.addClassToGraph(mouse_taxon, None)
                if research_areas == '':
                    research_areas = None
                else:
                    research_areas = 'Research Areas: ' + research_areas
                strain_type = mouse_taxon
                if strain_state == 'ES':
                    strain_type = stem_cell_class
                model.addIndividualToGraph(  # an inst of mouse??
                    strain_id, strain_label, strain_type, research_areas)
                model.makeLeader(strain_id)

                # phenotypes are associated with the alleles
                for pid in phenotype_ids:
                    # assume the phenotype label is in some ontology
                    model.addClassToGraph(pid, None)
                    if mgi_allele_id is not None and mgi_allele_id != '':
                        assoc = G2PAssoc(graph, self.name, mgi_allele_id, pid,
                                         self.globaltt['has phenotype'])
                        for p in pubmed_ids:
                            assoc.add_source(p)
                        assoc.add_association_to_graph()
                    else:
                        # too chatty to log each one. report aggregate
                        strain_missing_allele.append(strain_id)

                if not self.test_mode and (limit is not None
                                           and reader.line_num > limit):
                    break

            # report misses
            if strain_missing_allele:
                LOG.info("Phenotypes and no allele for %i strains",
                         len(strain_missing_allele))

            # now that we've collected all of the variant information, build it
            # we don't know their zygosities
            for s, h in self.strain_hash.items():
                variants = h['variants']
                genes = h['genes']
                vl_set = set()
                # make variant loci for each gene
                if variants:
                    for var in variants:
                        vl_id = var.strip()
                        vl_symbol = self.id_label_hash[vl_id]
                        geno.addAllele(vl_id, vl_symbol,
                                       self.globaltt['variant_locus'])
                        vl_set.add(vl_id)
                        # only tie the allele to a gene when the pairing is
                        # unambiguous (exactly one of each for the strain)
                        if len(variants) == 1 and len(genes) == 1:
                            for gene in genes:
                                geno.addAlleleOfGene(vl_id, gene)
                        else:
                            geno.addAllele(vl_id, vl_symbol)
                else:  # len(vars) == 0
                    # it's just anonymous variants in some gene
                    for gene in genes:
                        vl_id = '_:' + re.sub(r':', '', gene) + '-VL'
                        vl_symbol = self.id_label_hash[gene] + '<?>'
                        self.id_label_hash[vl_id] = vl_symbol
                        geno.addAllele(vl_id, vl_symbol,
                                       self.globaltt['variant_locus'])
                        geno.addGene(gene, self.id_label_hash[gene])
                        geno.addAlleleOfGene(vl_id, gene)
                        vl_set.add(vl_id)

                # make the vslcs
                vl_list = sorted(vl_set)
                vslc_list = []
                for vl in vl_list:
                    # for unknown zygosity
                    vslc_id = re.sub(r'^_', '', vl) + 'U'
                    vslc_id = re.sub(r':', '', vslc_id)
                    vslc_id = '_:' + vslc_id
                    vslc_label = self.id_label_hash[vl] + '/?'
                    self.id_label_hash[vslc_id] = vslc_label
                    vslc_list.append(vslc_id)
                    geno.addPartsToVSLC(vslc_id, vl, None,
                                        self.globaltt['indeterminate'],
                                        self.globaltt['has_variant_part'],
                                        None)
                    model.addIndividualToGraph(
                        vslc_id, vslc_label,
                        self.globaltt['variant single locus complement'])
                if vslc_list:
                    if len(vslc_list) > 1:
                        gvc_id = '-'.join(vslc_list)
                        gvc_id = re.sub(r'_|:', '', gvc_id)
                        gvc_id = '_:' + gvc_id
                        gvc_label = '; '.join(self.id_label_hash[v]
                                              for v in vslc_list)
                        model.addIndividualToGraph(
                            gvc_id, gvc_label,
                            self.globaltt['genomic_variation_complement'])
                        for vslc_id in vslc_list:
                            geno.addVSLCtoParent(vslc_id, gvc_id)
                    else:
                        # the GVC == VSLC, so don't have to make an extra piece
                        gvc_id = vslc_list.pop()
                        gvc_label = self.id_label_hash[gvc_id]

                    genotype_label = gvc_label + ' [n.s.]'
                    bkgd_id = re.sub(
                        r':', '', '-'.join(
                            (self.globaltt['unspecified_genomic_background'],
                             s)))
                    # the genotype id uses the background id before it gets
                    # its '_:' prefix
                    genotype_id = '-'.join((gvc_id, bkgd_id))
                    bkgd_id = '_:' + bkgd_id
                    geno.addTaxon(mouse_taxon, bkgd_id)
                    geno.addGenomicBackground(
                        bkgd_id, 'unspecified (' + s + ')',
                        self.globaltt['unspecified_genomic_background'],
                        "A placeholder for the unspecified genetic background for "
                        + s)
                    geno.addGenomicBackgroundToGenotype(
                        bkgd_id, genotype_id,
                        self.globaltt['unspecified_genomic_background'])
                    geno.addParts(gvc_id, genotype_id,
                                  self.globaltt['has_variant_part'])
                    geno.addGenotype(genotype_id, genotype_label)
                    graph.addTriple(s, self.globaltt['has_genotype'],
                                    genotype_id)
                # strains with neither alleles nor genes make no genotype

            LOG.warning(
                "The following gene symbols did not list identifiers: %s",
                sorted(genes_with_no_ids))
            LOG.error('%i symbols given are missing their gene identifiers',
                      len(genes_with_no_ids))
Пример #32
0
    def _process_QTLs_genomic_location(self, raw, taxon_id, build_id, build_label, limit=None):
        """
        Parse a gzipped, tab-delimited, GFF-style QTL file and add each QTL,
        its trait association and its genomic location to the graph.

        For every data row this adds:
          * the QTL as an individual (``AQTL:<QTL_ID>``) with its taxon
          * any publication reference (``PMID:`` or, for ids starting with
            'ISU', ``AQTLPub:``)
          * an association QTL -> trait (``AQTLTrait:<trait_ID>``) with the
            evidence code, the source and, when parseable, the P-value as score
          * a feature for the QTL with fuzzy start/end positions on the
            chromosome of the given build

        Rows starting with '#', blank rows and rows whose attribute column
        contains an item without '=' are skipped (the latter is logged).

        :param raw: path of the gzipped file to read
        :param taxon_id: taxon identifier used for the QTLs and chromosomes
        :param build_id: genome build identifier for the chromosome instances
        :param build_label: label passed along with build_id
        :param limit: when not None (and not in test mode), stop once more
            than this many lines have been read
        :return: None
        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        gu = GraphUtils(curie_map.get())
        line_counter = 0
        geno = Genotype(g)
        # assume that chrs get added to the genome elsewhere
        # NOTE(review): the returned id is not used below; the call is kept
        # in case makeGenomeID does more than build an id - confirm
        genome_id = geno.makeGenomeID(taxon_id)

        eco_id = "ECO:0000061"  # Quantitative Trait Analysis Evidence

        with gzip.open(raw, 'rt', encoding='ISO-8859-1') as tsvfile:
            reader = csv.reader(tsvfile, delimiter="\t")
            for row in reader:
                line_counter += 1
                # csv.reader yields [] for a blank line; skip those together
                # with '#' comment lines instead of failing on the unpack
                if not row or row[0].startswith('#'):
                    continue

                # columns: chromosome, source, type, start, stop,
                #          frame, strand, score, attributes
                (chromosome, _, _, start_bp, stop_bp, _, strand, _, attr) = row

                # Chr.Z   Animal QTLdb    Production_QTL  33954873        34023581        .       .       .
                # QTL_ID=2242;Name="Spleen percentage";Abbrev="SPLP";PUBMED_ID=17012160;trait_ID=2234;
                # trait="Spleen percentage";breed="leghorn";"FlankMarkers=ADL0022";VTO_name="spleen mass";
                # CMO_name="spleen weight to body weight ratio";Map_Type="Linkage";Model="Mendelian";
                # Test_Base="Chromosome-wise";Significance="Significant";P-value="<0.05";F-Stat="5.52";
                # Variance="2.94";Dominance_Effect="-0.002";Additive_Effect="0.01"

                # make dictionary of attributes
                # keys are:
                # QTL_ID,Name,Abbrev,PUBMED_ID,trait_ID,trait,
                # FlankMarkers,VTO_name,Map_Type,Significance,P-value,Model,Test_Base,Variance,
                # Bayes-value,PTO_name,gene_IDsrc,peak_cM,CMO_name,gene_ID,F-Stat,LOD-score,Additive_Effect,
                # Dominance_Effect,Likelihood_Ratio,LS-means,Breed,
                # trait (duplicate with Name),Variance,Bayes-value,
                # F-Stat,LOD-score,Additive_Effect,Dominance_Effect,Likelihood_Ratio,LS-means

                # deal with poorly formed attributes:
                # drop a value-less "FlankMarkers"; entry, then all quotes
                attr = attr.replace('"FlankMarkers";', '')
                attr_items = attr.replace('"', '').split(';')
                if any('=' not in item for item in attr_items):
                    logger.error("Poorly formed data on line %d:\n %s", line_counter, '\t'.join(row))
                    continue
                # split on the first '=' only, so that a value which itself
                # contains '=' does not break the dict construction
                attribute_dict = dict(item.split('=', 1) for item in attr_items)

                qtl_num = attribute_dict.get('QTL_ID')
                if self.testMode and int(qtl_num) not in self.test_ids:
                    continue

                # make association between QTL and trait
                qtl_id = 'AQTL:' + str(qtl_num)
                gu.addIndividualToGraph(g, qtl_id, None, geno.genoparts['QTL'])
                geno.addTaxon(taxon_id, qtl_id)

                trait_id = 'AQTLTrait:'+attribute_dict.get('trait_ID')

                # if pub is in attributes, add it to the association
                pub_id = None
                if 'PUBMED_ID' in attribute_dict:
                    pub_id = attribute_dict['PUBMED_ID']
                    if pub_id.startswith('ISU'):
                        pub_id = 'AQTLPub:' + pub_id.strip()
                        p = Reference(pub_id)
                    else:
                        pub_id = 'PMID:' + pub_id.strip()
                        p = Reference(pub_id, Reference.ref_types['journal_article'])
                    p.addRefToGraph(g)

                # Add QTL to graph
                assoc = G2PAssoc(self.name, qtl_id, trait_id, gu.object_properties['is_marker_for'])
                assoc.add_evidence(eco_id)
                # NOTE(review): pub_id is None when the row has no PUBMED_ID;
                # confirm add_source tolerates that
                assoc.add_source(pub_id)
                if 'P-value' in attribute_dict:
                    p_value = attribute_dict['P-value'].replace('<', '')
                    try:
                        score = float(p_value)
                    except ValueError:
                        # e.g. '>0.05' or free text: keep the association,
                        # just without a score, rather than abort the parse
                        logger.warning(
                            "Unparseable P-value '%s' for %s on line %d",
                            p_value, qtl_id, line_counter)
                    else:
                        assoc.set_score(score)

                assoc.add_association_to_graph(g)
                # TODO make association to breed (which means making QTL feature in Breed background)

                # get location of QTL
                chromosome = re.sub(r'Chr\.', '', chromosome)
                chrom_id = makeChromID(chromosome, taxon_id, 'CHR')

                chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
                geno.addChromosomeInstance(chromosome, build_id, build_label, chrom_id)
                qtl_feature = Feature(qtl_id, None, geno.genoparts['QTL'])
                # empty coordinates are passed on as None
                if start_bp == '':
                    start_bp = None
                qtl_feature.addFeatureStartLocation(start_bp, chrom_in_build_id, strand,
                                                    [Feature.types['FuzzyPosition']])
                if stop_bp == '':
                    stop_bp = None
                qtl_feature.addFeatureEndLocation(stop_bp, chrom_in_build_id, strand,
                                                  [Feature.types['FuzzyPosition']])
                qtl_feature.addTaxonToFeature(g, taxon_id)
                qtl_feature.addFeatureToGraph(g)

                if not self.testMode and limit is not None and line_counter > limit:
                    break

        logger.info("Done with QTL genomic mappings for %s", taxon_id)
Пример #33
0
    def _process_phenotype_data(self, limit):
        """
        NOTE: If a Strain carries more than one mutation,
        then each Mutation description,
        i.e., the set: (
            Mutation Type - Chromosome - Gene Symbol -
            Gene Name - Allele Symbol - Allele Name)
        will require a separate line.

        Note that MMRRC curates phenotypes to alleles,
        even though they distribute only one file with the
        phenotypes appearing to be associated with a strain.

        So, here we process the allele-to-phenotype relationships separately
        from the strain-to-allele relationships.

        :param limit:
        :return:

        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        line_counter = 0
        gu = GraphUtils(curie_map.get())
        fname = '/'.join((self.rawdir, self.files['catalog']['file']))

        self.strain_hash = {}
        self.id_label_hash = {}
        genes_with_no_ids = set()
        stem_cell_class = 'CL:0000034'
        mouse_taxon = 'NCBITaxon:10090'
        geno = Genotype(g)
        with open(fname, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            for row in filereader:
                line_counter += 1
                # skip the first 3 lines which are header, etc.
                if line_counter < 4:
                    continue

                (strain_id, strain_label, strain_type_symbol, strain_state,
                 mgi_allele_id, mgi_allele_symbol, mgi_allele_name,
                 mutation_type, chrom, mgi_gene_id, mgi_gene_symbol,
                 mgi_gene_name, sds_url, accepted_date, mp_ids, pubmed_nums,
                 research_areas) = row

                if self.testMode and (strain_id not in self.test_ids):
                    continue

                # strip off stuff after the dash -
                # is the holding center important?
                # MMRRC:00001-UNC --> MMRRC:00001
                strain_id = re.sub(r'-\w+$', '', strain_id)

                self.id_label_hash[strain_id] = strain_label

                # get the variant or gene to save for later building of
                # the genotype
                if strain_id not in self.strain_hash:
                    self.strain_hash[strain_id] = {'variants': set(),
                                                   'genes': set()}

                # clean up the bad one
                if mgi_allele_id == 'multiple mutation':
                    logger.error("Erroneous gene id: %s", mgi_allele_id)
                    mgi_allele_id = ''

                if mgi_allele_id != '':
                    self.strain_hash[strain_id]['variants'].add(mgi_allele_id)
                    self.id_label_hash[mgi_allele_id] = mgi_allele_symbol

                    # use the following if needing to add the
                    # sequence alteration types
                    # var_type =
                    #   self._get_variant_type_from_abbrev(mutation_type)
                    # make a sequence alteration for this variant locus,
                    # and link the variation type to it
                    # sa_id = '_'+re.sub(r':','',mgi_allele_id)+'SA'
                    # if self.nobnodes:
                    #     sa_id = ':'+sa_id
                    # gu.addIndividualToGraph(g, sa_id, None, var_type)
                    # geno.addSequenceAlterationToVariantLocus(sa_id,
                    #                                          mgi_allele_id)

                # scrub out any spaces
                mgi_gene_id = re.sub(r'\s+', '', mgi_gene_id)
                if mgi_gene_id.strip() != '':
                    if re.match(r'Gene\s*ID:', mgi_gene_id, re.I):
                        mgi_gene_id = re.sub(r'Gene\s*ID:\s*', 'NCBIGene:',
                                             mgi_gene_id)
                    elif not re.match(r'MGI', mgi_gene_id):
                        logger.info("Gene id not recognized: %s", mgi_gene_id)
                        if re.match(r'\d+$', mgi_gene_id):
                            # assume that if it's all numbers, then it's MGI
                            mgi_gene_id = 'MGI:'+str(mgi_gene_id)
                            logger.info("Assuming numerics are MGI.")
                    self.strain_hash[strain_id]['genes'].add(mgi_gene_id)
                    self.id_label_hash[mgi_gene_id] = mgi_gene_symbol

                # catch some errors -
                # some things have gene labels, but no identifiers - report
                if mgi_gene_symbol.strip() != '' and mgi_gene_id == '':
                    logger.error(
                        "Gene label with no identifier for strain %s: %s",
                        strain_id, mgi_gene_symbol)
                    genes_with_no_ids.add(mgi_gene_symbol.strip())
                    # make a temp id for genes that aren't identified
                    # tmp_gene_id = '_'+mgi_gene_symbol
                    # self.id_label_hash[tmp_gene_id] = mgi_gene_symbol
                    # self.strain_hash[strain_id]['genes'].add(tmp_gene_id)

                # split apart the mp ids
                # ataxia [MP:0001393] ,hypoactivity [MP:0001402] ...
                # mp_ids are now a comma delimited list
                # with MP terms in brackets
                phenotype_ids = []
                if mp_ids != '':
                    for i in re.split(r',', mp_ids):
                        i = i.strip()
                        mps = re.search(r'\[(.*)\]', i)
                        if mps is not None:
                            mp_id = mps.group(1).strip()
                            phenotype_ids.append(mp_id)

                # pubmed ids are space delimited
                pubmed_ids = []
                if pubmed_nums.strip() != '':
                    for i in re.split(r'\s+', pubmed_nums):
                        pmid = 'PMID:'+i.strip()
                        pubmed_ids.append(pmid)
                        r = Reference(pmid,
                                      Reference.ref_types['journal_article'])
                        r.addRefToGraph(g)

                # https://www.mmrrc.org/catalog/sds.php?mmrrc_id=00001
                # is a good example of 4 genotype parts

                gu.addClassToGraph(g, mouse_taxon, None)
                if research_areas.strip() == '':
                    research_areas = None
                else:
                    research_areas = 'Research Areas: '+research_areas
                strain_type = mouse_taxon
                if strain_state == 'ES':
                    strain_type = stem_cell_class
                gu.addIndividualToGraph(
                    g, strain_id, strain_label, strain_type,
                    research_areas)  # an inst of mouse??
                gu.makeLeader(g, strain_id)

                # phenotypes are associated with the alleles
                for pid in phenotype_ids:
                    # assume the phenotype label is in the ontology
                    gu.addClassToGraph(g, pid, None)
                    if mgi_allele_id is not None and mgi_allele_id != '':
                        assoc = G2PAssoc(self.name, mgi_allele_id, pid,
                                         gu.object_properties['has_phenotype'])
                        for p in pubmed_ids:
                            assoc.add_source(p)
                        assoc.add_association_to_graph(g)
                    else:
                        logger.info("Phenotypes and no allele for %s",
                                    strain_id)

                if not self.testMode and (
                        limit is not None and line_counter > limit):
                    break

            # now that we've collected all of the variant information, build it
            # we don't know their zygosities
            for s in self.strain_hash:
                h = self.strain_hash.get(s)
                variants = h['variants']
                genes = h['genes']
                vl_set = set()
                # make variant loci for each gene
                if len(variants) > 0:
                    for v in variants:
                        vl_id = v
                        vl_symbol = self.id_label_hash[vl_id]
                        geno.addAllele(vl_id, vl_symbol,
                                       geno.genoparts['variant_locus'])
                        vl_set.add(vl_id)
                        if len(variants) == 1 and len(genes) == 1:
                            for gene in genes:
                                geno.addAlleleOfGene(vl_id, gene)
                        else:
                            geno.addAllele(vl_id, vl_symbol)
                else:  # len(vars) == 0
                    # it's just anonymous variants in some gene
                    for gene in genes:
                        vl_id = '_'+gene+'-VL'
                        vl_id = re.sub(r':', '', vl_id)
                        if self.nobnodes:
                            vl_id = ':'+vl_id
                        vl_symbol = self.id_label_hash[gene]+'<?>'
                        self.id_label_hash[vl_id] = vl_symbol
                        geno.addAllele(vl_id, vl_symbol,
                                       geno.genoparts['variant_locus'])
                        geno.addGene(gene, self.id_label_hash[gene])
                        geno.addAlleleOfGene(vl_id, gene)
                        vl_set.add(vl_id)

                # make the vslcs
                vl_list = sorted(vl_set)
                vslc_list = []
                for vl in vl_list:
                    # for unknown zygosity
                    vslc_id = '_'+re.sub(r'^_', '', vl)+'U'
                    vslc_id = re.sub(r':', '', vslc_id)
                    if self.nobnodes:
                        vslc_id = ':' + vslc_id
                    vslc_label = self.id_label_hash[vl] + '/?'
                    self.id_label_hash[vslc_id] = vslc_label
                    vslc_list.append(vslc_id)
                    geno.addPartsToVSLC(
                        vslc_id, vl, None, geno.zygosity['indeterminate'],
                        geno.object_properties['has_alternate_part'], None)
                    gu.addIndividualToGraph(
                        g, vslc_id, vslc_label,
                        geno.genoparts['variant_single_locus_complement'])
                if len(vslc_list) > 0:
                    if len(vslc_list) > 1:
                        gvc_id = '-'.join(vslc_list)
                        gvc_id = re.sub(r':', '', gvc_id)
                        if self.nobnodes:
                            gvc_id = ':'+gvc_id
                        gvc_label = \
                            '; '.join(self.id_label_hash[v] for v in vslc_list)
                        gu.addIndividualToGraph(
                            g, gvc_id, gvc_label,
                            geno.genoparts['genomic_variation_complement'])
                        for vslc_id in vslc_list:
                            geno.addVSLCtoParent(vslc_id, gvc_id)
                    else:
                        # the GVC == VSLC, so don't have to make an extra piece
                        gvc_id = vslc_list.pop()
                        gvc_label = self.id_label_hash[gvc_id]

                    genotype_label = gvc_label + ' [n.s.]'
                    bkgd_id = \
                        '_' + re.sub(r':', '', '-'.join(
                            (geno.genoparts['unspecified_genomic_background'],
                             s)))
                    genotype_id = '-'.join((gvc_id, bkgd_id))
                    if self.nobnodes:
                        bkgd_id = ':'+bkgd_id
                    geno.addTaxon(mouse_taxon, bkgd_id)
                    geno.addGenomicBackground(
                        bkgd_id, 'unspecified ('+s+')',
                        geno.genoparts['unspecified_genomic_background'],
                        "A placeholder for the " +
                        "unspecified genetic background for "+s)
                    geno.addGenomicBackgroundToGenotype(
                        bkgd_id, genotype_id,
                        geno.genoparts['unspecified_genomic_background'])
                    geno.addParts(
                        gvc_id, genotype_id,
                        geno.object_properties['has_alternate_part'])
                    geno.addGenotype(genotype_id, genotype_label)
                    gu.addTriple(
                        g, s, geno.object_properties['has_genotype'],
                        genotype_id)
                else:
                    # logger.debug(
                    #   "Strain %s is not making a proper genotype.", s)
                    pass

            gu.loadProperties(
                g, G2PAssoc.object_properties, G2PAssoc.OBJECTPROP)
            gu.loadProperties(
                g, G2PAssoc.datatype_properties, G2PAssoc.DATAPROP)
            gu.loadProperties(
                g, G2PAssoc.annotation_properties, G2PAssoc.ANNOTPROP)
            gu.loadAllProperties(g)

            logger.warning(
                "The following gene symbols did not list identifiers: %s",
                str(sorted(list(genes_with_no_ids))))

        return
Пример #34
0
    def _process_genes(self, limit=None):
        """
        Read the HGNC gene table and add each gene record to the graph.

        For every data row this:
        * marks the id as deprecated when the locus type is 'withdrawn',
          skips entries whose symbol ends in '@', and otherwise adds the
          gene as a class when its locus type resolves to a known type
        * adds equivalences to the NCBIGene, ENSEMBL and gene-typed OMIM ids
        * calls ``geno.addTaxon`` with ``self.hs_txid`` for the gene
        * links each listed PubMed id to the gene with 'is_about'
        * makes the gene a subsequence of its chromosome band, or of the
          chromosome itself when no band is given

        :param limit: when not in test mode, stop once the csv reader's
            line number exceeds this value; None reads the whole file
        :return: None
        """
        graph = self.testgraph if self.test_mode else self.graph

        geno = Genotype(graph)
        model = Model(graph)

        raw = '/'.join((self.rawdir, self.files['genes']['file']))
        col = self.files['genes']['columns']
        LOG.info("Processing HGNC genes")

        # A chromosome name has to be followed by an arm (p/q) or be the
        # whole location.  Inside a character class '$' is only a literal
        # dollar sign, so the end-of-string case is spelled out explicitly;
        # without it the chromosome-only branch below can never be reached.
        chr_pattern = re.compile(r'(\d+|X|Y|Z|W|MT)(?:[pq]|$)')
        band_pattern = re.compile(r'([pq][A-H\d]?\d?(?:\.\d+)?)')

        # Resolve the positions of the columns we use once, not per row.
        # All other columns of the file are ignored.
        # To list every column name of a download:
        # head -1 hgnc_complete_set.txt | tr '\t' '\n'
        hgnc_id_col = col.index('hgnc_id')
        symbol_col = col.index('symbol')
        name_col = col.index('name')
        locus_type_col = col.index('locus_type')
        location_col = col.index('location')
        entrez_id_col = col.index('entrez_id')
        ensembl_gene_id_col = col.index('ensembl_gene_id')
        pubmed_id_col = col.index('pubmed_id')
        omim_id_col = col.index('omim_id')

        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')

            header = next(filereader, None)
            if header is None:
                LOG.error("No header row found in %s", raw)
                return
            # the header is compared against the expected columns;
            # the outcome is not treated as fatal here
            self.check_fileheader(col, header)

            for row in filereader:
                hgnc_id = row[hgnc_id_col].strip()
                symbol = row[symbol_col].strip()
                name = row[name_col].strip()
                locus_type = row[locus_type_col].strip()
                location = row[location_col].strip()
                entrez_id = row[entrez_id_col].strip()
                ensembl_gene_id = row[ensembl_gene_id_col].strip()
                pubmed_ids = row[pubmed_id_col].strip()  # pipe separated!
                omim_ids = row[omim_id_col].strip()  # pipe separated!

                if self.test_mode and entrez_id != '' and \
                        entrez_id not in self.gene_ids:
                    continue

                if name == '':
                    name = None

                if locus_type == 'withdrawn':
                    model.addDeprecatedClass(hgnc_id)
                elif symbol.endswith('@'):
                    # region (HOX), RNA cluster, gene (PCDH)
                    # (endswith also copes with an empty symbol)
                    continue
                else:
                    gene_type_id = self.resolve(locus_type, mandatory=False)
                    # resolve() handing back its input means "no mapping"
                    if gene_type_id != locus_type:
                        model.addClassToGraph(
                            hgnc_id, symbol, gene_type_id, name)
                    model.makeLeader(hgnc_id)

                if entrez_id != '':
                    model.addEquivalentClass(hgnc_id, 'NCBIGene:' + entrez_id)

                if ensembl_gene_id != '':
                    model.addEquivalentClass(
                        hgnc_id, 'ENSEMBL:' + ensembl_gene_id)

                for omim_id in omim_ids.split('|'):
                    if omim_id in self.omim_replaced:
                        repl = self.omim_replaced[omim_id]
                        LOG.warning('%s is replaced with %s', omim_id, repl)
                        for omim in repl:
                            # a replacement id may have no known type
                            if omim in self.omim_type and \
                                    self.omim_type[omim] == \
                                    self.globaltt['gene']:
                                omim_id = omim

                    if omim_id in self.omim_type and \
                            self.omim_type[omim_id] == self.globaltt['gene']:
                        model.addEquivalentClass(hgnc_id, 'OMIM:' + omim_id)

                geno.addTaxon(self.hs_txid, hgnc_id)

                # add pubs as "is about"
                for pubmed_id in pubmed_ids.split('|'):
                    pubmed_id = pubmed_id.strip()
                    # an empty cell splits to [''], which must not
                    # produce a bare 'PMID:' node
                    if pubmed_id == '':
                        continue
                    graph.addTriple('PMID:' + pubmed_id,
                                    self.globaltt['is_about'], hgnc_id)

                # add chr location
                # sometimes two are listed, like: 10p11.2 or 17q25
                # -- there are only 2 of these FRA10A and MPFD
                # sometimes listed like "1 not on reference assembly"
                # sometimes listed like 10q24.1-q24.3
                # sometimes like 11q11 alternate reference locus
                chr_match = chr_pattern.match(location)
                if chr_match is not None:
                    chrom = chr_match.group(1)
                    chrom_id = makeChromID(chrom, self.hs_txid, 'CHR')
                    band_match = band_pattern.search(location)
                    feat = Feature(graph, hgnc_id, None, None)
                    if band_match is not None:
                        band = chrom + band_match.group(1)
                        # add the chr band as the parent to this gene
                        # as a feature but assume that the band is created
                        # as a class with properties elsewhere in Monochrom
                        band_id = makeChromID(band, self.hs_txid, 'CHR')
                        model.addClassToGraph(band_id, None)
                        feat.addSubsequenceOfFeature(band_id)
                    else:
                        model.addClassToGraph(chrom_id, None)
                        feat.addSubsequenceOfFeature(chrom_id)

                if not self.test_mode and limit is not None and \
                        filereader.line_num > limit:
                    break
Пример #35
0
    def _get_gene_info(self, limit):
        """
        Loop through the gzipped gene_info file and add the genes as classes.

        For each gene of a taxon in ``self.tax_ids`` this adds the class
        (typed via ``_map_type_of_gene``) with its symbol as label, the name,
        synonyms and other designations as synonyms, the cross references as
        equivalent classes (HPRD ids are attached with 'has_gene_product'
        instead; Vega and IMGT/GENE-DB ids are skipped), the chromosome or
        chromosome band the gene is a subsequence of, and the taxon.

        :param limit: when not in test mode, stop after more than this many
            genes have been processed; None processes the whole file
        :return: None
        """
        gu = GraphUtils(curie_map.get())

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        geno = Genotype(g)

        # not unzipping the file
        logger.info("Processing Gene records")
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files['gene_info']['file']))
        logger.info("FILE: %s", myfile)

        # Add taxa and genome classes for those in our filter
        for tax_num in self.tax_ids:
            tax_id = ':'.join(('NCBITaxon', str(tax_num)))
            # tax label can get added elsewhere
            geno.addGenome(tax_id, str(tax_num))
            gu.addClassToGraph(g, tax_id, None)   # label added elsewhere
        with gzip.open(myfile, 'rb') as f:
            for line in f:
                line = line.decode().strip()
                # skip comments
                if line.startswith('#'):
                    continue

                fields = line.split('\t')
                if len(fields) < 15:
                    # blank or truncated line: report it and move on
                    # instead of failing on the unpack below
                    logger.warning(
                        "Skipping line with %d columns (expected 15): %s",
                        len(fields), line)
                    continue
                # only the first 15 columns are used;
                # any extra trailing columns are ignored
                (tax_num, gene_num, symbol, locustag,
                 synonyms, xrefs, chrom, map_loc, desc,
                 gtype, authority_symbol, name,
                 nomenclature_status, other_designations,
                 modification_date) = fields[:15]

                if self.testMode and int(gene_num) not in self.gene_ids:
                    continue

                if int(tax_num) not in self.tax_ids:
                    continue

                line_counter += 1

                gene_id = ':'.join(('NCBIGene', gene_num))
                tax_id = ':'.join(('NCBITaxon', tax_num))
                gene_type_id = self._map_type_of_gene(gtype)

                label = None if symbol == 'NEWENTRY' else symbol

                # TODO might have to figure out if things aren't genes,
                # and make them individuals
                gu.addClassToGraph(g, gene_id, label, gene_type_id, desc)

                if name != '-':
                    gu.addSynonym(g, gene_id, name)
                if synonyms.strip() != '-':
                    for s in synonyms.split('|'):
                        gu.addSynonym(
                            g, gene_id, s.strip(),
                            Assoc.annotation_properties['hasRelatedSynonym'])
                if other_designations.strip() != '-':
                    for s in other_designations.split('|'):
                        gu.addSynonym(
                            g, gene_id, s.strip(),
                            Assoc.annotation_properties['hasRelatedSynonym'])

                # deal with the xrefs, which look like
                # MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696
                if xrefs.strip() != '-':
                    for r in xrefs.strip().split('|'):
                        fixedr = self._cleanup_id(r)
                        if fixedr is None or fixedr.strip() == '':
                            continue
                        if fixedr.startswith('HPRD'):
                            # proteins are not == genes.
                            gu.addTriple(
                                g, gene_id,
                                self.properties['has_gene_product'], fixedr)
                        elif fixedr.split(':')[0] not in \
                                ['Vega', 'IMGT/GENE-DB']:
                            # skip some of these for now
                            gu.addEquivalentClass(g, gene_id, fixedr)

                # FIXME remove the chr mapping below
                # when we pull in the genomic coords
                self._add_gene_chr_location(
                    g, gu, geno, gene_id, tax_id, tax_num, chrom, map_loc)

                # the taxon is recorded for every gene that was added,
                # including those whose chromosome placement was skipped
                geno.addTaxon(tax_id, gene_id)

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

            gu.loadProperties(g, Feature.object_properties, gu.OBJPROP)
            gu.loadProperties(g, Feature.data_properties, gu.DATAPROP)
            gu.loadProperties(g, Genotype.object_properties, gu.OBJPROP)
            gu.loadAllProperties(g)

        return

    def _add_gene_chr_location(
            self, g, gu, geno, gene_id, tax_id, tax_num, chrom, map_loc):
        """
        Make the gene a subsequence of its chromosome band or chromosome.

        Nothing is added when no chromosome is given, or when more than one
        chromosome is listed (other than the X|Y pairing), because the
        placement is then considered unreliable.

        Edge cases of id | symbol | chr | map_loc seen in the file:
        263     AMD1P2    X|Y  with   Xq28 and Yq12
        438     ASMT      X|Y  with   Xp22.3 or Yp11.3    # in PAR
        419     ART3      4    with   4q21.1|4p15.1-p14
        28227   PPP2R3B   X|Y  Xp22.33; Yp11.3            # in PAR
        619538  OMS     10|19|3 10q26.3;19q13.42-q13.43;3p25.3
        101928066       LOC101928066    1|Un    -   # unlocated scaffold
        11435   Chrna1  2       2 C3|2 43.76 cM           # mouse
        323212  wu:fb92e12      19|20   -                 # fish

        :param g: the graph to add to
        :param gu: the GraphUtils instance used to write triples
        :param geno: the Genotype helper bound to the graph
        :param gene_id: curie of the gene
        :param tax_id: curie of the gene's taxon
        :param tax_num: the bare taxon number, as a string
        :param chrom: content of the chromosome column
        :param map_loc: content of the map_location column
        :return: None
        """
        if chrom in ('-', ''):
            return

        if '|' in chrom and chrom not in ('X|Y', 'X; Y'):
            # this means that there's uncertainty in the mapping.
            # TODO we'll need to figure out how to deal with >1 loc mapping
            logger.info(
                '%s is non-uniquely mapped to %s.  Skipping for now.',
                gene_id, chrom)
            return

        if chrom == 'X; Y':
            chrom = 'X|Y'  # rewrite the PAR regions for processing

        # the band test does not depend on the chromosome, so run it once
        # TODO we probably need a different regex per organism
        band_match = re.match(r'[0-9A-Z]+[pq](\d+)?(\.\d+)?$', map_loc)

        # do this in a loop to allow PAR regions like X|Y
        for c in chrom.split('|'):
            # assume that the chromosome label will get added elsewhere
            geno.addChromosomeClass(c, tax_id, None)
            mychrom = makeChromID(c, tax_num, 'CHR')
            # temporarily use the taxnum for the disambiguating label
            mychrom_syn = makeChromLabel(c, tax_num)
            gu.addSynonym(g, mychrom, mychrom_syn)
            if band_match is not None:
                # map_loc already starts with the chromosome; strip it
                # (plain prefix test, so the name is never read as a regex)
                if map_loc.startswith(c):
                    bid = map_loc[len(c):]
                else:
                    bid = map_loc
                # the generic location (no coordinates)
                maploc_id = makeChromID(c+bid, tax_num, 'CHR')
                # Assume its type will be added elsewhere
                band = Feature(maploc_id, None, None)
                band.addFeatureToGraph(g)
                # add the band as the containing feature
                gu.addTriple(
                    g, gene_id,
                    Feature.object_properties['is_subsequence_of'],
                    maploc_id)
            else:
                # TODO handle these cases
                # examples are: 15q11-q22, Xp21.2-p11.23, 15q22-qter,
                # 10q11.1-q24, 12p13.3-p13.2|12p13-p12,
                # 1p13.3|1p21.3-p13.1,  12cen-q21, 22q13.3|22q13.3
                logger.debug(
                    'not regular band pattern for %s: %s', gene_id, map_loc)
                # add the gene as a subsequence of the chromosome
                gu.addTriple(
                    g, gene_id,
                    Feature.object_properties['is_subsequence_of'],
                    mychrom)
Пример #36
0
class OMIA(Source):
    """
    This is the parser for the
    [Online Mendelian Inheritance in Animals
    (OMIA)](http://omia.angis.org.au),
    from which we process inherited disorders, other (single-locus) traits,
    and genes in >200 animal species (other than human and mouse and rats).

    We generate the omia graph to include the following information:
    * genes
    * animal taxonomy, and breeds as instances of those taxa
        (breeds are akin to "strains" in other taxa)
    * animal diseases, along with species-specific subtypes of those diseases
    * publications (and their mapping to PMIDs, if available)
    * gene-to-phenotype associations (via an anonymous variant-locus)
    * breed-to-phenotype associations

    We make links between OMIA and OMIM in two ways:
    1.  mappings between OMIA and OMIM are created as OMIA --> hasdbXref OMIM
    2.  mappings between a breed and OMIA disease are created
        to be a model for the mapped OMIM disease,
        IF AND ONLY IF it is a 1:1 mapping.
        there are some 1:many mappings,
        and these often happen if the OMIM item is a gene.

    Because many of these species are not covered in
    the PANTHER orthology datafiles, we also pull any orthology
    relationships from the gene_group files from NCBI.

    """

    files = {
        'data': {
            'file': 'omia.xml.gz',
            'url': 'http://omia.angis.org.au/dumps/omia.xml.gz'},
    }

    def __init__(self):
        """
        Register the source under the name 'omia' and create the empty
        lookup tables, the test-id lists and the graph helpers that the
        parsing methods fill in and read from.
        """
        Source.__init__(self, 'omia')

        self.load_bindings()

        self.dataset = Dataset(
            'omia', 'Online Mendelian Inheritance in Animals',
            'http://omia.angis.org.au', None, None,
            'http://sydney.edu.au/disclaimer.shtml')

        # one internal-key -> external-id table per kind of record
        self.id_hash = {
            kind: {}
            for kind in ('article', 'phene', 'breed', 'taxon', 'gene')}
        self.label_hash = dict()
        self.gu = GraphUtils(curie_map.get())
        # omia -> omim phene mappings
        self.omia_omim_map = dict()
        # the unique genes that have phenes (for fetching orthology)
        self.annotated_genes = set()

        test_diseases = [
            'OMIA:001702', 'OMIA:001867', 'OMIA:000478', 'OMIA:000201',
            'OMIA:000810', 'OMIA:001400']
        test_genes = [
            492297, 434, 492296, 3430235, 200685834, 394659996, 200685845,
            28713538, 291822383]
        test_taxa = [9691, 9685, 9606, 9615, 9913, 93934, 37029, 9627, 9825]
        self.test_ids = {
            'disease': test_diseases,
            'gene': test_genes,
            'taxon': test_taxa,
            # filled in while the breed table is parsed,
            # for lookup by the breed associations
            'breed': [],
        }

        # omia id -> molecular info, kept to write a report for curation
        self.stored_omia_mol_gen = dict()
        self.g = self.graph
        self.geno = Genotype(self.g)

    def fetch(self, is_dl_forced=False):
        """
        Download the files of this source, plus the NCBI gene_group file.

        :param is_dl_forced: passed through to ``get_files``
        :return: None
        """
        self.get_files(is_dl_forced)

        # only the gene_group file of the NCBI source is needed,
        # so it is fetched directly rather than via ncbi.fetch()
        ncbi_source = NCBIGene()
        group_entry = ncbi_source.files['gene_group']
        source_url = group_entry['url']
        target_path = '/'.join((ncbi_source.rawdir, group_entry['file']))
        self.fetch_from_url(source_url, target_path, False)

    def parse(self, limit=None):
        """
        Scrub the raw file, run the three parsing passes over it, add the
        orthologs of the annotated genes and write the molgen report.

        Names of tables available to iterate - probably don't need all these:
        Article_Breed, Article_Keyword, Article_Gene, Article_Keyword,
        Article_People, Article_Phene, Articles, Breed, Breed_Phene,
        Genes_gb, Group_Categories, Group_MPO, Inherit_Type, Keywords,
        Landmark, Lida_Links, OMIA_Group, OMIA_author, Omim_Xref, People,
        Phene, Phene_Gene, Publishers, Resources, Species_gb, Synonyms

        :param limit: row limit handed to each of the parsing passes
        :return: None
        """
        self.scrub()

        if limit is not None:
            logger.info("Only parsing first %d rows", limit)

        logger.info("Parsing files...")

        if self.testOnly:
            self.testMode = True

        self.g = self.testgraph if self.testMode else self.graph
        self.geno = Genotype(self.g)

        # three passes through the file, in this order:
        # species first (the two others reference it), then the breeds,
        # genes, articles and other static stuff, then the associations
        passes = (
            self.process_species,
            self.process_classes,
            self.process_associations,
        )
        for run_pass in passes:
            run_pass(limit)

        # vertebrate orthology for the genes annotated with phenotypes
        ortholog_source = NCBIGene()
        ortholog_source.add_orthologs_by_gene_group(
            self.g, self.annotated_genes)

        self.load_core_bindings()
        self.load_bindings()

        logger.info("Done parsing.")

        self.write_molgen_report()

    def scrub(self):
        """
        The XML file seems to have mixed-encoding;
        we scrub out the control characters
        from the file for processing.

        Every line of the gzipped data file is passed through
        ``DipperUtil.remove_control_characters``, terminated with a newline
        and written UTF-8 encoded to a temporary gzip file, which then
        replaces the original.

        NOTE(review): the input is decoded with the platform's default
        encoding because none is given to TextIOWrapper - confirm that
        this is intended.

        :return: None
        """

        logger.info(
            "Scrubbing out the nasty characters that break our parser.")

        myfile = '/'.join((self.rawdir, self.files['data']['file']))
        tmpfile = '/'.join((self.rawdir, self.files['data']['file']+'.tmp.gz'))
        du = DipperUtil()
        # both handles are managed so that neither leaks when reading or
        # cleaning a line fails; the input is opened first so that no
        # temporary file is created when it is missing
        with gzip.open(myfile, 'rb') as infile, \
                gzip.open(tmpfile, 'wb') as outfile:
            filereader = io.TextIOWrapper(infile, newline="")
            for line in filereader:
                cleaned = du.remove_control_characters(line) + '\n'
                outfile.write(cleaned.encode('utf-8'))

        # move the temp file
        logger.info("Replacing the original data with the scrubbed file.")
        shutil.move(tmpfile, myfile)
        return

    # ###################### XML LOOPING FUNCTIONS ##################

    def process_species(self, limit):
        """
        Loop through the xml file and process the species.
        Every parsed element is offered to ``process_xml_table`` for the
        'Species_gb' table, with ``_process_species_table_row`` as the
        row handler.

        :param limit: passed through to ``process_xml_table``
        :return: None
        """

        myfile = '/'.join((self.rawdir, self.files['data']['file']))

        # the context manager closes the gzip handle even if parsing raises
        with gzip.open(myfile, 'rb') as f:
            filereader = io.TextIOWrapper(f, newline="")

            filereader.readline()  # remove the xml declaration line

            for _event, elem in ET.iterparse(filereader):
                # Species ids are == genbank species ids!
                self.process_xml_table(
                    elem, 'Species_gb', self._process_species_table_row,
                    limit)

        return

    def process_classes(self, limit):
        """
        Loop through the xml file and process the articles,
        breed, genes, phenes, and phenotype-grouping classes.
        Every parsed element is offered to ``process_xml_table`` once per
        table, each with its own row handler; afterwards
        ``clean_up_omim_genes`` post-processes the omia-omim mappings.

        :param limit: passed through to ``process_xml_table``
        :return: None

        """

        myfile = '/'.join((self.rawdir, self.files['data']['file']))

        # table name -> row handler, in the order they are tried
        table_handlers = (
            ('Articles', self._process_article_row),
            ('Breed', self._process_breed_row),
            ('Genes_gb', self._process_gene_row),
            ('OMIA_Group', self._process_omia_group_row),
            ('Phene', self._process_phene_row),
            ('Omim_Xref', self._process_omia_omim_map),
        )

        # the context manager closes the gzip handle even if parsing raises
        with gzip.open(myfile, 'rb') as f:
            filereader = io.TextIOWrapper(f, newline="")

            filereader.readline()  # remove the xml declaration line

            parser = ET.XMLParser(encoding='utf-8')

            for _event, elem in ET.iterparse(filereader, parser=parser):
                for table_name, handler in table_handlers:
                    self.process_xml_table(elem, table_name, handler, limit)

        # post-process the omia-omim associations to filter out the genes
        # (keep only phenotypes/diseases)
        self.clean_up_omim_genes()

        return

    def process_associations(self, limit):
        """
        Loop through the xml file and process the article-breed, article-phene,
        breed-phene, phene-gene associations, and the external links to LIDA.
        Every parsed element is offered to ``process_xml_table`` once per
        table, each with its own row handler.

        :param limit: passed through to ``process_xml_table``
        :return: None

        """

        myfile = '/'.join((self.rawdir, self.files['data']['file']))

        # table name -> row handler, in the order they are tried
        table_handlers = (
            ('Article_Breed', self._process_article_breed_row),
            ('Article_Phene', self._process_article_phene_row),
            ('Breed_Phene', self._process_breed_phene_row),
            ('Lida_Links', self._process_lida_links_row),
            ('Phene_Gene', self._process_phene_gene_row),
            ('Group_MPO', self._process_group_mpo_row),
        )

        # the context manager closes the gzip handle even if parsing raises
        with gzip.open(myfile, 'rb') as f:
            filereader = io.TextIOWrapper(f, newline="")

            filereader.readline()  # remove the xml declaration line

            for _event, elem in ET.iterparse(filereader):
                for table_name, handler in table_handlers:
                    self.process_xml_table(elem, table_name, handler, limit)

        return

    # ############ INDIVIDUAL TABLE-LEVEL PROCESSING FUNCTIONS ################

    def _process_species_table_row(self, row):
        """
        Add one species as a class and remember a label for it.

        Row columns:
        gb_species_id, sci_name, com_name, added_by, date_modified

        The common name, when there is one, is added as a synonym and kept
        in ``label_hash``; otherwise the scientific name is kept there.
        In test mode, species outside the test taxa are ignored.

        :param row: mapping of column name to value for one table row
        :return: None
        """
        species_key = row['gb_species_id']
        taxon_curie = 'NCBITaxon:' + str(species_key)
        scientific = row['sci_name']
        common = row['com_name']

        if self.testMode:
            if int(species_key) not in self.test_ids['taxon']:
                return

        self.gu.addClassToGraph(self.g, taxon_curie, scientific)

        # prefer the common name as the label used for lookup later
        if common == '':
            lookup_label = scientific
        else:
            self.gu.addSynonym(self.g, taxon_curie, common)
            lookup_label = common
        self.label_hash[taxon_curie] = lookup_label

    def _process_breed_row(self, row):
        """
        Add one breed as an individual typed with its species' taxon.

        The breed key is appended to the 'breed' test ids, its generated
        id is stored in ``id_hash`` and its label in ``label_hash``; the
        label gets the species label appended in parentheses when one is
        known.  In test mode only breeds of the test taxa are kept.

        :param row: mapping of column name to value for one table row
        :return: None
        """
        # in test mode, keep all breeds of our test species
        if self.testMode:
            if int(row['gb_species_id']) not in self.test_ids['taxon']:
                return

        breed_key = row['breed_id']

        # save the breed keys in the test_ids for later processing
        self.test_ids['breed'].append(int(breed_key))

        breed_curie = self.make_breed_id(breed_key)
        self.id_hash['breed'][breed_key] = breed_curie

        taxon_curie = 'NCBITaxon:' + str(row['gb_species_id'])
        label = row['breed_name']
        species_name = self.label_hash.get(taxon_curie)
        if species_name is not None:
            label = label + ' (' + species_name + ')'

        self.gu.addIndividualToGraph(
            self.g, breed_curie, label, taxon_curie)
        self.label_hash[breed_curie] = label

    def _process_phene_row(self, row):

        phenotype_id = None
        sp_phene_label = row['phene_name']
        if sp_phene_label == '':
            sp_phene_label = None
        if 'omia_id' not in row:
            logger.info("omia_id not present for %s", row['phene_id'])
            omia_id = self._make_internal_id('phene', phenotype_id)
        else:
            omia_id = 'OMIA:'+str(row['omia_id'])

        if self.testMode and not\
                (int(row['gb_species_id']) in self.test_ids['taxon'] and
                 omia_id in self.test_ids['disease']):
            return
        # add to internal hash store for later lookup
        self.id_hash['phene'][row['phene_id']] = omia_id

        descr = row['summary']
        if descr == '':
            descr = None

        # omia label
        omia_label = self.label_hash.get(omia_id)

        # add the species-specific subclass (TODO please review this choice)
        gb_species_id = row['gb_species_id']

        if gb_species_id != '':
            sp_phene_id = '-'.join((omia_id, gb_species_id))
        else:
            logger.error(
                "No species supplied in species-specific phene table for %s",
                omia_id)
            return

        species_id = 'NCBITaxon:'+str(gb_species_id)
        # use this instead
        species_label = self.label_hash.get('NCBITaxon:'+gb_species_id)
        if sp_phene_label is None and \
                omia_label is not None and species_label is not None:
            sp_phene_label = ' '.join((omia_label, 'in', species_label))
        self.gu.addClassToGraph(
            self.g, sp_phene_id, sp_phene_label, omia_id, descr)
        # add to internal hash store for later lookup
        self.id_hash['phene'][row['phene_id']] = sp_phene_id
        self.label_hash[sp_phene_id] = sp_phene_label
        # add each of the following descriptions,
        # if they are populated, with a tag at the end.
        for item in [
                'clin_feat', 'history', 'pathology', 'mol_gen', 'control']:
            if row[item] is not None and row[item] != '':
                self.gu.addDescription(
                    self.g, sp_phene_id, row[item] + ' ['+item+']')
        # if row['symbol'] is not None:  # species-specific
        # CHECK ME - sometimes spaces or gene labels
        #     gu.addSynonym(g, sp_phene, row['symbol'])

        self.gu.addOWLPropertyClassRestriction(
            self.g, sp_phene_id, self.gu.object_properties['in_taxon'],
            species_id)

        # add inheritance as an association
        inheritance_id = self._map_inheritance_term_id(row['inherit'])
        if inheritance_id is not None:
            assoc = DispositionAssoc(self.name, sp_phene_id, inheritance_id)
            assoc.add_association_to_graph(self.g)

        if row['characterised'] == 'Yes':
            self.stored_omia_mol_gen[omia_id] = {
                'mol_gen': row['mol_gen'],
                'map_info': row['map_info'],
                'species': row['gb_species_id']}

        return

    def write_molgen_report(self):
        """
        Write the stored molecular-genetics notes to a tab-delimited file.

        The file ``omia_molgen_report.txt`` is created in ``self.outdir``
        with a header row followed by one row per entry of
        ``self.stored_omia_mol_gen``.

        :return: None
        """
        import csv
        logger.info("Writing G2P report for OMIA")
        report_path = '/'.join((self.outdir, 'omia_molgen_report.txt'))
        header = ['omia_id', 'molecular_description', 'mapping_info', 'species']

        with open(report_path, 'w', newline='\n') as report:
            tsv = csv.writer(report, delimiter='\t')
            tsv.writerow(header)
            for omia_id, details in self.stored_omia_mol_gen.items():
                tsv.writerow((
                    str(omia_id),
                    details['mol_gen'],
                    details['map_info'],
                    details['species']))

        logger.info(
            "Wrote %d potential G2P descriptions for curation to %s",
            len(self.stored_omia_mol_gen), report_path)

        return

    def _process_article_row(self, row):

        # don't bother in test mode
        if self.testMode:
            return

        iarticle_id = self._make_internal_id('article', row['article_id'])
        self.id_hash['article'][row['article_id']] = iarticle_id
        rtype = None
        if row['journal'] != '':
            rtype = Reference.ref_types['journal_article']
        r = Reference(iarticle_id, rtype)

        if row['title'] is not None:
            r.setTitle(row['title'].strip())
        if row['year'] is not None:
            r.setYear(row['year'])
        r.addRefToGraph(self.g)

        if row['pubmed_id'] is not None:
            pmid = 'PMID:'+str(row['pubmed_id'])
            self.id_hash['article'][row['article_id']] = pmid
            self.gu.addSameIndividual(self.g, iarticle_id, pmid)
            self.gu.addComment(self.g, pmid, iarticle_id)

        return

    def _process_omia_group_row(self, row):
        omia_id = 'OMIA:'+row['omia_id']

        if self.testMode and omia_id not in self.test_ids['disease']:
            return

        group_name = row['group_name']
        group_summary = row['group_summary']

        disease_id = None
        group_category = row.get('group_category')
        disease_id = \
            self.map_omia_group_category_to_ontology_id(group_category)
        if disease_id is not None:
            self.gu.addClassToGraph(self.g, disease_id, None)
            if disease_id == 'MP:0008762':  # embryonic lethal
                # add this as a phenotype association
                # add embryonic onset
                assoc = D2PAssoc(self.name, omia_id, disease_id)
                assoc.add_association_to_graph(self.g)
                disease_id = None
        else:
            logger.info(
                "No disease superclass defined for %s:  %s",
                omia_id, group_name)
            # default to general disease  FIXME this may not be desired
            disease_id = 'DOID:4'

        if group_summary == '':
            group_summary = None
        if group_name == '':
            group_name = None

        self.gu.addClassToGraph(
            self.g, omia_id, group_name, disease_id, group_summary)

        self.label_hash[omia_id] = group_name

        return

    def _process_gene_row(self, row):
        if self.testMode and row['gene_id'] not in self.test_ids['gene']:
            return
        gene_id = 'NCBIGene:'+str(row['gene_id'])
        self.id_hash['gene'][row['gene_id']] = gene_id
        gene_label = row['symbol']
        self.label_hash[gene_id] = gene_label
        tax_id = 'NCBITaxon:'+str(row['gb_species_id'])
        gene_type_id = NCBIGene.map_type_of_gene(row['gene_type'])
        self.gu.addClassToGraph(self.g, gene_id, gene_label, gene_type_id)
        self.geno.addTaxon(tax_id, gene_id)

        return

    def _process_article_breed_row(self, row):
        # article_id, breed_id, added_by
        # don't bother putting these into the test... too many!

        # and int(row['breed_id']) not in self.test_ids['breed']:
        if self.testMode:
            return

        article_id = self.id_hash['article'].get(row['article_id'])
        breed_id = self.id_hash['breed'].get(row['breed_id'])

        # there's some missing data (article=6038).  in that case skip
        if article_id is not None:
            self.gu.addTriple(
                self.g, article_id, self.gu.object_properties['is_about'],
                breed_id)
        else:
            logger.warning("Missing article key %s", str(row['article_id']))

        return

    def _process_article_phene_row(self, row):
        """
        Linking articles to species-specific phenes.

        :param row:
        :return:
        """
        # article_id, phene_id, added_by
        # look up the article in the hashmap
        phenotype_id = self.id_hash['phene'].get(row['phene_id'])
        article_id = self.id_hash['article'].get(row['article_id'])

        omia_id = self._get_omia_id_from_phene_id(phenotype_id)
        if self.testMode and omia_id not in self.test_ids['disease'] \
                or phenotype_id is None or article_id is None:
            return

        # make a triple, where the article is about the phenotype
        self.gu.addTriple(
            self.g, article_id,
            self.gu.object_properties['is_about'], phenotype_id)

        return

    def _process_breed_phene_row(self, row):
        # Linking disorders/characteristic to breeds
        # breed_id, phene_id, added_by
        breed_id = self.id_hash['breed'].get(row['breed_id'])
        phene_id = self.id_hash['phene'].get(row['phene_id'])

        # get the omia id
        omia_id = self._get_omia_id_from_phene_id(phene_id)

        if (self.testMode and not (
                omia_id in self.test_ids['disease'] and
                int(row['breed_id']) in self.test_ids['breed']) or
                breed_id is None or phene_id is None):
            return

        # FIXME we want a different relationship here
        assoc = G2PAssoc(
            self.name, breed_id, phene_id,
            self.gu.object_properties['has_phenotype'])
        assoc.add_association_to_graph(self.g)

        # add that the breed is a model of the human disease
        # use the omia-omim mappings for this
        # we assume that we have already scrubbed out the genes
        # from the omim list, so we can make the model associations here

        omim_ids = self.omia_omim_map.get(omia_id)
        eco_id = "ECO:0000214"   # biological aspect of descendant evidence
        if omim_ids is not None and len(omim_ids) > 0:
            if len(omim_ids) > 1:
                logger.info(
                    "There's 1:many omia:omim mapping: %s, %s",
                    omia_id, str(omim_ids))
            for i in omim_ids:
                assoc = G2PAssoc(
                    self.name, breed_id, i,
                    self.gu.object_properties['model_of'])
                assoc.add_evidence(eco_id)
                assoc.add_association_to_graph(self.g)
                aid = assoc.get_association_id()

                breed_label = self.label_hash.get(breed_id)
                if breed_label is None:
                    breed_label = "this breed"

                m = re.search(r'\((.*)\)', breed_label)
                if m:
                    sp_label = m.group(1)
                else:
                    sp_label = ''

                phene_label = self.label_hash.get(phene_id)
                if phene_label is None:
                    phene_label = "phenotype"
                elif phene_label.endswith(sp_label):
                    # some of the labels we made already include the species;
                    # remove it to make a cleaner desc
                    phene_label = re.sub(r' in '+sp_label, '', phene_label)
                desc = ' '.join(
                    ("High incidence of", phene_label, "in", breed_label,
                     "suggests it to be a model of disease", i + "."))
                self.gu.addDescription(self.g, aid, desc)
        return

    def _process_lida_links_row(self, row):
        # lidaurl, omia_id, added_by
        omia_id = 'OMIA:'+row['omia_id']
        lidaurl = row['lidaurl']

        if self.testMode and omia_id not in self.test_ids['disease']:
            return

        self.gu.addXref(self.g, omia_id, lidaurl, True)

        return

    def _process_phene_gene_row(self, row):

        gene_id = self.id_hash['gene'].get(row['gene_id'])
        phene_id = self.id_hash['phene'].get(row['phene_id'])

        omia_id = self._get_omia_id_from_phene_id(phene_id)

        if self.testMode and not (
                omia_id in self.test_ids['disease'] and
                row['gene_id'] in self.test_ids['gene']) or\
                gene_id is None or phene_id is None:
            return

        # occasionally some phenes are missing!  (ex: 406)
        if phene_id is None:
            logger.warning("Phene id %s is missing", str(row['phene_id']))
            return

        gene_label = self.label_hash[gene_id]
        # some variant of gene_id has phenotype d
        vl = '_'+re.sub(r'NCBIGene:', '', str(gene_id)) + 'VL'
        if self.nobnodes:
            vl = ':'+vl
        self.geno.addAllele(vl, 'some variant of ' + gene_label)
        self.geno.addAlleleOfGene(vl, gene_id)
        assoc = G2PAssoc(self.name, vl, phene_id)
        assoc.add_association_to_graph(self.g)

        # add the gene id to the set of annotated genes
        # for later lookup by orthology
        self.annotated_genes.add(gene_id)

        return

    def _process_omia_omim_map(self, row):
        """
        Links OMIA groups to OMIM equivalents.
        :param row:
        :return:
        """
        # omia_id, omim_id, added_by

        omia_id = 'OMIA:'+row['omia_id']
        omim_id = 'OMIM:'+row['omim_id']

        # also store this for use when we say that a given animal is
        # a model of a disease
        if omia_id not in self.omia_omim_map:
            self.omia_omim_map[omia_id] = set()
        self.omia_omim_map[omia_id].add(omim_id)

        if self.testMode and omia_id not in self.test_ids['disease']:
            return

        self.gu.addXref(self.g, omia_id, omim_id)

        return

    def map_omia_group_category_to_ontology_id(self, category_num):
        """
        Using the category number in the OMIA_groups table,
        map them to a disease id.
        This may be superseded by other MONDO methods.

        Platelet disorders will be more specific once
        https://github.com/obophenotype/human-disease-ontology/issues/46
        is fulfilled.

        A missing, empty or non-numeric category is treated as unknown
        instead of raising.

        :param category_num: category number, as an int or as a string
            holding one; may be None
        :return: the mapped id, or None when the category is unknown
            or deliberately unmapped

        """

        category_map = {
            1: 'DOID:0014667',      # Inborn error of metabolism
            2: 'MESH:D004392',      # Dwarfism
            3: 'DOID:1682',         # congenital heart disease
            4: 'DOID:74',           # blood system disease
            5: 'DOID:3211',         # lysosomal storage disease
            6: 'DOID:16',           # integumentary system disease
            # --> retinal degeneration ==> OMIA:000830
            7: 'DOID:8466',         # progressive retinal atrophy
            8: 'DOID:0050572',      # Cone–rod dystrophy
            9: 'MESH:C536122',      # stationary night blindness
            10: 'Orphanet:98553',   # developmental retinal disorder
            11: 'DOID:5679',        # retinal disorder
            12: 'Orphanet:90771',   # Disorder of Sex Development
            #  - what to do about this one?
            13: 'MP:0008762',       # embryonic lethal
            # - not sure what to do with this
            14: None,               # blood group
            # FIXME make me more specific
            15: 'DOID:2218',        # intrinsic platelet disorder
            # FIXME make me more specific
            16: 'DOID:2218',        # extrinsic platelet disorder
            17: None  # transgenic ???
        }

        # None is not a key of the map, so anything that cannot be
        # read as an integer falls through to the "unknown" branch
        try:
            category_key = int(category_num)
        except (TypeError, ValueError):
            category_key = None

        disease_id = None
        if category_key in category_map:
            disease_id = category_map[category_key]
            logger.info(
                "Found %s for category %s", str(disease_id), str(category_num))
        else:
            logger.info(
                "There's a group category I don't know anything about: %s",
                str(category_num))

        return disease_id

    def _process_group_mpo_row(self, row):
        """
        Associate an OMIA group with a Mammalian Phenotype term.

        The MP id is the MPO number zero-padded to seven digits.

        :param row: mapping record; reads the keys omia_id and MPO_no
        :return: None
        """
        group_curie = 'OMIA:' + row['omia_id']
        phenotype_curie = 'MP:' + str(int(row['MPO_no'])).zfill(7)

        D2PAssoc(
            self.name, group_curie,
            phenotype_curie).add_association_to_graph(self.g)

        return

    def clean_up_omim_genes(self):
        """
        Drop from ``omia_omim_map`` every OMIM id that is not kept by
        the phenotype-entry filter.

        All mapped OMIM ids are passed to ``OMIM.process_entries``
        together with ``filter_keep_phenotype_entry_ids``; ids absent
        from its result are removed from each OMIA group's set.

        :return: None
        """
        omim = OMIM()

        # gather every omim id mentioned in the map
        every_omim_id = set()
        for mapped in self.omia_omim_map.values():
            every_omim_id.update(mapped)

        phenotype_entries = omim.process_entries(
            list(every_omim_id), filter_keep_phenotype_entry_ids, None, None)
        total = len(every_omim_id)
        logger.info(
            "Filtered out %d/%d entries that are genes or features",
            total-len(phenotype_entries), total)

        # rebuild each group's set with only the phenotype ids,
        # counting what gets left out
        removed_count = 0
        for omia, mapped in self.omia_omim_map.items():
            removed_count += sum(
                1 for i in mapped if i not in phenotype_entries)
            self.omia_omim_map[omia] = {
                i for i in mapped if i in phenotype_entries}

        logger.info(
            "Removed %d omim ids from the omia-to-omim map", removed_count)

        return

    def _make_internal_id(self, prefix, key):

        iid = '_'+''.join(('omia', prefix, 'key', str(key)))
        if self.nobnodes:
            iid = ':'+iid

        return iid

    def make_breed_id(self, key):
        """
        Build the breed identifier for a breed key.

        :param key: breed key from the breed table
        :return: the key prefixed with 'OMIA-breed:'
        """
        return 'OMIA-breed:' + str(key)

    @staticmethod
    def _get_omia_id_from_phene_id(phene_id):
        omia_id = None
        if phene_id is not None:
            m = re.match(r'OMIA:\d+', str(phene_id))
            if m:
                omia_id = m.group(0)

        return omia_id

    @staticmethod
    def _map_inheritance_term_id(inheritance_symbol):

        inherit_map = {
            'A':  None,  # Autosomal
            'ACD': 'GENO:0000143',  # Autosomal co-dominant
            'ADV': None,  # autosomal dominant with variable expressivity
            'AID': 'GENO:0000259',  # autosomal incompletely dominant
            'ASD': 'GENO:0000145',  # autosomal semi-dominant
            # autosomal recessive, semi-lethal
            # using generic autosomal recessive
            'ASL': 'GENO:0000150',
            'D': 'GENO:0000147',  # autosomal dominant
            'M': None,  # multifactorial
            'MAT': None,  # Maternal
            # probably autosomal recessive
            # using generic autosomal recessive
            'PR':  'GENO:0000150',
            'R': 'GENO:0000150',  # Autosomal Recessive
            # Recessive Embryonic Lethal
            # using plain recessive
            'REL': 'GENO:0000148',
            # Autosomal Recessive Lethal
            # using plain autosomal recessive
            'RL': 'GENO:0000150',
            'S': 'GENO:0000146',  # Sex-linked   <--using allosomal dominant
            'SLi': None,  # Sex-limited
            'UD': 'GENO:0000144',  # Dominant
            'X': None,  # x-linked    # HP:0001417 ?
            # X-linked Dominant     <-- temp using allosomal dominant  FIXME
            'XLD': 'GENO:0000146',
            # X-linked Recessive    <-- temp using allosomal recessive  FIXME
            'XLR': 'GENO:0000149',
            'Y': None,  # Y-linked
            'Z': None,  # Z-linked
            # Z-linked recessive    <-- temp using allosomal recessive  FIXME
            'ZR': 'GENO:0000149',
            '999': None,  # Z-linked incompletely dominant
        }

        inheritance_id = inherit_map.get(inheritance_symbol)
        if inheritance_id is None and inheritance_symbol is not None:
            logger.warning(
                "No inheritance id is mapped for %s", inheritance_symbol)

        return inheritance_id

    def getTestSuite(self):
        """
        Load the unit tests of ``OMIATestCase`` into a test suite.

        :return: the suite built by the unittest loader
        """
        import unittest
        from tests.test_omia import OMIATestCase

        return unittest.TestLoader().loadTestsFromTestCase(OMIATestCase)
Пример #37
0
    def process_gaf(self, gaffile, limit, id_map=None, eco_map=None):
        """
        Read a gzipped, tab-delimited gene association file and add
        gene-to-GO-term associations to the graph.

        For each row that is kept, this adds the gene class (with name
        and synonyms), its taxon, and an association from gene to GO
        term carrying evidence and references.  Rows with evidence code
        'IMP' and a non-empty with/from column also yield
        genotype-to-phenotype associations, whose phenotype id is the
        GO id with 'PHENOTYPE' appended.

        Rows are skipped when: they start with '!', the taxon column is
        empty, the qualifier contains 'NOT', a UniProtKB accession is
        not in id_map, or (in test mode) the gene is outside the test
        ids.  A row with the wrong number of columns ends the process.

        :param gaffile: path of the gzip-compressed file to read
        :param limit: when not None and not in test mode, reading stops
            once the reader's line number exceeds this value
        :param id_map: optional dict from UniProtKB accession to a
            'prefix:local' gene curie; without it every UniProtKB row
            is skipped
        :param eco_map: dict from evidence code symbol to evidence id
        :return: None
        """

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph

        model = Model(graph)
        geno = Genotype(graph)
        LOG.info("Processing Gene Associations from %s", gaffile)
        # counters for UniProtKB rows with / without an entry in id_map
        uniprot_hit = 0
        uniprot_miss = 0
        # list of column names; positions are looked up with col.index()
        col = self.gaf_columns

        with gzip.open(gaffile, 'rb') as csvfile:
            reader = csv.reader(
                io.TextIOWrapper(csvfile, newline=""), delimiter='\t', quotechar='\"')
            for row in reader:
                # comments start with exclamation
                # NOTE(review): row[0][0] raises IndexError on a blank
                # line or an empty first field - confirm input never
                # contains either
                if row[0][0] == '!':
                    continue

                if len(row) != len(col):
                    LOG.error(
                        "Wrong number of columns %i, expected ... got:\n\t%s",
                        len(col), row)
                    # NOTE(review): this terminates the interpreter
                    # rather than skipping the row - confirm intended
                    exit(1)

                dbase = row[col.index('DB')].strip()
                gene_num = row[col.index('DB_Object_ID')].strip()
                gene_symbol = row[col.index('DB_Object_Symbol')].strip()
                qualifier = row[col.index('Qualifier')]
                go_id = row[col.index('GO_ID')].strip()
                ref = row[col.index('DB:Reference')].strip()
                eco_symbol = row[col.index('Evidence Code')].strip()
                with_or_from = row[col.index('With (or) From')]
                aspect = row[col.index('Aspect')].strip()
                gene_name = row[col.index('DB_Object_Name')]
                gene_synonym = row[col.index('DB_Object_Synonym')]
                # object_type = row[col.index('DB_Object_Type')].strip()
                taxon = row[col.index('Taxon and Interacting taxon')].strip()
                # date = row[col.index('Date')].strip()
                # assigned_by = row[col.index('Assigned_By')].strip()
                # annotation_extension = row[col.index('Annotation_Extension')]
                # gene_product_form_id = row[col.index('Gene_Product_Form_ID')]

                # test for required fields
                # NOTE(review): row[:10] is a list, and '' never equals
                # a list, so only row[12] is effectively tested here;
                # empty values in the first ten columns pass - confirm
                # which columns are meant to be required
                if '' in [row[:10], row[12]]:
                    LOG.error(
                        "Missing required part of annotation on row %i:\n%s",
                        reader.line_num, str(row[:-4]))
                    continue

                # (Don't) deal with qualifier NOT, contributes_to, colocalizes_with
                if re.search(r'NOT', qualifier):
                    continue

                # translate the source database name when a local
                # translation exists
                if dbase in self.localtt:
                    dbase = self.localtt[dbase]
                uniprotid = None
                gene_id = None
                if dbase == 'UniProtKB':
                    if id_map is not None and gene_num in id_map:
                        # swap the protein accession for the mapped gene
                        # and keep the original for the description
                        gene_id = id_map[gene_num]
                        uniprotid = ':'.join((dbase, gene_num))
                        (dbase, gene_num) = gene_id.split(':')
                        uniprot_hit += 1
                    else:
                        # LOG.warning(
                        #   "UniProt id %s is without a 1:1 mapping to entrez/ensembl",
                        #    gene_num)
                        uniprot_miss += 1
                        continue
                else:
                    gene_num = gene_num.split(':')[-1]  # last
                    gene_id = ':'.join((dbase, gene_num))

                # NOTE(review): as written, every 'NCBIGene:' id passes
                # this test-mode filter regardless of test_ids - confirm
                if self.test_mode and gene_id[:9] != 'NCBIGene:' and\
                        gene_num not in self.test_ids:
                    continue

                model.addClassToGraph(gene_id, gene_symbol)
                if gene_name != '':
                    model.addDescription(gene_id, gene_name)
                if gene_synonym != '':
                    for syn in re.split(r'\|', gene_synonym):
                        syn = syn.strip()
                        if syn[:10] == 'UniProtKB:':
                            # a protein accession among the synonyms is
                            # linked as a gene product instead
                            model.addTriple(
                                gene_id, self.globaltt['has gene product'], syn)
                        elif re.fullmatch(graph.curie_regexp, syn) is not None:
                            # looks like a curie; warn, but still add it
                            # as a literal synonym
                            LOG.warning(
                                'possible curie "%s" as a literal synomym for %s',
                                syn, gene_id)
                            model.addSynonym(gene_id, syn)
                        else:
                            model.addSynonym(gene_id, syn)

                # the taxon column may hold several pipe-separated taxa
                for txid in taxon.split('|'):
                    tax_curie = re.sub(r'taxon:', 'NCBITaxon:', txid)
                    geno.addTaxon(tax_curie, gene_id)

                assoc = Assoc(graph, self.name)
                assoc.set_subject(gene_id)
                assoc.set_object(go_id)

                # NOTE(review): with the default eco_map=None the
                # subscript raises TypeError, which is not caught here
                try:
                    eco_id = eco_map[eco_symbol]
                    assoc.add_evidence(eco_id)
                except KeyError:
                    LOG.error("Evidence code (%s) not mapped", eco_symbol)

                refs = re.split(r'\|', ref)
                # note: the loop variable rebinds `ref`
                for ref in refs:
                    ref = ref.strip()
                    if ref != '':
                        prefix = ref.split(':')[0]  # sidestep 'MGI:MGI:'
                        if prefix in self.localtt:
                            prefix = self.localtt[prefix]
                        ref = ':'.join((prefix, ref.split(':')[-1]))
                        refg = Reference(graph, ref)
                        if prefix == 'PMID':
                            ref_type = self.globaltt['journal article']
                            refg.setType(ref_type)
                        refg.addRefToGraph()
                        assoc.add_source(ref)

                # TODO add the source of the annotations from assigned by?

                rel = self.resolve(aspect, mandatory=False)
                # when resolve() hands the aspect back unchanged, no
                # predicate was found for it
                # NOTE(review): in the 'contributes_to' branch the
                # relationship is set but add_association_to_graph() is
                # not called - confirm whether that is intended
                if rel is not None and aspect == rel:
                    if aspect == 'F' and re.search(r'contributes_to', qualifier):
                        assoc.set_relationship(self.globaltt['contributes to'])
                    else:
                        LOG.error(
                            "Aspect: %s with qualifier: %s  is not recognized",
                            aspect, qualifier)
                elif rel is not None:
                    assoc.set_relationship(rel)
                    assoc.add_association_to_graph()
                else:
                    LOG.warning("No predicate for association \n%s\n", str(assoc))

                # NOTE(review): the description is set after the
                # association may already have been added to the graph
                # above - confirm it still reaches the graph
                if uniprotid is not None:
                    assoc.set_description('Mapped from ' + uniprotid)
                # object_type should be one of:
                # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
                # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
                # If the precise product type is unknown,
                # gene_product should be used
                ########################################################################

                # Derive G2P Associations from IMP annotations
                # in version 2.1 Pipe will indicate 'OR'
                # and Comma will indicate 'AND'.
                # in version 2.0, multiple values are separated by pipes
                # where the pipe has been used to mean 'AND'
                if eco_symbol == 'IMP' and with_or_from != '':
                    withitems = with_or_from.split('|')
                    phenotypeid = go_id + 'PHENOTYPE'
                    # create phenotype associations
                    for itm in withitems:
                        # NOTE(review): the message logs uniprotid,
                        # which is None for non-UniProtKB rows
                        if itm == '' or re.match(
                                r'(UniProtKB|WBPhenotype|InterPro|HGNC)', itm):
                            LOG.warning(
                                "Skipping  %s from or with %s", uniprotid, itm)
                            continue
                        itm = re.sub(r'MGI\:MGI\:', 'MGI:', itm)
                        itm = re.sub(r'WB:', 'WormBase:', itm)

                        # for worms and fish, they might give a RNAi or MORPH
                        # in these cases make a reagent-targeted gene
                        if re.search('MRPHLNO|CRISPR|TALEN', itm):
                            targeted_gene_id = self.zfin.make_targeted_gene_id(
                                gene_id, itm)
                            geno.addReagentTargetedGene(itm, gene_id, targeted_gene_id)
                            # TODO PYLINT why is this needed?
                            # Redefinition of assoc type from
                            # dipper.models.assoc.Association.Assoc to
                            # dipper.models.assoc.G2PAssoc.G2PAssoc
                            assoc = G2PAssoc(
                                graph, self.name, targeted_gene_id, phenotypeid)
                        elif re.search(r'WBRNAi', itm):
                            targeted_gene_id = self.wbase.make_reagent_targeted_gene_id(
                                gene_id, itm)
                            geno.addReagentTargetedGene(itm, gene_id, targeted_gene_id)
                            assoc = G2PAssoc(
                                graph, self.name, targeted_gene_id, phenotypeid)
                        else:
                            assoc = G2PAssoc(graph, self.name, itm, phenotypeid)
                        # attach the same references as for the GO
                        # association; the evidence is added once per
                        # non-empty reference
                        for ref in refs:
                            ref = ref.strip()
                            if ref != '':
                                prefix = ref.split(':')[0]
                                if prefix in self.localtt:
                                    prefix = self.localtt[prefix]
                                ref = ':'.join((prefix, ref.split(':')[-1]))
                                assoc.add_source(ref)
                                # experimental phenotypic evidence
                                assoc.add_evidence(
                                    self.globaltt['experimental phenotypic evidence'])
                        assoc.add_association_to_graph()
                        # TODO should the G2PAssoc be the evidence for the GO assoc?

                # the limit is compared with the reader's line number,
                # which also counts comment and skipped rows
                if not self.test_mode and limit is not None and \
                        reader.line_num > limit:
                    break
            # report the share of UniProtKB rows that id_map resolved
            uniprot_tot = (uniprot_hit + uniprot_miss)
            uniprot_per = 0.0
            if uniprot_tot != 0:
                uniprot_per = 100.0 * uniprot_hit / uniprot_tot
            LOG.info(
                "Uniprot: %.2f%% of %i benefited from the 1/4 day id mapping download",
                uniprot_per, uniprot_tot)
Пример #38
0
    def _process_data(self, raw, limit=None):
        logger.info("Processing Data from %s", raw)
        gu = GraphUtils(curie_map.get())

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        geno = Genotype(g)
        line_counter = 0
        gu.loadAllProperties(g)
        gu.loadObjectProperties(g, geno.object_properties)

        # Add the taxon as a class
        taxon_id = 'NCBITaxon:10090'  # map to Mus musculus
        gu.addClassToGraph(g, taxon_id, None)

        # with open(raw, 'r', encoding="utf8") as csvfile:
        with gzip.open(raw, 'rt') as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            next(filereader, None)  # skip the header row
            for row in filereader:
                line_counter += 1

                (marker_accession_id, marker_symbol, phenotyping_center,
                 colony, sex, zygosity, allele_accession_id, allele_symbol,
                 allele_name, strain_accession_id, strain_name, project_name,
                 project_fullname, pipeline_name, pipeline_stable_id,
                 procedure_stable_id, procedure_name, parameter_stable_id,
                 parameter_name, top_level_mp_term_id, top_level_mp_term_name,
                 mp_term_id, mp_term_name, p_value, percentage_change,
                 effect_size, statistical_method, resource_name) = row

                if self.testMode and marker_accession_id not in self.test_ids:
                    continue

                # ##### cleanup some of the identifiers ######
                zygosity_id = self._map_zygosity(zygosity)

                # colony ids sometimes have <> in them, spaces,
                # or other non-alphanumerics and break our system;
                # replace these with underscores
                colony_id = '_'+re.sub(r'\W+', '_', colony)
                if self.nobnodes:
                    colony_id = ':'+colony_id

                if not re.match(r'MGI', allele_accession_id):
                    allele_accession_id = \
                        '_IMPC-'+re.sub(r':', '', allele_accession_id)
                    if self.nobnodes:
                        allele_accession_id = ':'+allele_accession_id
                if re.search(r'EUROCURATE', strain_accession_id):
                    # the eurocurate links don't resolve at IMPC
                    strain_accession_id = '_'+strain_accession_id
                    if self.nobnodes:
                        strain_accession_id = ':'+strain_accession_id
                elif not re.match(r'MGI', strain_accession_id):
                    logger.info(
                        "Found a strange strain accession...%s",
                        strain_accession_id)
                    strain_accession_id = 'IMPC:'+strain_accession_id

                ######################
                # first, add the marker and variant to the graph as with MGI,
                # the allele is the variant locus.  IF the marker is not known,
                # we will call it a sequence alteration.  otherwise,
                # we will create a BNode for the sequence alteration.
                sequence_alteration_id = variant_locus_id = None
                variant_locus_name = sequence_alteration_name = None

                # extract out what's within the <> to get the symbol
                if re.match(r'.*<.*>', allele_symbol):
                    sequence_alteration_name = \
                        re.match(r'.*<(.*)>', allele_symbol).group(1)
                else:
                    sequence_alteration_name = allele_symbol

                if marker_accession_id is not None and \
                        marker_accession_id == '':
                    logger.warning(
                        "Marker unspecified on row %d", line_counter)
                    marker_accession_id = None

                if marker_accession_id is not None:
                    variant_locus_id = allele_accession_id
                    variant_locus_name = allele_symbol
                    variant_locus_type = geno.genoparts['variant_locus']
                    geno.addGene(marker_accession_id, marker_symbol,
                                 geno.genoparts['gene'])
                    geno.addAllele(variant_locus_id, variant_locus_name,
                                   variant_locus_type, None)
                    geno.addAlleleOfGene(variant_locus_id, marker_accession_id)

                    sequence_alteration_id = \
                        '_seqalt'+re.sub(r':', '', allele_accession_id)
                    if self.nobnodes:
                        sequence_alteration_id = ':'+sequence_alteration_id
                    geno.addSequenceAlterationToVariantLocus(
                        sequence_alteration_id, variant_locus_id)

                else:
                    sequence_alteration_id = allele_accession_id

                # IMPC contains targeted mutations with either gene traps,
                # knockouts, insertion/intragenic deletions.
                # but I don't really know what the SeqAlt is here,
                # so I don't add it.
                geno.addSequenceAlteration(sequence_alteration_id,
                                           sequence_alteration_name)

                # #############    BUILD THE COLONY    #############
                # First, let's describe the colony that the animals come from
                # The Colony ID refers to the ES cell clone
                #   used to generate a mouse strain.
                # Terry sez: we use this clone ID to track
                #   ES cell -> mouse strain -> mouse phenotyping.
                # The same ES clone maybe used at multiple centers,
                # so we have to concatenate the two to have a unique ID.
                # some useful reading about generating mice from ES cells:
                # http://ki.mit.edu/sbc/escell/services/details

                # here, we'll make a genotype
                # that derives from an ES cell with a given allele.
                # the strain is not really attached to the colony.

                # the colony/clone is reflective of the allele,
                # with unknown zygosity
                stem_cell_class = 'ERO:0002002'
                gu.addIndividualToGraph(g, colony_id, colony, stem_cell_class)

                # vslc of the colony has unknown zygosity
                # note that we will define the allele
                # (and it's relationship to the marker, etc.) later
                # FIXME is it really necessary to create this vslc
                # when we always know it's unknown zygosity?
                vslc_colony = \
                    '_'+allele_accession_id+geno.zygosity['indeterminate']
                vslc_colony = re.sub(r':', '', vslc_colony)
                if self.nobnodes:
                    vslc_colony = ':'+vslc_colony
                vslc_colony_label = allele_symbol+'/<?>'
                # for ease of reading, we make the colony genotype variables.
                # in the future, it might be desired to keep the vslcs
                colony_genotype_id = vslc_colony
                colony_genotype_label = vslc_colony_label
                geno.addGenotype(colony_genotype_id, colony_genotype_label)
                geno.addParts(allele_accession_id, colony_genotype_id,
                              geno.object_properties['has_alternate_part'])
                geno.addPartsToVSLC(
                    vslc_colony, allele_accession_id, None,
                    geno.zygosity['indeterminate'],
                    geno.object_properties['has_alternate_part'])
                gu.addTriple(
                    g, colony_id,
                    geno.object_properties['has_genotype'],
                    colony_genotype_id)

                # ##########    BUILD THE ANNOTATED GENOTYPE    ##########
                # now, we'll build the genotype of the individual that derives
                # from the colony/clone genotype that is attached to
                # phenotype = colony_id + strain + zygosity + sex
                # (and is derived from a colony)

                # this is a sex-agnostic genotype
                genotype_id = \
                    self.make_id(
                        (colony_id + phenotyping_center + zygosity +
                         strain_accession_id))
                geno.addSequenceDerivesFrom(genotype_id, colony_id)

                # build the VSLC of the sex-agnostic genotype
                # based on the zygosity
                allele1_id = allele_accession_id
                allele2_id = allele2_rel = None
                allele1_label = allele_symbol
                allele2_label = '<?>'
                # Making VSLC labels from the various parts,
                # can change later if desired.
                if zygosity == 'heterozygote':
                    allele2_label = re.sub(r'<.*', '<+>', allele1_label)
                    allele2_id = None
                elif zygosity == 'homozygote':
                    allele2_label = allele1_label
                    allele2_id = allele1_id
                    allele2_rel = geno.object_properties['has_alternate_part']
                elif zygosity == 'hemizygote':
                    allele2_label = re.sub(r'<.*', '<0>', allele1_label)
                    allele2_id = None
                elif zygosity == 'not_applicable':
                    allele2_label = re.sub(r'<.*', '<?>', allele1_label)
                    allele2_id = None
                else:
                    logger.warning("found unknown zygosity %s", zygosity)
                    break
                vslc_name = '/'.join((allele1_label, allele2_label))

                # Add the VSLC
                vslc_id = '_' + '-'.join((marker_accession_id,
                                          allele_accession_id, zygosity))
                vslc_id = re.sub(r':', '', vslc_id)
                if self.nobnodes:
                    vslc_id = ':'+vslc_id
                gu.addIndividualToGraph(
                    g, vslc_id, vslc_name,
                    geno.genoparts['variant_single_locus_complement'])
                geno.addPartsToVSLC(
                    vslc_id, allele1_id, allele2_id, zygosity_id,
                    geno.object_properties['has_alternate_part'],
                    allele2_rel)

                # add vslc to genotype
                geno.addVSLCtoParent(vslc_id, genotype_id)

                # note that the vslc is also the gvc
                gu.addType(
                    g, vslc_id,
                    Genotype.genoparts['genomic_variation_complement'])

                # Add the genomic background
                # create the genomic background id and name
                if strain_accession_id != '':
                    genomic_background_id = strain_accession_id
                else:
                    genomic_background_id = None

                genotype_name = vslc_name
                if genomic_background_id is not None:
                    geno.addGenotype(
                        genomic_background_id, strain_name,
                        geno.genoparts['genomic_background'])

                    # make a phenotyping-center-specific strain
                    # to use as the background
                    pheno_center_strain_label = \
                        strain_name + '/' + phenotyping_center
                    pheno_center_strain_id = \
                        '-'.join((re.sub(r':', '', genomic_background_id),
                                  re.sub(r'\s', '_', phenotyping_center)))
                    if not re.match(r'^_', pheno_center_strain_id):
                        pheno_center_strain_id = '_'+pheno_center_strain_id
                    if self.nobnodes:
                        pheno_center_strain_id = ':'+pheno_center_strain_id
                    geno.addGenotype(pheno_center_strain_id,
                                     pheno_center_strain_label,
                                     geno.genoparts['genomic_background'])
                    geno.addSequenceDerivesFrom(pheno_center_strain_id,
                                                genomic_background_id)

                    # Making genotype labels from the various parts,
                    # can change later if desired.
                    # since the genotype is reflective of the place
                    # it got made, should put that in to disambiguate
                    genotype_name = \
                        genotype_name+' ['+pheno_center_strain_label+']'
                    geno.addGenomicBackgroundToGenotype(
                        pheno_center_strain_id, genotype_id)
                    geno.addTaxon(pheno_center_strain_id, taxon_id)
                # this is redundant, but i'll keep in in for now
                geno.addSequenceDerivesFrom(genotype_id, colony_id)
                genotype_name += '['+colony+']'
                geno.addGenotype(genotype_id, genotype_name)

                # Make the sex-qualified genotype,
                # which is what the phenotype is associated with
                sex_qualified_genotype_id = \
                    self.make_id(
                        (colony_id + phenotyping_center + zygosity +
                         strain_accession_id+sex))
                sex_qualified_genotype_label = genotype_name+' ('+sex+')'
                if sex == 'male':
                    sq_type_id = geno.genoparts['male_genotype']
                elif sex == 'female':
                    sq_type_id = geno.genoparts['female_genotype']
                else:
                    sq_type_id = geno.genoparts['sex_qualified_genotype']

                geno.addGenotype(
                    sex_qualified_genotype_id,
                    sex_qualified_genotype_label, sq_type_id)
                geno.addParts(
                    genotype_id, sex_qualified_genotype_id,
                    geno.object_properties['has_alternate_part'])

                if genomic_background_id is not None and \
                        genomic_background_id != '':
                    # Add the taxon to the genomic_background_id
                    geno.addTaxon(taxon_id, genomic_background_id)
                else:
                    # add it as the genomic background
                    geno.addTaxon(taxon_id, genotype_id)

                # #############    BUILD THE G2P ASSOC    #############
                # from an old email dated July 23 2014:
                # Phenotypes associations are made to
                # imits colony_id+center+zygosity+gender

                phenotype_id = mp_term_id

                # it seems that sometimes phenotype ids are missing.
                # indicate here
                if phenotype_id is None or phenotype_id == '':
                    logger.warning(
                        "No phenotype id specified for row %d: %s",
                        line_counter, str(row))
                    continue
                # experimental_phenotypic_evidence This was used in ZFIN
                eco_id = "ECO:0000059"

                # the association comes as a result of a g2p from
                # a procedure in a pipeline at a center and parameter tested

                assoc = G2PAssoc(self.name, sex_qualified_genotype_id,
                                 phenotype_id)
                assoc.add_evidence(eco_id)
                # assoc.set_score(float(p_value))

                # TODO add evidence instance using
                # pipeline_stable_id +
                # procedure_stable_id +
                # parameter_stable_id

                assoc.add_association_to_graph(g)
                assoc_id = assoc.get_association_id()

                # add a free-text description
                description = \
                    ' '.join((mp_term_name, 'phenotype determined by',
                              phenotyping_center, 'in an',
                              procedure_name, 'assay where',
                              parameter_name.strip(),
                              'was measured with an effect_size of',
                              str(round(float(effect_size), 5)),
                              '(p =', "{:.4e}".format(float(p_value)), ').'))

                gu.addDescription(g, assoc_id, description)

                # TODO add provenance information
                # resource_id = resource_name
                # assoc.addSource(g, assoc_id, resource_id)

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

        gu.loadProperties(g, G2PAssoc.object_properties, gu.OBJPROP)
        gu.loadProperties(g, G2PAssoc.annotation_properties, gu.ANNOTPROP)
        gu.loadProperties(g, G2PAssoc.datatype_properties, gu.DATAPROP)

        return
Пример #39
0
    def add_orthologs_by_gene_group(self, graph, gene_ids):
        """
        Add orthology associations for the supplied genes, based on the
        gene_group annotation pipeline from NCBI (orthologies between human
        and other vertebrate genomes).

        More information can be learned here:
        http://www.ncbi.nlm.nih.gov/news/03-13-2014-gene-provides-orthologs-regions/
        The method for associations is described in
        [PMCID:3882889](http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3882889/)
        == [PMID:24063302](http://www.ncbi.nlm.nih.gov/pubmed/24063302/).
        Because these are only between human and vertebrate genomes,
        very distant orthologies will certainly be missed,
        and the result should not be considered complete.

        This is not run within the NCBI parser itself;
        it is a convenience function for other parsers to call.

        :param graph: graph the gene classes, taxa and orthology
            associations are added to
        :param gene_ids: gene ids ('NCBIGene:'-prefixed curies) for which
            orthologs are fetched
        :return: None

        """

        logger.info("getting gene groups")
        gene_group_path = '/'.join(
            (self.rawdir, self.files['gene_group']['file']))

        # Many orthologous groups are keyed by their (human) lead gene,
        # so a two-way index is built:
        #   group lead id => set of member gene ids (lead included)
        #   member gene id => set of group lead ids
        # Fast to query, though not memory-efficient.
        geno = Genotype(graph)
        model = Model(graph)
        group_to_orthology = {}
        gene_to_group = {}
        gene_to_taxon = {}

        with gzip.open(gene_group_path, 'rb') as gzfile:
            reader = csv.reader(io.TextIOWrapper(gzfile, newline=""),
                                delimiter='\t',
                                quotechar='\"')

            for row in reader:
                # comment lines start with a hash
                if re.match(r'\#', ''.join(row)):
                    continue
                (tax_a, gene_a, rel, tax_b, gene_b) = row

                # only orthology relationships are of interest
                if rel != 'Ortholog':
                    continue

                members = group_to_orthology.setdefault(gene_a, set())
                members.add(gene_b)
                gene_to_group.setdefault(gene_b, set()).add(gene_a)

                gene_to_taxon[gene_a] = tax_a
                gene_to_taxon[gene_b] = tax_b

                # the group lead is itself a member of its group
                members.add(gene_a)

        logger.debug("Finished hashing gene groups")
        logger.debug("Making orthology associations")

        found_counter = 0
        for gid in gene_ids:
            gene_num = re.sub(r'NCBIGene:', '', gid)
            group_nums = gene_to_group.get(gene_num)
            if group_nums is None:
                # this gene is not a member of any orthology group
                continue
            for group_num in group_nums:
                orthologs = group_to_orthology.get(group_num)
                if orthologs is None:
                    continue
                for ortholog in orthologs:
                    oid = 'NCBIGene:' + str(ortholog)
                    model.addClassToGraph(
                        oid, None, Genotype.genoparts['gene'])
                    otaxid = 'NCBITaxon:' + str(gene_to_taxon[ortholog])
                    geno.addTaxon(otaxid, oid)
                    assoc = OrthologyAssoc(graph, self.name, gid, oid)
                    assoc.add_source('PMID:24063302')
                    assoc.add_association_to_graph()
                    # todo get gene label for orthologs -
                    # this could get expensive
                    found_counter += 1

        logger.info("Made %d orthology relationships for %d genes",
                    found_counter, len(gene_ids))
        return
Пример #40
0
    def _process_allele_gene(self, limit):
        """
        Make associations between an allele and a gene
        Adds triples to self.graph

        Approach is to use the label nomenclature and species
        map to determine taxon.  Foreign Transgenes are filtered out.

        The species prefix of a label is the run of word characters that
        precedes the first backslash in the label; a label without such a
        prefix is treated as Drosophila melanogaster.  Entries of the
        species map are read as ``entry[0]`` (compared with 'drosophilid')
        and ``entry[1]`` (the taxon handed to ``addTaxon``).

        :param limit: number of rows to process; compared against the csv
            reader's physical line number, so the two leading header lines
            count towards it.  None means no limit.
        :return: None
        :raises ValueError: when a label yields more than one prefix, or
            when the allele and gene prefixes disagree (including a prefixed
            gene paired with an unprefixed allele)

        """
        geno = Genotype(self.graph)
        species_map = self._species_to_ncbi_tax()
        src_key = 'allele_gene'
        raw = '/'.join((self.rawdir, self.files[src_key]['file']))
        LOG.info("processing allele to gene")

        col = self.files[src_key]['columns']

        with gzip.open(raw, 'rt') as tsvfile:
            reader = csv.reader(tsvfile, delimiter='\t')
            # skip first line, version info
            next(reader)
            row = next(reader)  # headers
            # header line starts with a hash and tab ??
            row = row[1:]

            self.check_fileheader(col, row)

            # column positions are fixed for the whole file:
            # look them up once rather than once per row
            allele_id_idx = col.index('AlleleID')
            allele_label_idx = col.index('AlleleSymbol')
            gene_id_idx = col.index('GeneID')
            gene_label_idx = col.index('GeneSymbol')

            for row in reader:
                allele_id = row[allele_id_idx]
                allele_label = row[allele_label_idx]
                gene_id = row[gene_id_idx]
                gene_label = row[gene_label_idx]

                allele_curie = 'FlyBase:' + allele_id
                gene_curie = 'FlyBase:' + gene_id

                # Add Allele and taxon, skip anything that's not drosophila
                allele_prefix = re.findall(r'^(\w*)\\', allele_label)

                if len(allele_prefix) == 1:
                    try:
                        if species_map[allele_prefix[0]][0] == 'drosophilid':
                            geno.addAllele(allele_curie, allele_label)
                            geno.addTaxon(species_map[allele_prefix[0]][1],
                                          allele_curie)
                        else:
                            # If it's a foreign transgenic allele, skip
                            continue
                    except KeyError:
                        LOG.info("%s not in species prefix file",
                                 allele_prefix[0])
                        continue

                elif not allele_prefix:
                    geno.addAllele(allele_curie, allele_label)
                    geno.addTaxon(self.globaltt['Drosophila melanogaster'],
                                  allele_curie)
                else:
                    raise ValueError(
                        "Did not correctly parse allele label {}".format(
                            allele_label))
                # Process genes
                gene_prefix = re.findall(r'^(\w*)\\', gene_label)

                if len(gene_prefix) == 1:
                    try:
                        geno.addTaxon(species_map[gene_prefix[0]][1],
                                      gene_curie)

                        if species_map[gene_prefix[0]][0] == 'drosophilid':
                            geno.addGene(gene_curie, gene_label)
                        else:
                            # Don't create labels for non drosophila genes
                            geno.addGene(gene_curie)

                    except KeyError:
                        LOG.info("%s not in species prefix file",
                                 gene_prefix[0])
                        geno.addGene(gene_curie)

                elif not gene_prefix:
                    geno.addGene(gene_curie, gene_label)
                    # the taxon belongs on the gene being processed here
                    # (the allele got its own taxon in the section above)
                    geno.addTaxon(self.globaltt['Drosophila melanogaster'],
                                  gene_curie)
                else:
                    raise ValueError(
                        "Did not correct parse gene label {}".format(
                            gene_label))

                # Connect allele and gene with geno.addAffectedLocus()
                if allele_prefix and gene_prefix:
                    if allele_prefix[0] == gene_prefix[0]:
                        geno.addAffectedLocus(allele_curie, gene_curie)
                    else:
                        raise ValueError(
                            "Found allele and gene with different "
                            "prefixes: {}, {}".format(allele_id, gene_id))
                elif not allele_prefix and gene_prefix:
                    raise ValueError("Found allele and gene with different "
                                     "prefixes: {}, {}".format(
                                         allele_id, gene_id))
                else:
                    # Both are melanogaster
                    geno.addAffectedLocus(allele_curie, gene_curie)

                if limit is not None and reader.line_num > limit:
                    break
Пример #41
0
    def _process_allele_gene(self, limit):
        """
        Make associations between an allele and a gene
        Adds triples to self.graph

        Approach is to use the label nomenclature and species
        map to determine taxon.  Foreign Transgenes are filtered out.

        The species prefix of a label is the run of word characters that
        precedes the first backslash in the label; a label without such a
        prefix is treated as Drosophila melanogaster.  Entries of the
        species map are read as ``entry[0]`` (compared with 'drosophilid')
        and ``entry[1]`` (the taxon handed to ``addTaxon``).

        :param limit: number of rows to process; compared against the csv
            reader's physical line number, so the two leading header lines
            count towards it.  None means no limit.
        :return: None
        :raises ValueError: when a label yields more than one prefix, or
            when the allele and gene prefixes disagree (including a prefixed
            gene paired with an unprefixed allele)

        """
        geno = Genotype(self.graph)
        species_map = self._species_to_ncbi_tax()
        src_key = 'allele_gene'
        raw = '/'.join((self.rawdir, self.files[src_key]['file']))
        LOG.info("processing allele to gene")

        col = self.files[src_key]['columns']

        with gzip.open(raw, 'rt') as tsvfile:
            reader = csv.reader(tsvfile, delimiter='\t')
            # skip first line, version info
            next(reader)
            row = next(reader)  # headers
            # header line starts with a hash and tab ??
            row = row[1:]

            self.check_fileheader(col, row)

            # column positions are fixed for the whole file:
            # look them up once rather than once per row
            allele_id_idx = col.index('AlleleID')
            allele_label_idx = col.index('AlleleSymbol')
            gene_id_idx = col.index('GeneID')
            gene_label_idx = col.index('GeneSymbol')

            for row in reader:
                allele_id = row[allele_id_idx]
                allele_label = row[allele_label_idx]
                gene_id = row[gene_id_idx]
                gene_label = row[gene_label_idx]

                allele_curie = 'FlyBase:' + allele_id
                gene_curie = 'FlyBase:' + gene_id

                # Add Allele and taxon, skip anything that's not drosophila
                allele_prefix = re.findall(r'^(\w*)\\', allele_label)

                if len(allele_prefix) == 1:
                    try:
                        if species_map[allele_prefix[0]][0] == 'drosophilid':
                            geno.addAllele(allele_curie, allele_label)
                            geno.addTaxon(species_map[allele_prefix[0]][1],
                                          allele_curie)
                        else:
                            # If it's a foreign transgenic allele, skip
                            continue
                    except KeyError:
                        LOG.info("%s not in species prefix file",
                                 allele_prefix[0])
                        # list of unincluded species prefixes include:
                        # Aace,Afun,Agos,Ahyp,Amil,Aobl,Apim,Apol,Aque,Asam,
                        # AspBV3L6,Avin,Baen,Bant,Bcen,Bdor,Beme,Besp,Bger,
                        # Blan,Bovi,Brsp,Bsp240B1,Bsub,Btab,Bter,Bxb1,BYV,
                        # CABYV,Cbeta,Ccaj,Cdif,Cfum,Cgri,Cint,Clsp,Cmar,Cnoc,
                        # Cpip,Cprd,Cqui,Crub,Csal,CsIV,D6,Dano,Dcaa,Dcol,
                        # Dcub,Ddun,DENV,Dflo,Dful,Dmas,Dnep,Drad,Ecab,Efae,
                        # Egra,Epos,Equa,EspSC22,Fmer,Gfas,Gint,Gmax,Gmor,
                        # Gthe,gypsy,Harm,hobo,HPV18,Hpyl,Hsod,HspTP009,Htur,
                        # Hver,Isca,jockey,Klac,Kpne,Lcup,Ldis,Lhem,Lmal,Lmon,
                        # Lser,Mani,Mbre,Mosp,Mper,Mril,NDV,Nlug,Npha,Nvec,
                        # Nvit,Oari,Obic,Osat,Paer,Pchi,PCV,Penelope,Pgur,
                        # Phum,Pime,Pmat,Pshi,Pvin,PVX,Pxyl,Rfla,Rhsp,Rpal,
                        # Rsph,Shel,Slit,Soce,Spou,Spyo,Tadh,TBSV,TCV,TEV,
                        # Tgeo,Tgon,Tmer,TNPV,TspX513,Tthe,Vcon,Vdes,Vpar,VV,
                        # WSSV,Xvas,Zbai,Zbis,ZIKV,Zrou,ZYMV
                        continue

                elif not allele_prefix:
                    geno.addAllele(allele_curie, allele_label)
                    geno.addTaxon(self.globaltt['Drosophila melanogaster'],
                                  allele_curie)
                else:
                    raise ValueError(
                        "Did not correctly parse allele label {}".format(
                            allele_label))
                # Process genes
                gene_prefix = re.findall(r'^(\w*)\\', gene_label)

                if len(gene_prefix) == 1:
                    try:
                        geno.addTaxon(species_map[gene_prefix[0]][1],
                                      gene_curie)

                        if species_map[gene_prefix[0]][0] == 'drosophilid':
                            geno.addGene(gene_curie, gene_label)
                        else:
                            # Don't create labels for non drosophila genes
                            geno.addGene(gene_curie)

                    except KeyError:
                        LOG.info("%s not in species prefix file",
                                 gene_prefix[0])
                        geno.addGene(gene_curie)

                elif not gene_prefix:
                    geno.addGene(gene_curie, gene_label)
                    # the taxon belongs on the gene being processed here
                    # (the allele got its own taxon in the section above)
                    geno.addTaxon(self.globaltt['Drosophila melanogaster'],
                                  gene_curie)
                else:
                    raise ValueError(
                        "Did not correct parse gene label {}".format(
                            gene_label))

                # Connect allele and gene with geno.addAffectedLocus()
                if allele_prefix and gene_prefix:
                    if allele_prefix[0] == gene_prefix[0]:
                        geno.addAffectedLocus(allele_curie, gene_curie)
                    else:
                        raise ValueError(
                            "Found allele and gene with different "
                            "prefixes: {}, {}".format(allele_id, gene_id))
                elif not allele_prefix and gene_prefix:
                    raise ValueError("Found allele and gene with different "
                                     "prefixes: {}, {}".format(
                                         allele_id, gene_id))
                else:
                    # Both are melanogaster
                    geno.addAffectedLocus(allele_curie, gene_curie)

                if limit is not None and reader.line_num > limit:
                    break
Пример #42
0
    def _process_genes(self, taxid, limit=None):
        """
        Add the Ensembl genes of one taxon, with their cross references
        and gene products, to the graph.

        Each row of the tab-delimited file ``self.files[taxid]['file']``
        supplies, in order: ensembl gene id, gene name, description,
        gene biotype, entrez gene id, ensembl peptide id and
        uniprot/swissprot id; for human (taxid '9606') an eighth column
        holds the HGNC id.

        In test mode the test graph is written to, and rows whose entrez id
        is present but not in ``self.gene_ids`` are skipped.

        :param taxid: NCBI taxon number as a string (no prefix);
            also the key into ``self.files``
        :param limit: outside of test mode, stop once the number of
            processed rows exceeds this value; None means no limit
        :return: None
        :raises ValueError: when a row has fewer columns than required

        """
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph

        model = Model(graph)
        geno = Genotype(graph)

        raw = '/'.join((self.rawdir, self.files[taxid]['file']))
        line_counter = 0
        LOG.info("Processing Ensembl genes for tax %s", taxid)

        # human rows carry the HGNC id as an extra, final column
        is_human = taxid == '9606'
        min_columns = 8 if is_human else 7

        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t')
            for row in filereader:
                if len(row) < min_columns:
                    # row is a list: let logging format it rather than
                    # concatenating it onto a str (which raises TypeError)
                    LOG.warning("Too few columns in: %s", row)
                    raise ValueError(
                        "Data error for file {}".format(raw))
                (ensembl_gene_id, external_gene_name, description, gene_biotype,
                 entrezgene, ensembl_peptide_id, uniprotswissprot) = row[0:7]

                # in the case of human genes, we also get the hgnc id,
                # and is the last col
                if is_human:
                    hgnc_id = row[7]
                else:
                    hgnc_id = None

                if self.test_mode and entrezgene != '' and \
                        int(entrezgene) not in self.gene_ids:
                    continue

                line_counter += 1
                gene_id = 'ENSEMBL:' + ensembl_gene_id
                peptide_curie = 'ENSEMBL:{}'.format(ensembl_peptide_id)
                uniprot_curie = 'UniProtKB:{}'.format(uniprotswissprot)
                entrez_curie = 'NCBIGene:{}'.format(entrezgene)

                if description == '':
                    description = None
                gene_biotype = gene_biotype.strip()
                gene_type_id = self.resolve(gene_biotype, False)
                if gene_type_id == gene_biotype:   # did not resolve
                    gene_type_id = self.globaltt['polypeptide']

                model.addClassToGraph(
                    gene_id, external_gene_name, gene_type_id, description)
                # NOTE(review): these two individuals are added even when the
                # peptide / uniprot id is the empty string, yielding the bare
                # curies 'ENSEMBL:' / 'UniProtKB:' -- confirm this is intended
                model.addIndividualToGraph(peptide_curie, None, gene_type_id)
                model.addIndividualToGraph(uniprot_curie, None, gene_type_id)

                if entrezgene != '':
                    if is_human:
                        # Use HGNC for eq in human data
                        model.addXref(gene_id, entrez_curie)
                    else:
                        model.addEquivalentClass(gene_id, entrez_curie)
                if hgnc_id is not None and hgnc_id != '':
                    model.addEquivalentClass(gene_id, hgnc_id)
                geno.addTaxon('NCBITaxon:'+taxid, gene_id)
                if ensembl_peptide_id != '':
                    geno.addGeneProduct(gene_id, peptide_curie)
                    if uniprotswissprot != '':
                        geno.addGeneProduct(gene_id, uniprot_curie)
                        model.addXref(peptide_curie, uniprot_curie)

                if not self.test_mode and limit is not None and line_counter > limit:
                    break

        return
Пример #43
0
    def _process_genes(self, limit=None):
        """
        Read the HGNC complete-set TSV and add every gene row to the graph.

        For each data row the gene is added as a class (only when its locus
        type resolves to something other than itself), flagged as deprecated
        or as the leader, made equivalent to its NCBIGene / ENSEMBL ids and
        to its OMIM id when that id is not an OMIM disease, tagged with the
        human taxon, linked to its PubMed ids, and placed on a chromosome or
        chromosome band parsed from the location column.

        :param limit: when not in test mode, stop once the number of file
            lines read (header included) exceeds this value;
            None reads the whole file
        :return: None
        """
        graph = self.testgraph if self.testMode else self.graph

        geno = Genotype(graph)
        model = Model(graph)
        raw = '/'.join((self.rawdir, self.files['genes']['file']))
        logger.info("Processing HGNC genes")

        # chr location formats seen in the file:
        # sometimes two are listed, like: 10p11.2 or 17q25
        # -- there are only 2 of these FRA10A and MPFD
        # sometimes listed like "1 not on reference assembly"
        # sometimes listed like 10q24.1-q24.3
        # sometimes like 11q11 alternate reference locus
        # NOTE(review): inside a character class '$' is a literal dollar sign,
        # not an end-of-string anchor, so a chromosome that is not followed
        # by p or q never matches -- confirm whether that is intended
        chr_regex = re.compile(r'(\d+|X|Y|Z|W|MT)[pq$]')
        band_regex = re.compile(r'([pq][A-H\d]?\d?(?:\.\d+)?)')

        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            # the column order can be listed with:
            # curl -s ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt | head -1 | tr '\t' '\n' | grep -n  .
            for line_counter, row in enumerate(filereader, start=1):
                # unpacking doubles as a check that the row has
                # exactly the expected number of columns
                (hgnc_id, symbol, name, locus_group, locus_type, status,
                 location, location_sortable, alias_symbol, alias_name,
                 prev_symbol, prev_name, gene_family, gene_family_id,
                 date_approved_reserved, date_symbol_changed,
                 date_name_changed, date_modified, entrez_id, ensembl_gene_id,
                 vega_id, ucsc_id, ena, refseq_accession, ccds_id, uniprot_ids,
                 pubmed_id, mgd_id, rgd_id, lsdb, cosmic, omim_id, mirbase,
                 homeodb, snornabase, bioparadigms_slc, orphanet,
                 pseudogene_org, horde_id, merops, imgt, iuphar,
                 kznf_gene_catalog, mamit_trnadb, cd, lncrnadb, enzyme_id,
                 intermediate_filament_db, rna_central_ids) = row

                # the first line of the file is the header
                if line_counter == 1:
                    continue

                # in test mode keep only the genes of interest;
                # rows without an entrez id are always kept
                if self.testMode and entrez_id != '':
                    if int(entrez_id) not in self.gene_ids:
                        continue

                description = None if name == '' else name

                # withdrawn -> None?
                gene_type_id = self.resolve(locus_type, False)
                if gene_type_id != locus_type:
                    model.addClassToGraph(
                        hgnc_id, symbol, gene_type_id, description)

                if locus_type != 'withdrawn':
                    model.makeLeader(hgnc_id)
                else:
                    model.addDeprecatedClass(hgnc_id)

                for prefix, local_id in (
                        ('NCBIGene:', entrez_id),
                        ('ENSEMBL:', ensembl_gene_id)):
                    if local_id != '':
                        model.addEquivalentClass(hgnc_id, prefix + local_id)

                # only single-valued OMIM ids that are not diseases
                if omim_id != '' and "|" not in omim_id:
                    omim_curie = 'OMIM:' + omim_id
                    is_disease = DipperUtil.is_omim_disease(omim_curie)
                    if not is_disease:
                        model.addEquivalentClass(hgnc_id, omim_curie)

                geno.addTaxon(self.hs_txid, hgnc_id)

                # add pubs as "is about";
                # an empty column splits into one empty piece, which is skipped
                for pmid in re.split(r'\|', pubmed_id.strip()):
                    if pmid == '':
                        continue
                    graph.addTriple(
                        'PMID:' + pmid.strip(),
                        self.globaltt['is_about'], hgnc_id)

                # add chr location
                chr_match = chr_regex.match(location)
                if chr_match is not None:
                    chrom = chr_match.group(1)
                    chrom_id = makeChromID(chrom, self.hs_txid, 'CHR')
                    band_match = band_regex.search(location)
                    feat = Feature(graph, hgnc_id, None, None)
                    if band_match is None:
                        # no band found: fall back to the whole chromosome
                        parent_id = chrom_id
                    else:
                        # add the chr band as the parent to this gene
                        # as a feature but assume that the band is created
                        # as a class with properties elsewhere in Monochrom
                        parent_id = makeChromID(
                            chrom + band_match.group(1), self.hs_txid, 'CHR')
                    model.addClassToGraph(parent_id, None)
                    feat.addSubsequenceOfFeature(parent_id)

                if self.testMode or limit is None:
                    continue
                if line_counter > limit:
                    break
Пример #44
0
    def _get_gene_info(self, limit):
        """
        Loop through the gzipped gene_info file and add each gene to the graph.

        Genes are typed with ``self.map_type_of_gene`` and added as classes,
        except those typed SO:0000110 (sequence feature, not a gene),
        which are added as individuals.  That choice is recorded in
        ``self.class_or_indiv`` for every row that passes the filters,
        including rows past ``limit``.
        The full name, the synonyms and the other designations are added as
        synonyms, and xrefs are handed to ``self._add_gene_equivalencies``.
        A gene on a single chromosome (or on human X|Y) is made a subsequence
        of its chromosome band when map_loc looks like a regular band,
        otherwise of the chromosome itself.

        :param limit: when not in test mode, rows past this count are only
            recorded in ``self.class_or_indiv`` and not added to the graph;
            None means no limit
        :return: None

        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        geno = Genotype(g)
        model = Model(g)

        # not unzipping the file
        logger.info("Processing 'Gene Info' records")
        line_counter = 0
        gene_info = '/'.join((self.rawdir, self.files['gene_info']['file']))
        logger.info("FILE: %s", gene_info)
        # Add taxa and genome classes for those in our filter
        for tax_num in self.tax_ids:
            tax_id = ':'.join(('NCBITaxon', str(tax_num)))
            # tax label can get added elsewhere
            geno.addGenome(tax_id, str(tax_num))
            # label added elsewhere
            model.addClassToGraph(tax_id, None)
        with gzip.open(gene_info, 'rb') as f:
            # the first line is consumed as the header and only counted
            row = f.readline().decode().strip().split('\t')
            logger.info("Header has %i columns", len(row))
            for line in f:
                # skip comments
                line = line.decode().strip()
                if re.match(r'^#', line):
                    continue
                # unpacking raises ValueError unless there are 16 columns
                (tax_num, gene_num, symbol, locustag, synonyms, xrefs, chrom,
                 map_loc, desc, gtype, authority_symbol, name,
                 nomenclature_status, other_designations,
                 modification_date, feature_type) = line.split('\t')

                # ##set filter=None in init if you don't want to have a filter
                # if self.filter is not None:
                #     if ((self.filter == 'taxids' and \
                #          (int(tax_num) not in self.tax_ids))
                #           or (self.filter == 'geneids' and \
                #               (int(gene_num) not in self.gene_ids))):
                #         continue
                # #### end filter

                if self.testMode and int(gene_num) not in self.gene_ids:
                    continue

                if not self.testMode and int(tax_num) not in self.tax_ids:
                    continue

                line_counter += 1

                gene_id = ':'.join(('NCBIGene', gene_num))
                tax_id = ':'.join(('NCBITaxon', tax_num))
                gene_type_id = self.map_type_of_gene(gtype.strip())

                if symbol == 'NEWENTRY':
                    label = None
                else:
                    label = symbol
                # sequence feature, not a gene
                if gene_type_id == 'SO:0000110':
                    self.class_or_indiv[gene_id] = 'I'
                else:
                    self.class_or_indiv[gene_id] = 'C'

                # past the limit we keep reading (continue, not break) so that
                # class_or_indiv above is still filled in for every gene
                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    continue

                if self.class_or_indiv[gene_id] == 'C':
                    model.addClassToGraph(gene_id, label, gene_type_id, desc)
                    # NCBI will be the default leader,
                    # so we will not add the leader designation here.
                else:
                    model.addIndividualToGraph(
                        gene_id, label, gene_type_id, desc)
                    # in this case, they aren't genes.
                    # so we want someone else to be the leader.

                if name != '-':
                    model.addSynonym(gene_id, name)
                if synonyms.strip() != '-':
                    for s in synonyms.split('|'):
                        model.addSynonym(
                            gene_id, s.strip(),
                            Assoc.annotation_properties['hasRelatedSynonym'])
                if other_designations.strip() != '-':
                    for s in other_designations.split('|'):
                        model.addSynonym(
                            gene_id, s.strip(),
                            Assoc.annotation_properties['hasRelatedSynonym'])
                if xrefs.strip() != '-':
                    self._add_gene_equivalencies(xrefs, gene_id, tax_num)

                # edge cases of id | symbol | chr | map_loc:
                # 263     AMD1P2    X|Y  with   Xq28 and Yq12
                # 438     ASMT      X|Y  with   Xp22.3 or Yp11.3    # in PAR
                # no idea why there's two bands listed - possibly 2 assemblies
                # 419     ART3      4    with   4q21.1|4p15.1-p14
                # 28227   PPP2R3B   X|Y  Xp22.33; Yp11.3            # in PAR
                # this is of "unknown" type == susceptibility
                # 619538  OMS     10|19|3 10q26.3;19q13.42-q13.43;3p25.3
                # unlocated scaffold
                # 101928066       LOC101928066    1|Un    -\
                # mouse --> 2C3
                # 11435   Chrna1  2       2 C3|2 43.76 cM
                # mouse --> 11B1.1
                # 11548   Adra1b  11      11 B1.1|11 25.81 cM
                # 11717   Ampd3   7       7 57.85 cM|7 E2-E3        # mouse
                # 14421   B4galnt1        10      10 D3|10 74.5 cM  # mouse
                # 323212  wu:fb92e12      19|20   -                 # fish
                # 323368  ints10  6|18    -                         # fish
                # 323666  wu:fc06e02      11|23   -                 # fish

                # feel that the chr placement can't be trusted in this table
                # when there is > 1 listed
                # with the exception of human X|Y,
                # we will only take those that align to one chr

                # FIXME remove the chr mapping below
                # when we pull in the genomic coords
                if str(chrom) != '-' and str(chrom) != '':
                    if re.search(r'\|', str(chrom)) and \
                            str(chrom) not in ['X|Y', 'X; Y']:
                        # means that there's uncertainty in the mapping.
                        # so skip it
                        # TODO we'll need to figure out how to deal with
                        # >1 loc mapping
                        # NOTE(review): this continue also skips the
                        # geno.addTaxon call at the bottom of the loop
                        # -- confirm that is wanted
                        # log the chrom column itself (this used to pass
                        # the builtin function ``chr`` by mistake)
                        logger.info(
                            '%s is non-uniquely mapped to %s.' +
                            ' Skipping for now.',
                            gene_id, str(chrom))
                        continue
                        # X|Y	Xp22.33;Yp11.3

                    # if(not re.match(
                    #        r'(\d+|(MT)|[XY]|(Un)$',str(chr).strip())):
                    #    print('odd chr=',str(chr))
                    if str(chrom) == 'X; Y':
                        chrom = 'X|Y'  # rewrite the PAR regions for processing
                    # do this in a loop to allow PAR regions like X|Y
                    for c in re.split(r'\|', str(chrom)):
                        # assume that the chromosome label is added elsewhere
                        geno.addChromosomeClass(c, tax_id, None)
                        mychrom = makeChromID(c, tax_num, 'CHR')
                        # temporarily use taxnum for the disambiguating label
                        mychrom_syn = makeChromLabel(c, tax_num)
                        model.addSynonym(mychrom, mychrom_syn)
                        band_match = re.match(
                            r'[0-9A-Z]+[pq](\d+)?(\.\d+)?$', map_loc)
                        if band_match is not None and \
                                len(band_match.groups()) > 0:
                            # if tax_num != '9606':
                            #     continue
                            # this matches the regular kind of chrs,
                            # so make that kind of band
                            # not sure why this matches?
                            #   chrX|Y or 10090chr12|Un"
                            # TODO we probably need a different regex
                            # per organism
                            # the maploc_id already has the numeric chromosome
                            # in it, strip it first
                            # (the label is escaped so that it is always
                            # matched literally, never as a regex)
                            bid = re.sub(r'^' + re.escape(c), '', map_loc)
                            # the generic location (no coordinates)
                            maploc_id = makeChromID(c+bid, tax_num, 'CHR')
                            # print(map_loc,'-->',bid,'-->',maploc_id)
                            # Assume it's type will be added elsewhere
                            band = Feature(g, maploc_id, None, None)
                            band.addFeatureToGraph()
                            # add the band as the containing feature
                            g.addTriple(
                                gene_id,
                                Feature.object_properties['is_subsequence_of'],
                                maploc_id)
                        else:
                            # TODO handle these cases: examples are:
                            # 15q11-q22,Xp21.2-p11.23,15q22-qter,10q11.1-q24,
                            # 12p13.3-p13.2|12p13-p12,1p13.3|1p21.3-p13.1,
                            # 12cen-q21,22q13.3|22q13.3
                            logger.debug(
                                'not regular band pattern for %s: %s',
                                gene_id, map_loc)
                            # add the gene as a subsequence of the chromosome
                            g.addTriple(
                                gene_id,
                                Feature.object_properties['is_subsequence_of'],
                                mychrom)

                geno.addTaxon(tax_id, gene_id)

        return
Пример #45
0
    def _process_data(self, raw, limit=None):
        """
        Turn each row of the gzipped, comma-separated file ``raw`` into
        genotype and genotype-to-phenotype triples.

        Per row this adds the marker gene and allele (or just a sequence
        alteration when the marker is blank), the colony as an embryonic
        stem cell line with an indeterminate-zygosity genotype,
        the sex-agnostic and sex-qualified genotypes with their VSLC and
        genomic background, and finally a G2P association with its
        evidence and provenance.
        Rows without a phenotype id get everything up to the association.

        :param raw: path to the gzip-compressed CSV file to read
        :param limit: when not in test mode, stop once the reader's line
            number exceeds this value; None reads the whole file
        :return: None
        """
        LOG.info("Processing Data from %s", raw)

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        geno = Genotype(graph)

        # Add the taxon as a class
        taxon_id = self.globaltt['Mus musculus']
        model.addClassToGraph(taxon_id, None)

        # with open(raw, 'r', encoding="utf8") as csvfile:
        col = self.files['all']['columns']
        with gzip.open(raw, 'rt') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            row = next(reader)  # presumed header
            # a header mismatch is deliberately not fatal here
            # (presumably check_fileheader reports it itself -- TODO confirm)
            if not self.check_fileheader(col, row):
                pass

            for row in reader:
                # | head -1 | tr ',' '\n' | sed "s|\(.*\)|# \1 = row[col.index('\1')]|g"
                marker_accession_id = row[col.index('marker_accession_id')].strip()
                marker_symbol = row[col.index('marker_symbol')].strip()
                phenotyping_center = row[col.index('phenotyping_center')].strip()
                colony_raw = row[col.index('colony_id')].strip()
                sex = row[col.index('sex')].strip()
                zygosity = row[col.index('zygosity')].strip()
                allele_accession_id = row[col.index('allele_accession_id')].strip()
                allele_symbol = row[col.index('allele_symbol')].strip()
                # allele_name = row[col.index('allele_name')]
                strain_accession_id = row[col.index('strain_accession_id')].strip()
                strain_name = row[col.index('strain_name')].strip()
                # project_name = row[col.index('project_name')]
                project_fullname = row[col.index('project_fullname')].strip()
                pipeline_name = row[col.index('pipeline_name')].strip()
                pipeline_stable_id = row[col.index('pipeline_stable_id')].strip()
                procedure_stable_id = row[col.index('procedure_stable_id')].strip()
                procedure_name = row[col.index('procedure_name')].strip()
                parameter_stable_id = row[col.index('parameter_stable_id')].strip()
                parameter_name = row[col.index('parameter_name')].strip()
                # top_level_mp_term_id = row[col.index('top_level_mp_term_id')]
                # top_level_mp_term_name = row[col.index('top_level_mp_term_name')]
                mp_term_id = row[col.index('mp_term_id')].strip()
                mp_term_name = row[col.index('mp_term_name')].strip()
                p_value = row[col.index('p_value')].strip()
                percentage_change = row[col.index('percentage_change')].strip()
                effect_size = row[col.index('effect_size')].strip()
                statistical_method = row[col.index('statistical_method')].strip()
                resource_name = row[col.index('resource_name')].strip()

                if self.test_mode and marker_accession_id not in self.gene_ids:
                    continue

                # ##### cleanup some of the identifiers ######
                # an unchanged return value is taken to mean "not mapped"
                zygosity_id = self.resolve(zygosity)
                if zygosity_id == zygosity:
                    LOG.warning(
                        "Zygosity '%s' unmapped. detting to indeterminate", zygosity)
                    zygosity_id = self.globaltt['indeterminate']

                # colony ids sometimes have <> in them, spaces,
                # or other non-alphanumerics and break our system;
                # replace these with underscores
                colony_id = '_:' + re.sub(r'\W+', '_', colony_raw)

                if not re.match(r'MGI', allele_accession_id):
                    allele_accession_id = '_:IMPC-'+re.sub(
                        r':', '', allele_accession_id)

                if re.search(r'EUROCURATE', strain_accession_id):
                    # the eurocurate links don't resolve at IMPC
                    # TODO blank nodes do not maintain identifiers
                    strain_accession_id = '_:' + strain_accession_id

                elif not re.match(r'MGI', strain_accession_id):
                    LOG.info(
                        "Found a strange strain accession...%s", strain_accession_id)
                    strain_accession_id = 'IMPC:'+strain_accession_id

                ######################
                # first, add the marker and variant to the graph as with MGI,
                # the allele is the variant locus.  IF the marker is not known,
                # we will call it a sequence alteration.  otherwise,
                # we will create a BNode for the sequence alteration.
                sequence_alteration_id = variant_locus_id = None
                variant_locus_name = sequence_alteration_name = None

                # extract out what's within the <> to get the symbol;
                # without a <...> part the whole allele symbol is used
                symbol_match = re.match(r'.*<(.*)>', allele_symbol)
                if symbol_match is not None:
                    sequence_alteration_name = symbol_match.group(1)
                else:
                    sequence_alteration_name = allele_symbol

                if marker_accession_id == '':
                    LOG.warning("Marker unspecified on row %d", reader.line_num)
                    marker_accession_id = None

                if marker_accession_id is not None:
                    variant_locus_id = allele_accession_id
                    variant_locus_name = allele_symbol
                    variant_locus_type = self.globaltt['variant_locus']
                    geno.addGene(
                        marker_accession_id, marker_symbol, self.globaltt['gene'])

                    geno.addAllele(
                        variant_locus_id, variant_locus_name, variant_locus_type, None)
                    geno.addAlleleOfGene(variant_locus_id, marker_accession_id)

                    # TAG bnode
                    sequence_alteration_id = '_:seqalt' + re.sub(
                        r':', '', allele_accession_id)
                    geno.addSequenceAlterationToVariantLocus(
                        sequence_alteration_id, variant_locus_id)

                else:
                    sequence_alteration_id = allele_accession_id

                # IMPC contains targeted mutations with either gene traps,
                # knockouts, insertion/intragenic deletions.
                # but I don't really know what the SeqAlt is here,
                # so I don't add it.
                geno.addSequenceAlteration(
                    sequence_alteration_id, sequence_alteration_name)

                # #############    BUILD THE COLONY    #############
                # First, let's describe the colony that the animals come from
                # The Colony ID refers to the ES cell clone
                #   used to generate a mouse strain.
                # Terry sez: we use this clone ID to track
                #   ES cell -> mouse strain -> mouse phenotyping.
                # The same ES clone maybe used at multiple centers,
                # so we have to concatenate the two to have a unique ID.
                # some useful reading about generating mice from ES cells:
                # http://ki.mit.edu/sbc/escell/services/details

                # here, we'll make a genotype
                # that derives from an ES cell with a given allele.
                # the strain is not really attached to the colony.

                # the colony/clone is reflective of the allele,  with unknown zygosity

                stem_cell_class = self.globaltt['embryonic stem cell line']
                model.addIndividualToGraph(colony_id, colony_raw, stem_cell_class)

                # vslc of the colony has unknown zygosity
                # note that we will define the allele
                # (and it's relationship to the marker, etc.) later
                # FIXME is it really necessary to create this vslc
                # when we always know it's unknown zygosity?
                vslc_colony = '_:'+re.sub(
                    r':', '', allele_accession_id + self.globaltt['indeterminate'])
                vslc_colony_label = allele_symbol + '/<?>'
                # for ease of reading, we make the colony genotype variables.
                # in the future, it might be desired to keep the vslcs
                colony_genotype_id = vslc_colony
                colony_genotype_label = vslc_colony_label
                geno.addGenotype(colony_genotype_id, colony_genotype_label)
                geno.addParts(
                    allele_accession_id, colony_genotype_id,
                    self.globaltt['has_variant_part'])

                geno.addPartsToVSLC(
                    vslc_colony, allele_accession_id, None,
                    self.globaltt['indeterminate'], self.globaltt['has_variant_part'])
                graph.addTriple(
                    colony_id, self.globaltt['has_genotype'], colony_genotype_id)

                # ##########    BUILD THE ANNOTATED GENOTYPE    ##########
                # now, we'll build the genotype of the individual that derives
                # from the colony/clone genotype that is attached to
                # phenotype = colony_id + strain + zygosity + sex
                # (and is derived from a colony)

                # this is a sex-agnostic genotype
                genotype_id = self.make_id(
                    (colony_id + phenotyping_center + zygosity + strain_accession_id))
                geno.addSequenceDerivesFrom(genotype_id, colony_id)

                # build the VSLC of the sex-agnostic genotype
                # based on the zygosity
                allele1_id = allele_accession_id
                allele2_id = allele2_rel = None
                allele1_label = allele_symbol
                allele2_label = '<?>'
                # Making VSLC labels from the various parts,
                # can change later if desired.
                if zygosity == 'heterozygote':
                    allele2_label = re.sub(r'<.*', '<+>', allele1_label)
                    allele2_id = None
                elif zygosity == 'homozygote':
                    allele2_label = allele1_label
                    allele2_id = allele1_id
                    allele2_rel = self.globaltt['has_variant_part']
                elif zygosity == 'hemizygote':
                    allele2_label = re.sub(r'<.*', '<0>', allele1_label)
                    allele2_id = None
                elif zygosity == 'not_applicable':
                    allele2_label = re.sub(r'<.*', '<?>', allele1_label)
                    allele2_id = None
                else:
                    # NOTE(review): break abandons every remaining row of the
                    # file, not just this one -- confirm this is intended
                    LOG.warning("found unknown zygosity %s", zygosity)
                    break
                vslc_name = '/'.join((allele1_label, allele2_label))

                # Add the VSLC
                # the marker is None when the row had none; join only the
                # parts we have, since str.join raises TypeError on None
                vslc_id = '-'.join(
                    part for part in
                    (marker_accession_id, allele_accession_id, zygosity)
                    if part is not None)
                vslc_id = re.sub(r':', '', vslc_id)
                vslc_id = '_:'+vslc_id
                model.addIndividualToGraph(
                    vslc_id, vslc_name,
                    self.globaltt['variant single locus complement'])
                geno.addPartsToVSLC(
                    vslc_id, allele1_id, allele2_id, zygosity_id,
                    self.globaltt['has_variant_part'], allele2_rel)

                # add vslc to genotype
                geno.addVSLCtoParent(vslc_id, genotype_id)

                # note that the vslc is also the gvc
                model.addType(vslc_id, self.globaltt['genomic_variation_complement'])

                # Add the genomic background
                # create the genomic background id and name
                if strain_accession_id != '':
                    genomic_background_id = strain_accession_id
                else:
                    genomic_background_id = None

                genotype_name = vslc_name
                if genomic_background_id is not None:
                    geno.addGenotype(
                        genomic_background_id, strain_name,
                        self.globaltt['genomic_background'])

                    # make a phenotyping-center-specific strain
                    # to use as the background
                    pheno_center_strain_label = strain_name + '-' + phenotyping_center \
                        + '-' + colony_raw
                    pheno_center_strain_id = '-'.join((
                        re.sub(r':', '', genomic_background_id),
                        re.sub(r'\s', '_', phenotyping_center),
                        re.sub(r'\W+', '', colony_raw)))
                    if not re.match(r'^_', pheno_center_strain_id):
                        # Tag bnode
                        pheno_center_strain_id = '_:' + pheno_center_strain_id

                    geno.addGenotype(
                        pheno_center_strain_id, pheno_center_strain_label,
                        self.globaltt['genomic_background'])
                    geno.addSequenceDerivesFrom(
                        pheno_center_strain_id, genomic_background_id)

                    # Making genotype labels from the various parts,
                    # can change later if desired.
                    # since the genotype is reflective of the place
                    # it got made, should put that in to disambiguate
                    genotype_name = \
                        genotype_name + ' [' + pheno_center_strain_label + ']'
                    geno.addGenomicBackgroundToGenotype(
                        pheno_center_strain_id, genotype_id)
                    geno.addTaxon(taxon_id, pheno_center_strain_id)
                # this is redundant, but i'll keep in in for now
                geno.addSequenceDerivesFrom(genotype_id, colony_id)
                geno.addGenotype(genotype_id, genotype_name)

                # Make the sex-qualified genotype,
                # which is what the phenotype is associated with
                sex_qualified_genotype_id = \
                    self.make_id((
                        colony_id + phenotyping_center + zygosity +
                        strain_accession_id + sex))
                sex_qualified_genotype_label = genotype_name + ' (' + sex + ')'

                sq_type_id = self.resolve(sex, False)

                if sq_type_id == sex:
                    sq_type_id = self.globaltt['intrinsic_genotype']
                    LOG.warning(
                        "Unknown sex qualifier %s, adding as intrinsic_genotype",
                        sex)

                geno.addGenotype(
                    sex_qualified_genotype_id, sex_qualified_genotype_label, sq_type_id)
                geno.addParts(
                    genotype_id, sex_qualified_genotype_id,
                    self.globaltt['has_variant_part'])

                if genomic_background_id is not None and genomic_background_id != '':
                    # Add the taxon to the genomic_background_id
                    geno.addTaxon(taxon_id, genomic_background_id)
                else:
                    # add it as the genomic background
                    geno.addTaxon(taxon_id, genotype_id)

                # #############    BUILD THE G2P ASSOC    #############
                # from an old email dated July 23 2014:
                # Phenotypes associations are made to
                # imits colony_id+center+zygosity+gender

                # sometimes phenotype ids are missing.  (about 711 early 2020)
                if mp_term_id is None or mp_term_id == '':
                    LOG.warning(
                        "No phenotype id specified for row %d", reader.line_num)
                    continue
                # hard coded ECO code
                eco_id = self.globaltt['mutant phenotype evidence']

                # the association comes as a result of a g2p from
                # a procedure in a pipeline at a center and parameter tested

                assoc = G2PAssoc(
                    graph, self.name, sex_qualified_genotype_id, mp_term_id)
                assoc.add_evidence(eco_id)
                # assoc.set_score(float(p_value))

                # TODO add evidence instance using
                # pipeline_stable_id +
                # procedure_stable_id +
                # parameter_stable_id

                assoc.add_association_to_graph()
                assoc_id = assoc.get_association_id()

                model._addSexSpecificity(assoc_id, self.resolve(sex))

                # add a free-text description;
                # fall back to the raw strings when the effect size
                # or p-value is not a parsable number
                try:
                    description = ' '.join((
                        mp_term_name, 'phenotype determined by', phenotyping_center,
                        'in an', procedure_name, 'assay where', parameter_name.strip(),
                        'was measured with an effect_size of',
                        str(round(float(effect_size), 5)),
                        '(p =', "{:.4e}".format(float(p_value)), ').'))
                except ValueError:
                    description = ' '.join((
                        mp_term_name, 'phenotype determined by', phenotyping_center,
                        'in an', procedure_name, 'assay where', parameter_name.strip(),
                        'was measured with an effect_size of', str(effect_size),
                        '(p =', "{0}".format(p_value), ').'))

                study_bnode = self._add_study_provenance(
                    phenotyping_center, colony_raw, project_fullname, pipeline_name,
                    pipeline_stable_id, procedure_stable_id, procedure_name,
                    parameter_stable_id, parameter_name, statistical_method,
                    resource_name)

                evidence_line_bnode = self._add_evidence(
                    assoc_id, eco_id, p_value, percentage_change, effect_size,
                    study_bnode)

                self._add_assertion_provenance(assoc_id, evidence_line_bnode)

                model.addDescription(evidence_line_bnode, description)

                # resource_id = resource_name
                # assoc.addSource(graph, assoc_id, resource_id)

                if not self.test_mode and limit is not None and reader.line_num > limit:
                    break
Пример #46
0
    def add_orthologs_by_gene_group(self, graph, gene_ids):
        """
        Add orthology associations for the supplied genes, based on the
        NCBI gene_group file.

        The orthologies come from the gene_group annotation pipeline from
        NCBI and cover human versus other vertebrate genomes.
        More information can be learned here:
        http://www.ncbi.nlm.nih.gov/news/03-13-2014-gene-provides-orthologs-regions/
        The method for associations is described in
        [PMCID:3882889](http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3882889/)
        == [PMID:24063302](http://www.ncbi.nlm.nih.gov/pubmed/24063302/).
        Because only human-to-vertebrate pairs are covered, very distant
        orthologies will be missed and the result should not be
        considered complete.

        This is not run within the NCBI parser itself;
        it is a convenience function for other parsers to call.

        Only rows whose relationship column is 'Ortholog' are used.
        Each group is keyed by the first gene of the row, and that gene is
        itself counted as a member of its group.

        :param graph: graph the classes, taxa and associations are added to
        :param gene_ids: sized collection of gene curies
            ('NCBIGene:<number>') to fetch the orthology for
        :return: None

        """

        logger.info("getting gene groups")
        group_file = '/'.join((self.rawdir, self.files['gene_group']['file']))
        made_count = 0

        # Many of the orthologous groups are keyed by the human gene,
        # so a two-way hash is built:
        #   group id    => member genes
        #   member gene => groups it belongs to
        # Fast to query, though not memory-efficient.
        geno = Genotype(graph)
        model = Model(graph)
        group_to_orthology = {}
        gene_to_group = {}
        gene_to_taxon = {}

        with gzip.open(group_file, 'rb') as gz_handle:
            rows = csv.reader(
                io.TextIOWrapper(gz_handle, newline=""),
                delimiter='\t',
                quotechar='"')

            for row in rows:
                # comment lines begin with '#'
                if ''.join(row).startswith('#'):
                    continue
                (tax_a, gene_a, rel, tax_b, gene_b) = row

                if rel != 'Ortholog':
                    continue

                members = group_to_orthology.setdefault(gene_a, set())
                members.add(gene_b)

                gene_to_group.setdefault(gene_b, set()).add(gene_a)

                gene_to_taxon[gene_a] = tax_a
                gene_to_taxon[gene_b] = tax_b

                # the group lead is a member of its own group too
                members.add(gene_a)

        logger.debug("Finished hashing gene groups")
        logger.debug("Making orthology associations")
        for gid in gene_ids:
            gene_num = re.sub(r'NCBIGene:', '', gid)
            group_nums = gene_to_group.get(gene_num)
            if group_nums is None:
                continue
            for group_num in group_nums:
                orthologs = group_to_orthology.get(group_num)
                if orthologs is None:
                    continue
                for ortholog in orthologs:
                    ortholog_curie = 'NCBIGene:' + str(ortholog)
                    model.addClassToGraph(
                        ortholog_curie, None, Genotype.genoparts['gene'])
                    taxon_curie = 'NCBITaxon:' + str(gene_to_taxon[ortholog])
                    geno.addTaxon(taxon_curie, ortholog_curie)
                    assoc = OrthologyAssoc(
                        graph, self.name, gid, ortholog_curie)
                    assoc.add_source('PMID:24063302')
                    assoc.add_association_to_graph()
                    # todo get gene label for orthologs -
                    # this could get expensive
                    made_count += 1

        logger.info(
            "Made %d orthology relationships for %d genes",
            made_count, len(gene_ids))
        return
Пример #47
0
    def _process_haplotype(
            self, hap_id, hap_label, chrom_num, chrom_pos, context,
            risk_allele_frequency, mapped_gene, so_ontology):
        """
        Add a haplotype and its constituent SNPs to the graph.

        The label, chromosome, position, context and mapped-gene fields are
        each split on ';' into per-SNP lists.  The haplotype is always added
        and linked to every SNP with 'has_variant_part'.  SNP details are
        added only when the chromosome, position and context lists have one
        entry per SNP; gene links are added only when the mapped-gene list
        does too.

        :param hap_id: identifier of the haplotype node
        :param hap_label: ';'-separated SNP labels; stripped, it is also
            used as the haplotype label
        :param chrom_num: ';'-separated chromosome values, one per SNP
        :param chrom_pos: ';'-separated positions, one per SNP
        :param context: ';'-separated variant contexts, one per SNP
        :param risk_allele_frequency: recorded in the haplotype description
            unless it is '' or 'NR'
        :param mapped_gene: ';'-separated gene symbols
        :param so_ontology: object with a ``query`` method that accepts a
            SPARQL string and returns an iterable of results
            (presumably a graph of the Sequence Ontology - confirm with caller)
        :raises ValueError: if ``_map_variant_type`` returns None for a
            SNP's context
        :return: None
        """
        tax_id = 'NCBITaxon:9606'

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        geno = Genotype(g)
        model = Model(g)

        # add the feature to the graph
        hap_description = None
        if risk_allele_frequency != '' and \
                risk_allele_frequency != 'NR':
            hap_description = \
                str(risk_allele_frequency) + \
                ' [risk allele frequency]'

        model.addIndividualToGraph(hap_id, hap_label.strip(),
                                   Feature.types['haplotype'], hap_description)
        geno.addTaxon(tax_id, hap_id)

        snp_labels = re.split(r';\s?', hap_label)
        chrom_nums = re.split(r';\s?', chrom_num)
        chrom_positions = re.split(r';\s?', chrom_pos)
        context_list = re.split(r';\s?', context)
        mapped_genes = re.split(r';\s?', mapped_gene)
        snp_curies = list()

        for snp in snp_labels:
            snp_curie, snp_type = self._get_curie_and_type_from_id(snp)
            if snp_type is None:
                # no recognised type: fall back to an id built by make_id
                # (the original note here reads "make blank node")
                snp_curie = self.make_id(snp, "_")

            g.addTriple(hap_id, geno.object_properties['has_variant_part'],
                        snp_curie)
            snp_curies.append(snp_curie)

        # courtesy http://stackoverflow.com/a/16720915
        length = len(snp_labels)
        if not all(len(lst) == length
                   for lst in [chrom_nums, chrom_positions, context_list]):
            logger.warning(
                "Unexpected data field for haplotype %s \n "
                "will not add snp details", hap_label)
            return

        # Genes can only be paired with SNPs positionally, so the lists must
        # be the same length.  This is loop-invariant: warn once, not per SNP.
        genes_align_with_snps = len(mapped_genes) == len(snp_labels)
        if not genes_align_with_snps:
            logger.warning(
                "More mapped genes than snps, "
                "cannot disambiguate for %s", hap_label)

        # contexts that get an extra positional relation to the mapped gene,
        # mapped to the key of that relation in Feature.object_properties
        flanking_relations = {
            'upstream_gene_variant': 'upstream_of_sequence_of',
            'downstream_gene_variant': 'downstream_of_sequence_of',
        }

        variant_in_gene_count = 0
        for index, snp_curie in enumerate(snp_curies):
            snp_context = context_list[index]
            self._add_snp_to_graph(
                snp_curie, snp_labels[index], chrom_nums[index],
                chrom_positions[index], snp_context)

            if not genes_align_with_snps:
                continue

            so_class = self._map_variant_type(snp_context)
            if so_class is None:
                raise ValueError("Unknown SO class {} in haplotype {}"
                                 .format(snp_context, hap_label))
            so_query = """
                    SELECT ?variant_label
                    WHERE {{
                        {0} rdfs:subClassOf+ SO:0001564 ;
                            rdfs:label ?variant_label .
                    }}
                """.format(so_class)

            # any result means so_class is a (transitive) subclass of
            # SO:0001564 and the SNP is treated as affecting the gene
            is_gene_variant = len(list(so_ontology.query(so_query))) > 0
            flank_relation = flanking_relations.get(snp_context)
            if not is_gene_variant and flank_relation is None:
                continue

            # resolve the symbol once per SNP, and only when a link is needed
            gene_id = DipperUtil.get_ncbi_id_from_symbol(mapped_genes[index])
            if gene_id is None:
                continue

            if is_gene_variant:
                geno.addAffectedLocus(snp_curie, gene_id)
                geno.addAffectedLocus(hap_id, gene_id)
                variant_in_gene_count += 1

            if flank_relation is not None:
                g.addTriple(
                    snp_curie,
                    Feature.object_properties[flank_relation],
                    gene_id)

        # Separate in case we want to apply a different relation
        # If not this is redundant with triples added above
        if len(mapped_genes) == variant_in_gene_count \
                and len(set(mapped_genes)) == 1:
            gene_id = DipperUtil.get_ncbi_id_from_symbol(mapped_genes[0])
            geno.addAffectedLocus(hap_id, gene_id)

        return
Пример #48
0
    def _process_genes(self, taxid, limit=None):
        """
        Add the Ensembl genes of one taxon to the graph.

        Reads the tab-delimited file registered under ``self.files[taxid]``
        and, per row, adds the gene class, its NCBIGene (and, for human,
        HGNC) equivalents, its taxon, and its peptide / UniProt gene
        products when those columns are populated.

        Expected columns: ensembl gene id, external gene name, description,
        gene biotype, entrez gene, peptide id, uniprot swissprot, and for
        taxon '9606' a final HGNC id column.

        :param taxid: taxon number as a string (e.g. '9606'); used as the
            key into ``self.files`` and to build the 'NCBITaxon:' curie
        :param limit: outside test mode, stop once more than this many
            rows have been processed; None means no limit
        :raises ValueError: if a row has fewer columns than expected
        :return: None
        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        model = Model(g)
        geno = Genotype(g)

        # in the case of human genes, we also get the hgnc id,
        # and is the last col
        is_human = taxid == '9606'
        expected_cols = 8 if is_human else 7

        raw = '/'.join((self.rawdir, self.files[taxid]['file']))
        line_counter = 0
        logger.info("Processing Ensembl genes for tax %s", taxid)
        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t')
            for row in filereader:
                # Check against the number of columns actually consumed
                # below, so that a short row fails with a clear message.
                if len(row) < expected_cols:
                    raise ValueError(
                        "Data error for file {}: line {} has {} columns, "
                        "expected at least {}".format(
                            raw, filereader.line_num, len(row),
                            expected_cols))
                (ensembl_gene_id, external_gene_name, description,
                 gene_biotype, entrezgene, peptide_id,
                 uniprot_swissprot) = row[0:7]

                hgnc_id = row[7] if is_human else None

                # in test mode keep only the genes listed in self.gene_ids
                # (rows without an entrez id are kept)
                if self.testMode and entrezgene != '' \
                        and int(entrezgene) not in self.gene_ids:
                    continue

                line_counter += 1
                gene_id = 'ENSEMBL:' + ensembl_gene_id
                has_peptide = peptide_id != ''
                has_uniprot = uniprot_swissprot != ''

                if description == '':
                    description = None
                # the biotype is not mapped to a gene type here; the
                # original had the _get_gene_type(gene_biotype) call disabled
                gene_type_id = None
                model.addClassToGraph(gene_id, external_gene_name,
                                      gene_type_id, description)

                # Only create product individuals for populated columns;
                # otherwise bare 'ENSEMBL:' / 'UniProtKB:' nodes are emitted.
                if has_peptide:
                    peptide_curie = 'ENSEMBL:{}'.format(peptide_id)
                    model.addIndividualToGraph(
                        peptide_curie, None,
                        self._get_gene_type("polypeptide"))
                if has_uniprot:
                    uniprot_curie = 'UniProtKB:{}'.format(uniprot_swissprot)
                    model.addIndividualToGraph(
                        uniprot_curie, None,
                        self._get_gene_type("polypeptide"))

                if entrezgene != '':
                    model.addEquivalentClass(
                        gene_id, 'NCBIGene:{}'.format(entrezgene))
                if hgnc_id is not None and hgnc_id != '':
                    model.addEquivalentClass(gene_id, hgnc_id)
                geno.addTaxon('NCBITaxon:' + taxid, gene_id)
                if has_peptide:
                    geno.addGeneProduct(gene_id, peptide_curie)
                    if has_uniprot:
                        geno.addGeneProduct(gene_id, uniprot_curie)
                        model.addXref(peptide_curie, uniprot_curie)

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        return