Exemplo n.º 1
0
    def _process_phene_gene_row(self, row):
        """
        Associate a gene with a phene via an anonymous "some variant of" allele.

        The row's ``gene_id`` and ``phene_id`` are translated through
        ``self.id_hash``.  A blank-node allele of the gene is created and
        used as the subject of a G2PAssoc whose object is the phene.
        The gene is then recorded in ``self.annotated_genes``.

        Rows are skipped when:
          * test mode is on and the row is not covered by the test ids,
          * the gene id is not in ``self.id_hash['gene']``,
          * the phene id is not in ``self.id_hash['phene']``
            (this case is logged as a warning).

        :param row: mapping with at least ``gene_id`` and ``phene_id`` keys
        :return: None
        """
        geno = Genotype(self.graph)
        model = Model(self.graph)
        gene_id = self.id_hash['gene'].get(row['gene_id'])
        phene_id = self.id_hash['phene'].get(row['phene_id'])

        omia_id = self._get_omia_id_from_phene_id(phene_id)

        # In test mode only keep rows whose disease AND gene are test ids.
        if self.test_mode and not (
                omia_id in self.test_ids['disease']
                and row['gene_id'] in self.test_ids['gene']):
            return

        if gene_id is None:
            return

        # occasionally some phenes are missing!  (ex: 406)
        # This check used to be folded into the condition above, which
        # made the warning below unreachable.
        if phene_id is None:
            LOG.warning("Phene id %s is missing", str(row['phene_id']))
            return

        gene_label = self.label_hash[gene_id]
        # some variant of gene_id has phenotype d
        var = self.make_id(gene_id.split(':')[-1] + 'VL', '_')
        geno.addAllele(var, 'some variant of ' + gene_label)
        geno.addAlleleOfGene(var, gene_id)
        geno.addAffectedLocus(var, gene_id)
        model.addBlankNodeAnnotation(var)
        assoc = G2PAssoc(self.graph, self.name, var, phene_id)
        assoc.add_association_to_graph()

        # add the gene id to the set of annotated genes
        # for later lookup by orthology
        self.annotated_genes.add(gene_id)
Exemplo n.º 2
0
    def parse(self, limit=None):
        """
        Override Source.parse().

        Runs the chemical-disease association file through
        ``_parse_ctd_file`` and then, if the batch query file is present
        under ``self.rawdir``, hands it to ``_parse_curated_chem_disease``.

        :param limit: (int, optional) limit the number of rows processed
        :return: None
        """
        if limit is not None:
            LOG.info("Only parsing first %d rows", limit)

        LOG.info("Parsing files...")

        if self.test_only:
            self.test_mode = True

        self.geno = Genotype(self.graph)
        self.pathway = Pathway(self.graph)

        self._parse_ctd_file(limit, 'chemical_disease_associations')

        # NOTE: the 'gene_pathway' and 'gene_disease' files are
        # currently not parsed here.

        batch_query_path = '/'.join(
            (self.rawdir, self.api_fetch['publications']['file']))
        if not os.path.exists(batch_query_path):
            LOG.error('Batch Query file "%s" does not exist', batch_query_path)
        else:
            self._parse_curated_chem_disease(batch_query_path, limit)
        LOG.info("Done parsing files.")
Exemplo n.º 3
0
    def _process_phene_gene_row(self, row):
        """
        Associate a gene with a phene via an anonymous "some variant of" allele.

        The row's ``gene_id`` and ``phene_id`` are translated through
        ``self.id_hash``.  A blank-node allele of the gene is created and
        used as the subject of a G2PAssoc whose object is the phene.
        The gene is then recorded in ``self.annotated_genes``.

        Rows are skipped when:
          * test mode is on and the row is not covered by the test ids,
          * the gene id is not in ``self.id_hash['gene']``,
          * the phene id is not in ``self.id_hash['phene']``
            (this case is logged as a warning).

        :param row: mapping with at least ``gene_id`` and ``phene_id`` keys
        :return: None
        """
        geno = Genotype(self.g)
        model = Model(self.g)
        gene_id = self.id_hash['gene'].get(row['gene_id'])
        phene_id = self.id_hash['phene'].get(row['phene_id'])

        omia_id = self._get_omia_id_from_phene_id(phene_id)

        # In test mode only keep rows whose disease AND gene are test ids.
        if self.testMode and not (
                omia_id in self.test_ids['disease'] and
                row['gene_id'] in self.test_ids['gene']):
            return

        if gene_id is None:
            return

        # occasionally some phenes are missing!  (ex: 406)
        # This check used to be folded into the condition above, which
        # made the warning below unreachable.
        if phene_id is None:
            logger.warning("Phene id %s is missing", str(row['phene_id']))
            return

        gene_label = self.label_hash[gene_id]
        # some variant of gene_id has phenotype d
        vl = '_:'+re.sub(r'NCBIGene:', '', str(gene_id)) + 'VL'
        geno.addAllele(vl, 'some variant of ' + gene_label)
        geno.addAlleleOfGene(vl, gene_id)
        geno.addAffectedLocus(vl, gene_id)
        model.addBlankNodeAnnotation(vl)
        assoc = G2PAssoc(self.g, self.name, vl, phene_id)
        assoc.add_association_to_graph()

        # add the gene id to the set of annotated genes
        # for later lookup by orthology
        self.annotated_genes.add(gene_id)
Exemplo n.º 4
0
    def _add_snp_gene_relation(self, snp_id, snp_gene_nums,
                               upstream_gene_num, downstream_gene_num):
        """
        Relate a SNP to the genes it affects and to its flanking genes.

        :param snp_id: identifier of the SNP (subject of every triple)
        :param snp_gene_nums: comma-separated gene numbers, '' for none;
            each is prefixed with 'NCBIGene:' and added as an affected locus
        :param upstream_gene_num: gene number upstream of the SNP, or ''
        :param downstream_gene_num: gene number downstream of the SNP, or ''
        :return: None
        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        geno = Genotype(g)
        # add the feature as a sequence alteration
        # affecting various genes
        # note that intronic variations don't necessarily list
        # the genes such as for rs10448080  FIXME
        if snp_gene_nums != '':
            for s in re.split(r',', snp_gene_nums):
                s = s.strip()
                # still have to test for this,
                # because sometimes there's a leading comma
                if s != '':
                    gene_id = 'NCBIGene:' + s
                    geno.addAffectedLocus(snp_id, gene_id)

        # add the up and downstream genes if they are available.
        # Each guard tests the same value that is used to build the
        # object id, so an empty number never yields a bare 'NCBIGene:'.
        if downstream_gene_num != '':
            # the SNP lies upstream of its downstream gene
            downstream_gene_id = 'NCBIGene:' + downstream_gene_num
            g.addTriple(
                snp_id,
                Feature.object_properties[
                    r'upstream_of_sequence_of'],
                downstream_gene_id)
        if upstream_gene_num != '':
            # the SNP lies downstream of its upstream gene
            upstream_gene_id = 'NCBIGene:' + upstream_gene_num
            g.addTriple(
                snp_id,
                Feature.object_properties[
                    'downstream_of_sequence_of'],
                upstream_gene_id)
Exemplo n.º 5
0
    def __init__(self, graph_type, are_bnodes_skolemized):
        """
        Set up the 'decipher' source.

        Passes the ingest metadata to the parent constructor, picks up the
        disease test ids (empty list plus a warning when none are
        configured) and creates the Genotype / Model helpers on the graph.

        :param graph_type: forwarded to the parent constructor
        :param are_bnodes_skolemized: forwarded to the parent constructor
        """
        super().__init__(
            graph_type,
            are_bnodes_skolemized,
            'decipher',
            ingest_title='Development Disorder Genotype Phenotype Database',
            ingest_url='https://decipher.sanger.ac.uk/',
            license_url='https://decipher.sanger.ac.uk/legal',
            data_rights='https://decipher.sanger.ac.uk/datasharing',
            # file_handle=None
        )

        if 'disease' in self.all_test_ids:
            self.test_ids = self.all_test_ids['disease']
        else:
            LOG.warning("not configured with disease test ids.")
            self.test_ids = []

        # NOTE(review): self-assignment kept as in the original
        self.graph = self.graph
        self.geno = Genotype(self.graph)
        self.model = Model(self.graph)
        self.graph_type = graph_type
        self.are_bnodes_skolemized = are_bnodes_skolemized
Exemplo n.º 6
0
Arquivo: CTD.py Projeto: sgml/dipper
    def __init__(self, graph_type, are_bnodes_skolemized):
        """
        Set up the 'ctd' source.

        Passes the ingest metadata to the parent constructor, picks up the
        gene and disease test ids (each falls back to an empty list with a
        warning) and creates the Genotype / Pathway helpers on the graph.

        :param graph_type: forwarded to the parent constructor
        :param are_bnodes_skolemized: forwarded to the parent constructor
        """
        super().__init__(
            graph_type,
            are_bnodes_skolemized,
            'ctd',
            ingest_title='Comparative Toxicogenomics Database',
            ingest_url='http://ctdbase.org',
            license_url=None,
            data_rights='http://ctdbase.org/about/legal.jsp'
            # file_handle=None
        )

        if 'gene' in self.all_test_ids:
            self.test_geneids = self.all_test_ids['gene']
        else:
            LOG.warning("not configured with gene test ids.")
            self.test_geneids = []

        if 'disease' in self.all_test_ids:
            self.test_diseaseids = self.all_test_ids['disease']
        else:
            LOG.warning("not configured with disease test ids.")
            self.test_diseaseids = []

        self.geno = Genotype(self.graph)
        self.pathway = Pathway(self.graph)
Exemplo n.º 7
0
    def _add_snp_gene_relation(self, snp_id, snp_gene_nums, upstream_gene_num,
                               downstream_gene_num):
        """
        Relate a SNP to the genes it affects and to its flanking genes.

        :param snp_id: identifier of the SNP (subject of every triple)
        :param snp_gene_nums: comma-separated gene ids, '' for none;
            each is prefixed with 'ENSEMBL:' and added as an affected locus
        :param upstream_gene_num: gene id upstream of the SNP, or ''
        :param downstream_gene_num: gene id downstream of the SNP, or ''
        :return: None
        """
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        geno = Genotype(graph)
        # add the feature as a sequence alteration
        # affecting various genes
        # note that intronic variations don't necessarily list
        # the genes such as for rs10448080  FIXME
        if snp_gene_nums != '':
            for geneid in re.split(r',', snp_gene_nums):
                geneid = geneid.strip()
                # still have to test for this,
                # because sometimes there's a leading comma
                if geneid != '':
                    geno.addAffectedLocus(snp_id, 'ENSEMBL:' + geneid)

        # add the up and downstream genes if they are available.
        # Each guard tests the same value that is used to build the
        # object id, so an empty id never yields a bare 'ENSEMBL:'.
        if downstream_gene_num != '':
            # the SNP lies upstream of its downstream gene
            downstream_gene_id = 'ENSEMBL:' + downstream_gene_num
            graph.addTriple(snp_id,
                            self.globaltt['is upstream of sequence of'],
                            downstream_gene_id)
        if upstream_gene_num != '':
            # the SNP lies downstream of its upstream gene
            upstream_gene_id = 'ENSEMBL:' + upstream_gene_num
            graph.addTriple(snp_id,
                            self.globaltt['is downstream of sequence of'],
                            upstream_gene_id)
Exemplo n.º 8
0
    def __init__(self, graph_type, are_bnodes_skolemized):
        """
        Set up the 'ctd' source.

        Creates the Dataset description, reads the gene and disease test
        ids from the configuration (each falls back to an empty list with
        a warning) and creates the Genotype / Pathway helpers on the graph.

        :param graph_type: forwarded to the parent constructor
        :param are_bnodes_skolemized: forwarded to the parent constructor
        """
        super().__init__(graph_type, are_bnodes_skolemized, 'ctd')
        self.dataset = Dataset(
            'ctd', 'CTD', 'http://ctdbase.org', None,
            'http://ctdbase.org/about/legal.jsp')

        if 'test_ids' in config.get_config() \
                and 'gene' in config.get_config()['test_ids']:
            self.test_geneids = config.get_config()['test_ids']['gene']
        else:
            logger.warning("not configured with gene test ids.")
            self.test_geneids = []

        if 'test_ids' in config.get_config() \
                and 'disease' in config.get_config()['test_ids']:
            self.test_diseaseids = config.get_config()['test_ids']['disease']
        else:
            logger.warning("not configured with disease test ids.")
            self.test_diseaseids = []

        # short alias for the graph
        self.g = self.graph
        self.geno = Genotype(self.graph)
        self.pathway = Pathway(self.graph)
Exemplo n.º 9
0
    def _build_gene_disease_model(self,
                                  gene_id,
                                  relation_id,
                                  disease_id,
                                  variant_label,
                                  consequence_predicate=None,
                                  consequence_id=None,
                                  allelic_requirement=None,
                                  pmids=None):
        """
        Build a gene (or gene variant) to disease association.

        When both ``consequence_predicate`` and ``consequence_id`` are
        given, a blank-node variant of the gene is created and becomes the
        subject of the association; otherwise the gene itself is the
        subject.  An allelic requirement is attached to the association
        only in the gene-level (non-variant) case.

        :param gene_id: gene identifier
        :param relation_id: relation passed to G2PAssoc
        :param disease_id: disease identifier (association object)
        :param variant_label: label for the variant, also hashed into its id
        :param consequence_predicate: predicate linking variant to consequence
        :param consequence_id: consequence term
        :param allelic_requirement: allelic requirement term
        :param pmids: list of sources for the association
        :return: None
        """
        model = Model(self.graph)
        geno = Genotype(self.graph)

        if pmids is None:
            pmids = []

        variant_bnode = self.make_id(variant_label, "_")
        is_variant = (consequence_predicate is not None
                      and consequence_id is not None)

        subject_id = gene_id
        if is_variant:
            model.addTriple(variant_bnode, consequence_predicate,
                            consequence_id)
            # Hack to add labels to terms that
            # don't exist in an ontology
            if consequence_id.startswith(':'):
                model.addLabel(consequence_id,
                               consequence_id.strip(':').replace('_', ' '))

            subject_id = variant_bnode
            # Typically we would type the variant using the
            # molecular consequence, but these are not specific
            # enough for us to make mappings (see translation table)
            model.addIndividualToGraph(variant_bnode, variant_label,
                                       self.globaltt['variant_locus'])
            geno.addAffectedLocus(variant_bnode, gene_id)
            model.addBlankNodeAnnotation(variant_bnode)

        assoc = G2PAssoc(self.graph, self.name, subject_id, disease_id,
                         relation_id)
        assoc.source = pmids
        assoc.add_association_to_graph()

        # the allelic requirement only applies to gene-level associations
        if allelic_requirement is None or is_variant:
            return

        model.addTriple(assoc.assoc_id,
                        self.globaltt['has_allelic_requirement'],
                        allelic_requirement)
        if allelic_requirement.startswith(':'):
            model.addLabel(
                allelic_requirement,
                allelic_requirement.strip(':').replace('_', ' '))
Exemplo n.º 10
0
    def process_disease_association(self, limit):
        """
        Turn the tab-delimited disease association file into
        model-of-disease associations.

        For each data row (lines starting with '!' are headers) an
        anonymous variant of the gene is created, wrapped in an
        experimental worm model and associated with the disease.
        Rows flagged 'NOT' are skipped.

        :param limit: maximum number of data rows to read, or None for all;
            not applied in test mode (same convention as process_gene_ids)
        :return: None
        """
        raw = '/'.join((self.rawdir, self.files['disease_assoc']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        model = Model(g)
        logger.info("Processing disease models")
        geno = Genotype(g)
        line_counter = 0
        worm_taxon = 'NCBITaxon:6239'
        with open(raw, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                if re.match(r'!', ''.join(row)):  # header
                    continue
                line_counter += 1

                # honor the row limit (previously counted but never checked)
                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

                (db, gene_num, gene_symbol, is_not, disease_id, ref,
                 eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
                 gene_class, taxon, date, assigned_by, blank, blank2) = row

                if self.testMode and gene_num not in self.test_ids['gene']:
                    continue

                # TODO add NOT phenotypes
                if is_not == 'NOT':
                    continue

                # WB	WBGene00000001	aap-1		DOID:2583	PMID:19029536	IEA	ENSEMBL:ENSG00000145675|OMIM:615214	D		Y110A7A.10	gene	taxon:6239	20150612	WB
                gene_id = 'WormBase:' + gene_num

                # make a variant of the gene
                vl = '_:' + '-'.join((gene_num, 'unspecified'))
                vl_label = 'some variant of ' + gene_symbol
                geno.addAffectedLocus(vl, gene_id)
                model.addBlankNodeAnnotation(vl)
                animal_id = geno.make_experimental_model_with_genotype(
                    vl, vl_label, worm_taxon, 'worm')

                assoc = G2PAssoc(g, self.name, animal_id, disease_id,
                                 model.object_properties['model_of'])
                ref = re.sub(r'WB_REF:', 'WormBase:', ref)
                if ref != '':
                    assoc.add_source(ref)
                eco_id = None
                if eco_symbol == 'IEA':
                    eco_id = 'ECO:0000501'  # IEA is this now
                if eco_id is not None:
                    assoc.add_evidence(eco_id)

                assoc.add_association_to_graph()

        return
Exemplo n.º 11
0
 def setUp(self):
     """Create the graph, Genotype helper and expected category URIs."""
     graph = RDFGraph()
     self.graph = graph
     curies = curie_map.get()
     self.curie_map = curies
     self.genotype = Genotype(graph)
     curie_util = CurieUtil(curies)
     self.cutil = curie_util

     # URIs the tests compare against
     self.test_cat_pred = curie_util.get_uri(blv.terms['category'])
     self.test_cat_genotype_category = curie_util.get_uri(
         blv.terms['Genotype'])
     self.test_cat_background_category = curie_util.get_uri(
         blv.terms['PopulationOfIndividualOrganisms'])
Exemplo n.º 12
0
    def parse(self, limit=None):
        """
        Process the trait mappings, then the QTL files of every organism.

        For each organism a genome is added.  If a '<organism>_bp' file is
        configured, its build is extracted from the file name and the
        genomic-location QTLs are processed; if a '<organism>_cm' file is
        configured, the genetic-location QTLs are processed.

        :param limit: row limit handed on to the per-file processors
        :return: None
        """
        if limit is not None:
            logger.info("Only parsing first %s rows fo each file", str(limit))

        logger.info("Parsing files...")

        if self.testOnly:
            self.testMode = True
            graph = self.testgraph
        else:
            graph = self.graph

        trait_map_path = '/'.join(
            (self.rawdir, self.files['trait_mappings']['file']))
        self._process_trait_mappings(trait_map_path, limit)

        geno = Genotype(graph)
        # organisms  = ['chicken']
        for common_name in (
                'chicken', 'pig', 'horse', 'rainbow_trout', 'sheep',
                'cattle'):
            tax_id = self._get_tax_by_common_name(common_name)
            geno.addGenome(tax_id, common_name)

            # physical (base pair) positions
            bp_key = common_name + '_bp'
            if bp_key in self.files:
                bp_file = self.files[bp_key]['file']
                build_match = re.search(
                    r'QTL_([\w\.]+)\.gff.txt.gz', bp_file)
                if build_match is None:
                    logger.error("Can't match a gff build")
                else:
                    build = build_match.group(1)
                    build_id = self._map_build_by_abbrev(build)
                    logger.info("Build = %s", build_id)
                    geno.addReferenceGenome(build_id, build, tax_id)
                    # only process locations for a mapped build
                    if build_id is not None:
                        self._process_QTLs_genomic_location(
                            '/'.join((self.rawdir, bp_file)), tax_id,
                            build_id, build, limit)

            # genetic (centimorgan) positions
            cm_key = common_name + '_cm'
            if cm_key in self.files:
                cm_file = self.files[cm_key]['file']
                self._process_QTLs_genetic_location(
                    '/'.join((self.rawdir, cm_file)), tax_id, common_name,
                    limit)

        logger.info("Finished parsing")
Exemplo n.º 13
0
    def _make_pheno_assoc(self, g, gene_id, gene_symbol, disorder_num,
                          disorder_label, phene_key):
        """
        Add a gene (or alternate locus) to disorder association to ``g``.

        The relation is chosen from the first character of
        ``disorder_label``: '[' is a marker, '{' and '?' are
        contributes-to, anything else is has-phenotype.  For OMIM gene
        ids an anonymous variant locus is created and used as the
        association subject; any other gene id is used directly.

        :param g: graph to write to
        :param gene_id: gene curie
        :param gene_symbol: gene symbol used in the variant label
        :param disorder_num: OMIM number of the disorder
        :param disorder_label: label of the disorder
        :param phene_key: phene mapping code, mapped to an evidence code
        :return: None
        """
        geno = Genotype(g)
        model = Model(g)
        disorder_id = 'OMIM:' + disorder_num

        # default relation
        rel_id = model.object_properties['has_phenotype']
        rel_label = 'causes'
        # (leading-character pattern, property key, human readable label)
        prefix_relations = (
            (r'\[', 'is_marker_for', 'is a marker for'),
            (r'\{', 'contributes_to', 'contributes to'),
            # this is a questionable mapping!  skip?
            (r'\?', 'contributes_to', 'contributes to'),
        )
        for prefix_re, prop_key, prop_label in prefix_relations:
            if re.match(prefix_re, disorder_label):
                rel_id = model.object_properties[prop_key]
                rel_label = prop_label
                break

        evidence = self._map_phene_mapping_code_to_eco(phene_key)

        # we actually want the association between the gene and the disease
        # to be via an alternate locus not the "wildtype" gene itself.
        # so we make an anonymous alternate locus,
        # and put that in the association.
        # but we only need to do that in the cases when it's not an NCBIGene
        # (as that is a sequence feature itself)
        if not re.match(r'OMIM:', gene_id):
            # assume it's already been added
            alt_locus = gene_id
        else:
            alt_locus = ''.join(
                ('_:', re.sub(r':', '', gene_id), '-', disorder_num, 'VL'))
            symbol = gene_symbol.strip()
            if symbol:
                alt_label = ' '.join(
                    ('some variant of', symbol,
                     'that', rel_label, disorder_label))
            else:
                alt_label = None

            model.addIndividualToGraph(alt_locus, alt_label,
                                       Genotype.genoparts['variant_locus'])
            geno.addAffectedLocus(alt_locus, gene_id)
            model.addBlankNodeAnnotation(alt_locus)

        assoc = G2PAssoc(g, self.name, alt_locus, disorder_id, rel_id)
        assoc.add_evidence(evidence)
        assoc.add_association_to_graph()
Exemplo n.º 14
0
    def _process_all(self, limit):
        """
        Process every non-replaced OMIM identifier, then deprecate the
        replaced ones.

        The identifiers are the keys of ``self.omim_type`` minus the keys
        of ``self.omim_replaced``.  They are handed to
        ``self.process_entries`` together with ``self._transform_entry``.
        Per the original author notes, that step queries the OMIM API in
        batches and creates OMIM classes with label and definition,
        phenotypicSeries superclasses and Orphanet / UMLS equivalents
        (not verifiable from this method alone).

        Afterwards every id in ``self.omim_replaced`` is added as a
        deprecated class pointing at its replacement ids.

        If set to test mode, output goes to the testgraph.

        :param limit: passed through to ``process_entries``
        """
        omimids = list(self.omim_type.keys() - self.omim_replaced.keys())

        LOG.info('Have %i omim numbers to fetch records from their API',
                 len(omimids))
        LOG.info('Have %i omim types ', len(self.omim_type))

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        geno = Genotype(graph)
        model = Model(graph)
        # The label doubles as the translation-table key, so it must be
        # the exact taxon name (the previous literal was garbled).
        tax_label = 'Homo sapiens'
        tax_id = self.globaltt[tax_label]

        # add genome and taxon
        geno.addGenome(tax_id, tax_label)
        model.addClassToGraph(tax_id, tax_label)

        includes = set()
        includes.add('all')

        self.process_entries(omimids, self._transform_entry, includes, graph,
                             limit)

        # since we are not fetching obsolete records any more add them all in here
        for omim_id in self.omim_replaced:
            model.addDeprecatedClass(
                'OMIM:' + omim_id,
                ['OMIM:' + o for o in self.omim_replaced[omim_id]])
Exemplo n.º 15
0
Arquivo: OMIA.py Projeto: sgml/dipper
 def _process_gene_row(self, row):
     """
     Register one gene row: remember its curie and label, add it as a
     class when a gene type is given, and link it to its taxon.

     :param row: mapping with 'gene_id', 'symbol', 'gb_species_id'
         and 'gene_type' keys
     :return: None
     """
     model = Model(self.graph)
     geno = Genotype(self.graph)

     raw_gene_id = row['gene_id']
     # in test mode only the configured test genes are kept
     if self.test_mode and raw_gene_id not in self.test_ids['gene']:
         return

     gene_curie = 'NCBIGene:' + str(raw_gene_id)
     self.id_hash['gene'][raw_gene_id] = gene_curie

     symbol = row['symbol']
     self.label_hash[gene_curie] = symbol

     taxon_curie = 'NCBITaxon:' + str(row['gb_species_id'])

     gene_type = row['gene_type']
     if gene_type is not None:
         model.addClassToGraph(
             gene_curie, symbol, self.resolve(gene_type))
     geno.addTaxon(taxon_curie, gene_curie)
Exemplo n.º 16
0
    def process_gene_ids(self, limit):
        """
        Read the gzipped, comma-separated gene id file and add each gene
        as a class with its taxon, optional synonym and, when marked
        'Dead', a deprecation.

        Rows whose column count differs from the configured columns are
        logged.  Rows too short to hold every column read here are
        skipped rather than raising IndexError.

        :param limit: stop once the reader's line number exceeds this
            value; None processes the whole file
        :return: None
        """
        src_key = 'gene_ids'
        raw = '/'.join((self.rawdir, self.files[src_key]['file']))
        graph = self.graph
        model = Model(graph)
        geno = Genotype(graph)
        col = self.files[src_key]['columns']
        LOG.info("Processing: %s", self.files[src_key]['file'])

        # column positions are invariant, so look them up once
        taxon_idx = col.index('taxon_num')
        gene_idx = col.index('gene_num')
        symbol_idx = col.index('gene_symbol')
        synonym_idx = col.index('gene_synonym')
        live_idx = col.index('live')
        min_len = 1 + max(
            taxon_idx, gene_idx, symbol_idx, synonym_idx, live_idx)

        with gzip.open(raw, 'rb') as csvfile:
            reader = csv.reader(io.TextIOWrapper(csvfile, newline=""),
                                delimiter=',',
                                quotechar='\"')
            # no header row to check
            collen = len(col)
            for row in reader:
                if len(row) != collen:
                    LOG.error('In %s line %i expected %i colums but got %s.',
                              self.files[src_key]['file'], reader.line_num,
                              collen, row)
                    if len(row) < min_len:
                        # cannot index the needed columns; skip the row
                        continue
                taxon_num = row[taxon_idx]
                gene_num = row[gene_idx]
                gene_symbol = row[symbol_idx]
                gene_synonym = row[synonym_idx]
                live = row[live_idx]
                # gene_type = row[col.index('gene_type')]
                # 6239,WBGene00000001,aap-1,Y110A7A.10,Live,protein_coding_gene

                taxon_curie = 'NCBITaxon:' + taxon_num
                gene_curie = 'WormBase:' + gene_num

                if gene_symbol == '':
                    gene_symbol = gene_synonym  # these are not the same in my book tec.
                if gene_symbol == '':
                    gene_symbol = None
                model.addClassToGraph(gene_curie, gene_symbol,
                                      self.globaltt['gene'])
                if live == 'Dead':
                    model.addDeprecatedClass(gene_curie,
                                             old_id_category=blv.terms['Gene'])
                geno.addTaxon(taxon_curie, gene_curie)
                if gene_synonym is not None and gene_synonym != '':
                    model.addSynonym(gene_curie, gene_synonym)

                if limit is not None and reader.line_num > limit:
                    break
Exemplo n.º 17
0
 def _add_variant_gene_relationship(self, variant_id, hgnc_symbol):
     """
     Declare the variant to be an allele of the gene with this symbol.

     The gene id comes from ``self.gene_map``.  When the symbol is not
     mapped, an id is minted from the variant id and symbol and a
     warning is logged.

     :param variant_id: identifier of the variant
     :param hgnc_symbol: gene symbol, also used as the class label
     :return: None
     """
     gu = GraphUtils(curie_map.get())
     geno = Genotype(self.graph)
     if hgnc_symbol in self.gene_map:
         gene_id = self.gene_map[hgnc_symbol]
     else:
         gene_id = self.make_cgd_id("{0}{1}".format(variant_id, hgnc_symbol))
         # Logger.warn is deprecated; use warning() with lazy arguments
         logger.warning("Can't map gene symbol %s to entrez ID",
                        hgnc_symbol)
     gu.addClassToGraph(self.graph, gene_id, hgnc_symbol)
     geno.addAlleleOfGene(variant_id, gene_id)
     return
Exemplo n.º 18
0
    def _process_all(self, limit):
        """
        Process every OMIM identifier returned by ``_get_omim_ids``.

        The human genome and taxon class are added first; the ids are then
        handed to ``self.process_entries`` with ``self._transform_entry``.
        Per the original author notes, that step queries the OMIM API for
        json data and creates OMIM classes (label, definition, some
        synonyms), deprecating "removed" / "moved" entries and extracting
        phenotypicSeries superclasses and Orphanet / UMLS equivalents
        (not verifiable from this method alone).

        If set to testMode, output goes to the testgraph.

        :param limit: passed through to ``process_entries``
        :return: None
        """
        # the set of omim identifiers to work through
        omim_numbers = self._get_omim_ids()

        graph = self.testgraph if self.testMode else self.graph
        geno = Genotype(graph)
        model = Model(graph)

        human_taxon = 'NCBITaxon:9606'

        # add genome and taxon
        # (the taxon label can get added elsewhere)
        geno.addGenome(human_taxon, 'Human')
        model.addClassToGraph(human_taxon, None)

        includes = {'all'}

        self.process_entries(
            omim_numbers, self._transform_entry, includes, graph, limit)
Exemplo n.º 19
0
    def process_gene_ids(self, limit):
        """
        Read the gzipped, comma-separated gene id file and add each gene
        as a class with its taxon, optional synonym and, when marked
        'Dead', a deprecation.

        In test mode only genes listed in ``self.test_ids['gene']`` are
        added and the limit is not applied.

        :param limit: maximum number of rows (outside test mode), or None
        :return: None
        """
        gene_file = self.files['gene_ids']['file']
        raw = '/'.join((self.rawdir, gene_file))

        graph = self.testgraph if self.testMode else self.graph

        model = Model(graph)
        logger.info("Processing: %s", self.files['gene_ids']['file'])
        geno = Genotype(graph)
        with gzip.open(raw, 'rb') as csvfile:
            filereader = csv.reader(io.TextIOWrapper(csvfile, newline=""),
                                    delimiter=',',
                                    quotechar='\"')
            for line_counter, row in enumerate(filereader, start=1):
                # 6239,WBGene00000001,aap-1,Y110A7A.10,Live,protein_coding_gene
                (taxon_num, gene_num, gene_symbol, gene_synonym, live,
                 gene_type) = row

                if self.testMode and gene_num not in self.test_ids['gene']:
                    continue

                taxon_curie = 'NCBITaxon:' + taxon_num
                gene_curie = 'WormBase:' + gene_num

                # fall back to the synonym, then to no label at all
                label = gene_symbol if gene_symbol != '' else gene_synonym
                if label == '':
                    label = None

                model.addClassToGraph(gene_curie, label,
                                      self.globaltt['gene'])
                if live == 'Dead':
                    model.addDeprecatedClass(gene_curie)
                geno.addTaxon(taxon_curie, gene_curie)
                if gene_synonym is not None and gene_synonym != '':
                    model.addSynonym(gene_curie, gene_synonym)

                if self.testMode or limit is None:
                    continue
                if line_counter > limit:
                    break
Exemplo n.º 20
0
    def _create_genome_builds(self):
        """
        Various resources will map variations to either UCSC (hg*)
        or to NCBI assemblies. Here we create the equivalences between them.
        Data taken from:
        https://genome.ucsc.edu/FAQ/FAQreleases.html#release1

        For each species and each of its configured assembly keys, both
        the UCSC id and the id it maps to in ``self.localtt`` are added as
        reference genomes and declared the same individual.  Keys without
        a ':' or without a local translation are logged and skipped.

        :return: None

        """

        # TODO add more species

        graph = self.graph
        geno = Genotype(graph)
        model = Model(graph)
        logger.info("Adding equivalent assembly identifiers")
        for sp in self.species:
            tax_id = self.resolve(sp)
            txid_num = tax_id.split(':')[1]
            for key in self.files[txid_num]['assembly']:
                ucsc_id = key
                try:
                    ucsc_label = ucsc_id.split(':')[1]
                except IndexError:
                    logger.error('%s Assembly id:  "%s" is problematic', sp,
                                 key)
                    continue
                if key not in self.localtt:
                    logger.error(
                        '%s Assembly id:  "%s" is not in local translation table',
                        sp, key)
                    # without a mapping there is nothing to equate;
                    # falling through would use an unbound or stale id
                    continue
                mapped_id = self.localtt[key]

                mapped_label = mapped_id.split(':')[1]

                mapped_label = 'NCBI build ' + str(mapped_label)
                geno.addReferenceGenome(ucsc_id, ucsc_label, tax_id)
                geno.addReferenceGenome(mapped_id, mapped_label, tax_id)
                model.addSameIndividual(ucsc_id, mapped_id)

        return
Exemplo n.º 21
0
    def parse(self, limit=None):
        """
        Select the output graph and process the DDG2P annotations.

        When ``self.test_only`` is set, test mode is switched on and
        ``self.graph`` is pointed at the test graph.

        :param limit: row limit handed to ``_process_ddg2p_annotations``
        :return: None
        """
        if limit is not None:
            LOG.info("Only parsing first %s rows", limit)

        LOG.info("Parsing files...")

        if not self.test_only:
            # NOTE(review): self-assignment kept as in the original
            self.graph = self.graph
        else:
            self.test_mode = True
            self.graph = self.testgraph

        self.geno = Genotype(self.graph)

        # rare disease-phenotype associations
        self._process_ddg2p_annotations(limit)

        LOG.info("Finished parsing.")
Exemplo n.º 22
0
    def __init__(self, graph_type, are_bnodes_skolemized):
        """
        Set up the 'decipher' source.

        Creates the Dataset description, reads the disease test ids from
        the configuration (empty list plus a warning when absent) and
        creates the Genotype / Model helpers on the graph.

        :param graph_type: forwarded to the parent constructor
        :param are_bnodes_skolemized: forwarded to the parent constructor
        """
        super().__init__(graph_type, are_bnodes_skolemized, 'decipher')

        self.dataset = Dataset(
            'decipher', 'Development Disorder Genotype – Phenotype Database',
            'https://decipher.sanger.ac.uk/', None,
            'https://decipher.sanger.ac.uk/legal')

        if 'test_ids' in config.get_config() \
                and 'disease' in config.get_config()['test_ids']:
            self.test_ids = config.get_config()['test_ids']['disease']
        else:
            logger.warning("not configured with disease test ids.")
            self.test_ids = []

        # short alias for the graph, shared by the helpers below
        self.g = self.graph
        self.geno = Genotype(self.g)
        self.model = Model(self.g)
Exemplo n.º 23
0
    def _parse_genepage2gene(self, limit) -> Dict[str, List[str]]:
        """
        Read the tab-delimited GenePage-to-Gene file.

        Every row contributes three 'Xenbase:' gene curies (tropicalis,
        laevis L, laevis S); each is added as a gene with its label.

        :param limit: outside test mode, stop once the reader's line
            number exceeds this value; None reads everything
        :return: dict of gene page id to its
            [tropicalis, laevis_l, laevis_s] curies
        """
        src_key = 'genepage2gene'
        columns = self.files[src_key]['columns']
        raw = '/'.join((self.rawdir, self.files[src_key]['file']))

        geno = Genotype(self.graph)
        genepage2gene = {}

        def field(record, name):
            # value of the named column in this record
            return record[columns.index(name)]

        LOG.info("Processing GenePage to Gene file")

        with open(raw, 'r', encoding="utf8") as csvfile:
            reader = csv.reader(csvfile, delimiter='\t')

            for row in reader:
                gene_page = field(row, 'gene_page_id')
                # the 'gene_page_label' column is not used

                # (curie, label) pairs in tropicalis, laevis L, laevis S order
                genes = [
                    ('Xenbase:' + field(row, prefix + '_id'),
                     field(row, prefix + '_label'))
                    for prefix in ('tropicalis', 'laevis_l', 'laevis_s')
                ]

                genepage2gene[gene_page] = [curie for curie, _ in genes]

                for curie, label in genes:
                    geno.addGene(curie, label)

                if self.test_mode or limit is None:
                    continue
                if reader.line_num > limit:
                    break

        return genepage2gene
Exemplo n.º 24
0
    def parse(self, limit=None):
        """
        Override Source.parse(): run the CTD file parsers.

        ``self.testOnly`` switches ``self.testMode`` on.  In test mode the
        working graph ``self.g`` is ``self.testgraph``, otherwise it is
        ``self.graph``; the Genotype and Pathway helpers are bound to it.

        :param limit: (int, optional) limit the number of rows processed
        :return: None
        """
        if limit is not None:
            logger.info("Only parsing first %d rows", limit)

        logger.info("Parsing files...")
        # NOTE: parsing of the static 'publications' file is currently
        # disabled, so no publication map is built here.

        if self.testOnly:
            self.testMode = True

        self.g = self.testgraph if self.testMode else self.graph
        self.geno = Genotype(self.g)
        self.pathway = Pathway(self.g)

        # the three tabular CTD downloads share one parser
        for file_key in (
                'chemical_disease_interactions', 'gene_pathway', 'gene_disease'):
            self._parse_ctd_file(limit, self.files[file_key]['file'])
        self._parse_curated_chem_disease(limit)

        logger.info("Done parsing files.")
Exemplo n.º 25
0
    def parse(self, limit=None):
        """
        Override Source.parse(): parse the CTD chemical-disease associations.

        ``self.test_only`` switches ``self.test_mode`` on.  Fresh Genotype
        and Pathway helpers bound to ``self.graph`` are stored on the
        instance before the file is parsed.

        :param limit: (int, optional) limit the number of rows processed
        :return: None
        """
        if limit is not None:
            LOG.info("Only parsing first %d rows", limit)
        LOG.info("Parsing files...")

        if self.test_only:
            self.test_mode = True

        self.geno = Genotype(self.graph)
        self.pathway = Pathway(self.graph)

        self._parse_ctd_file(limit, 'chemical_disease_associations')
Exemplo n.º 26
0
 def _add_gene_anatomy_association(self, gene_id, anatomy_curie, rank):
     """
     Add an 'expressed_in' association from a gene to an anatomy term
     and attach the rank to it as an xsd:float quantifier.

     The gene is first added to the graph as an individual of the
     genotype 'gene' type, under the curie 'ENSEMBL:<gene_id>'.

     :param gene_id: str Non curified ID
     :param anatomy_curie: str curified anatomy term
     :param rank: str rank; commas are stripped before conversion to float
     :return: None
     """
     association = Assoc(self.graph, self.name)
     geno = Genotype(self.graph)
     model = Model(self.graph)

     ensembl_curie = "ENSEMBL:{}".format(gene_id)
     # drop thousands separators such as '1,234.5'
     rank = re.sub(r',', '', rank)

     model.addIndividualToGraph(
         ind_id=ensembl_curie, label=None, ind_type=geno.genoparts['gene'])

     association.sub = ensembl_curie
     association.obj = anatomy_curie
     association.rel = Assoc.object_properties['expressed_in']
     association.add_association_to_graph()

     # the rank is converted only after the association itself is in
     # the graph
     quantifier_property = Assoc.datatype_properties['has_quantifier']
     association.add_predicate_object(
         quantifier_property, float(rank), 'Literal', 'xsd:float')
Exemplo n.º 27
0
    def _add_g2p_assoc(self, graph, strain_id, sex, assay_id, phenotypes,
                       comment):
        """
        Create an association between a sex-specific strain id
        and each of the phenotypes.
        Here, we create a genotype from the strain,
        and a sex-specific genotype.
        Each of those genotypes are created as anonymous nodes.

        The evidence code is hardcoded to be:
            ECO:experimental_phenotypic_evidence.

        When the strain has no entry in ``self.idlabel_hash`` the strain id
        itself is used to build both genotype labels.

        :param graph: graph the triples are written to
        :param strain_id: curie of the strain; its colon is removed when the
            genotype node ids are built
        :param sex: sex code; 'm' and 'f' select the male/female genotype
            type, anything else falls back to 'sex_qualified_genotype'
        :param assay_id: added as evidence on every association
        :param phenotypes: a list of phenotypes to associate with the strain;
            ``None`` means no associations are made
        :param comment: comment attached to each association
        :return: None

        """
        geno = Genotype(graph)
        model = Model(graph)
        eco_id = self.globaltt['experimental phenotypic evidence']
        strain_label = self.idlabel_hash.get(strain_id)
        strain_key = re.sub(r':', '', strain_id)

        # strain genotype and its sex-specific counterpart
        genotype_id = '_' + '-'.join((strain_key, 'genotype'))
        sex_specific_genotype_id = '_' + '-'.join(
            (strain_key, sex, 'genotype'))

        if strain_label is not None:
            genotype_label = '[' + strain_label + ']'
            sex_specific_genotype_label = strain_label + ' (' + sex + ')'
        else:
            # .get() found no label: fall back to the id for BOTH labels.
            # Previously only the sex-specific label had a fallback and the
            # plain genotype label raised TypeError on str + None.
            genotype_label = '[' + strain_id + ']'
            sex_specific_genotype_label = strain_id + '(' + sex + ')'

        genotype_type = self.globaltt['sex_qualified_genotype']
        if sex == 'm':
            genotype_type = self.globaltt['male_genotype']
        elif sex == 'f':
            genotype_type = self.globaltt['female_genotype']

        # add the genotype to strain connection
        geno.addGenotype(genotype_id, genotype_label,
                         self.globaltt['genomic_background'])
        graph.addTriple(strain_id, self.globaltt['has_genotype'], genotype_id)

        geno.addGenotype(sex_specific_genotype_id, sex_specific_genotype_label,
                         genotype_type)

        # add the strain as the background for the genotype
        graph.addTriple(sex_specific_genotype_id,
                        self.globaltt['has_sex_agnostic_part'], genotype_id)

        # #############    BUILD THE G2P ASSOC    #############
        # TODO add more provenance info when that model is completed

        if phenotypes is not None:
            for phenotype_id in phenotypes:
                assoc = G2PAssoc(graph, self.name, sex_specific_genotype_id,
                                 phenotype_id)
                assoc.add_evidence(assay_id)
                assoc.add_evidence(eco_id)
                assoc.add_association_to_graph()
                assoc_id = assoc.get_association_id()
                model.addComment(assoc_id, comment)
                model._addSexSpecificity(assoc_id, self.resolve(sex))
Exemplo n.º 28
0
    def process_gaf(self, file, limit, id_map=None, eco_map=None):
        """
        Parse a gzipped, tab-delimited GO annotation (GAF) file into
        gene-to-GO associations, plus derived genotype-to-phenotype
        associations for IMP annotations.

        Per row:
          * comment rows (starting with '!') and rows with a column count
            outside 15..17 are skipped; shorter rows are padded to 17;
          * rows missing a required field, or whose qualifier contains
            'NOT', are skipped;
          * UniProtKB ids are swapped for the gene id found in ``id_map``;
            rows without a mapping are skipped and counted as misses;
          * in test mode only NCBIGene ids whose number is in
            ``self.test_ids`` are kept;
          * the gene, its description, synonyms and taxon are added, then an
            association gene -> GO term with evidence and references;
          * for evidence 'IMP' with a non-empty with/from column, one G2P
            association per with-item is added against the phenotype id
            ``go_id + 'PHENOTYPE'``.

        :param file: path of the gzipped GAF file
        :param limit: row cap (ignored in test mode); ``None`` disables it
        :param id_map: optional mapping of UniProtKB accession to a
            'prefix:local' gene curie
        :param eco_map: optional mapping of evidence code symbol to evidence
            id; a missing map or missing symbol is logged as an error and the
            association is written without that evidence
        :return: None
        """

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph

        model = Model(graph)
        geno = Genotype(graph)
        LOG.info("Processing Gene Associations from %s", file)
        line_counter = 0
        uniprot_hit = 0
        uniprot_miss = 0
        # These helpers are only used to mint reagent-targeted gene ids.
        # They are built up front when their taxon is configured, and on
        # first use otherwise (see below); before, an unconfigured taxon left
        # the name unbound and a matching row raised UnboundLocalError.
        zfin = None
        wbase = None
        if '7955' in self.tax_ids:
            zfin = ZFIN(self.graph_type, self.are_bnodes_skized)
        if '6239' in self.tax_ids:
            wbase = WormBase(self.graph_type, self.are_bnodes_skized)

        with gzip.open(file, 'rb') as csvfile:
            filereader = csv.reader(io.TextIOWrapper(csvfile, newline=""),
                                    delimiter='\t',
                                    quotechar='\"')
            for row in filereader:
                line_counter += 1
                # comments start with exclamation
                if re.match(r'!', ''.join(row)):
                    continue

                if len(row) > 17 or len(row) < 15:
                    LOG.warning(
                        "Wrong number of columns %i, expected 15 or 17\n%s",
                        len(row), row)
                    continue

                # pad the optional trailing columns so the unpacking below
                # always sees 17 fields
                if 17 > len(row) >= 15:
                    row += [""] * (17 - len(row))

                (dbase, gene_num, gene_symbol, qualifier, go_id, ref,
                 eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
                 object_type, taxon, date, assigned_by, annotation_extension,
                 gene_product_form_id) = row

                # test for required fields
                if (dbase == '' or gene_num == '' or gene_symbol == ''
                        or go_id == '' or ref == '' or eco_symbol == ''
                        or aspect == '' or object_type == '' or taxon == ''
                        or date == '' or assigned_by == ''):
                    # the row text is passed as an argument, never spliced
                    # into the format string: a '%' in the data would
                    # otherwise corrupt the log call
                    LOG.error(
                        "Missing required part of annotation on row %d:\n%s",
                        line_counter, '\t'.join(row))
                    continue

                # deal with qualifier NOT, contributes_to, colocalizes_with
                # (negated annotations are dropped entirely)
                if re.search(r'NOT', qualifier):
                    continue

                if dbase in self.localtt:
                    dbase = self.localtt[dbase]
                uniprotid = None
                gene_id = None
                if dbase == 'UniProtKB':
                    if id_map is not None and gene_num in id_map:
                        gene_id = id_map[gene_num]
                        uniprotid = ':'.join((dbase, gene_num))
                        (dbase, gene_num) = gene_id.split(':')
                        uniprot_hit += 1
                    else:
                        # LOG.warning(
                        #   "UniProt id %s  is without a 1:1 mapping to entrez/ensembl",
                        #    gene_num)
                        uniprot_miss += 1
                        continue
                else:
                    gene_num = gene_num.split(':')[-1]  # last
                    gene_id = ':'.join((dbase, gene_num))

                if self.test_mode and not (re.match(r'NCBIGene', gene_id)
                                           and int(gene_num) in self.test_ids):
                    continue

                model.addClassToGraph(gene_id, gene_symbol)
                if gene_name != '':
                    model.addDescription(gene_id, gene_name)
                if gene_synonym != '':
                    for syn in re.split(r'\|', gene_synonym):
                        model.addSynonym(gene_id, syn.strip())
                if re.search(r'\|', taxon):
                    # TODO add annotations with >1 taxon
                    LOG.info(">1 taxon (%s) on line %d.  skipping", taxon,
                             line_counter)
                else:
                    tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
                    geno.addTaxon(tax_id, gene_id)

                assoc = Assoc(graph, self.name)
                assoc.set_subject(gene_id)
                assoc.set_object(go_id)

                # eco_map is optional (defaults to None); indexing None would
                # raise TypeError, so treat a missing map as "not mapped"
                eco_id = None if eco_map is None else eco_map.get(eco_symbol)
                if eco_id is None:
                    LOG.error("Evidence code (%s) not mapped", eco_symbol)
                else:
                    assoc.add_evidence(eco_id)

                refs = re.split(r'\|', ref)
                for ref in refs:
                    ref = ref.strip()
                    if ref != '':
                        prefix = ref.split(':')[0]  # sidestep 'MGI:MGI:'
                        if prefix in self.localtt:
                            prefix = self.localtt[prefix]
                        ref = ':'.join((prefix, ref.split(':')[-1]))
                        refg = Reference(graph, ref)
                        if prefix == 'PMID':
                            ref_type = self.globaltt['journal article']
                            refg.setType(ref_type)
                        refg.addRefToGraph()
                        assoc.add_source(ref)

                # TODO add the source of the annotations from assigned by?

                # resolve() with mandatory=False is compared against its
                # input here: an unchanged value is treated as "no mapping"
                rel = self.resolve(aspect, mandatory=False)
                if rel is not None and aspect == rel:
                    if aspect == 'F' and re.search(r'contributes_to',
                                                   qualifier):
                        assoc.set_relationship(self.globaltt['contributes to'])
                    else:
                        LOG.error(
                            "Aspect: %s with qualifier: %s  is not recognized",
                            aspect, qualifier)
                elif rel is not None:
                    assoc.set_relationship(rel)
                    assoc.add_association_to_graph()
                else:
                    LOG.warning("No predicate for association \n%s\n",
                                str(assoc))

                # NOTE(review): the description is set after the association
                # was added to the graph above - confirm it is still emitted
                if uniprotid is not None:
                    assoc.set_description('Mapped from ' + uniprotid)
                # object_type should be one of:
                # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
                # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
                # If the precise product type is unknown,
                # gene_product should be used
                #######################################################################

                # Derive G2P Associations from IMP annotations
                # in version 2.1 Pipe will indicate 'OR'
                # and Comma will indicate 'AND'.
                # in version 2.0, multiple values are separated by pipes
                # where the pipe has been used to mean 'AND'
                if eco_symbol == 'IMP' and with_or_from != '':
                    withitems = re.split(r'\|', with_or_from)
                    phenotypeid = go_id + 'PHENOTYPE'
                    # create phenotype associations
                    for i in withitems:
                        if i == '' or re.match(
                                r'(UniProtKB|WBPhenotype|InterPro|HGNC)', i):
                            LOG.warning(
                                "Don't know what having a uniprot id " +
                                "in the 'with' column means of %s", uniprotid)
                            continue
                        i = re.sub(r'MGI\:MGI\:', 'MGI:', i)
                        i = re.sub(r'WB:', 'WormBase:', i)

                        # for worms and fish, they might give a RNAi or MORPH
                        # in these cases make a reagent-targeted gene
                        if re.search('MRPHLNO|CRISPR|TALEN', i):
                            if zfin is None:
                                # taxon 7955 was not configured; build the
                                # helper on first use instead of failing
                                zfin = ZFIN(
                                    self.graph_type, self.are_bnodes_skized)
                            targeted_gene_id = zfin.make_targeted_gene_id(
                                gene_id, i)
                            geno.addReagentTargetedGene(
                                i, gene_id, targeted_gene_id)
                            # TODO PYLINT why is this needed?
                            # Redefinition of assoc type from
                            # dipper.models.assoc.Association.Assoc to
                            # dipper.models.assoc.G2PAssoc.G2PAssoc
                            assoc = G2PAssoc(graph, self.name,
                                             targeted_gene_id, phenotypeid)
                        elif re.search(r'WBRNAi', i):
                            if wbase is None:
                                # taxon 6239 was not configured; build the
                                # helper on first use instead of failing
                                wbase = WormBase(
                                    self.graph_type, self.are_bnodes_skized)
                            targeted_gene_id = wbase.make_reagent_targeted_gene_id(
                                gene_id, i)
                            geno.addReagentTargetedGene(
                                i, gene_id, targeted_gene_id)
                            assoc = G2PAssoc(graph, self.name,
                                             targeted_gene_id, phenotypeid)
                        else:
                            assoc = G2PAssoc(graph, self.name, i, phenotypeid)
                        for ref in refs:
                            ref = ref.strip()
                            if ref != '':
                                prefix = ref.split(':')[0]
                                if prefix in self.localtt:
                                    prefix = self.localtt[prefix]
                                ref = ':'.join((prefix, ref.split(':')[-1]))
                                assoc.add_source(ref)
                                # experimental phenotypic evidence
                                assoc.add_evidence(self.globaltt[
                                    'experimental phenotypic evidence'])
                        assoc.add_association_to_graph()
                        # TODO should the G2PAssoc be
                        # the evidence for the GO assoc?

                if not self.test_mode and limit is not None and line_counter > limit:
                    break
            # report how many UniProtKB rows could be mapped through id_map
            uniprot_tot = (uniprot_hit + uniprot_miss)
            uniprot_per = 0.0
            if uniprot_tot != 0:
                uniprot_per = 100.0 * uniprot_hit / uniprot_tot
            LOG.info(
                "Uniprot: %.2f%% of %i benefited from the 1/4 day id mapping download",
                uniprot_per, uniprot_tot)
Exemplo n.º 29
0
    def _process_haplotype(self, hap_id, hap_label, chrom_num, chrom_pos,
                           context, risk_allele_frequency, mapped_gene,
                           so_ontology):
        """
        Add a haplotype, its member SNPs and their gene links to the graph.

        The label, chromosome, position, context and mapped-gene fields are
        each split on ';' and paired up by position.  Every SNP becomes a
        variant part of the haplotype.  SNP details are only added when the
        label, chromosome, position and context lists have the same length,
        and gene links only when there is exactly one mapped gene per SNP.

        :param hap_id: id of the haplotype individual
        :param hap_label: ';'-separated SNP labels, also used (stripped) as
            the haplotype label
        :param chrom_num: ';'-separated chromosome numbers
        :param chrom_pos: ';'-separated chromosome positions
        :param context: ';'-separated variant context terms
        :param risk_allele_frequency: stored as the haplotype description
            unless it is '' or 'NR'
        :param mapped_gene: ';'-separated gene symbols
        :param so_ontology: object exposing ``query()``; it is asked whether
            a context class is a subclass of 'gene_variant'
        :return: None
        """

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        geno = Genotype(graph)
        model = Model(graph)
        # add the feature to the graph
        hap_description = None
        if risk_allele_frequency not in ['', 'NR']:
            hap_description = str(
                risk_allele_frequency) + ' [risk allele frequency]'

        model.addIndividualToGraph(hap_id, hap_label.strip(),
                                   self.globaltt['haplotype'], hap_description)
        # the taxon key had been mangled to "H**o sapiens", which cannot be
        # a translation-table label; restored to the species name
        geno.addTaxon(self.globaltt["Homo sapiens"], hap_id)

        snp_labels = re.split(r';\s?', hap_label)
        chrom_nums = re.split(r';\s?', chrom_num)
        chrom_positions = re.split(r';\s?', chrom_pos)
        context_list = re.split(r';\s?', context)
        mapped_genes = re.split(r';\s?', mapped_gene)

        # Not having four "PAX5" as a list might be better, but it breaks unit tests
        # mapped_genes = list(set(mapped_genes)) # make uniq
        # snp_labels = list(set(snp_labels)) # make uniq

        snp_curies = list()

        for snp in snp_labels:
            snp_curie, snp_type = self._get_curie_and_type_from_id(snp)
            if snp_type is None:
                LOG.info('cant find type for SNP in %s', snp)
                # make blank node
                snp_curie = self.make_id(snp, "_")
                model.addLabel(snp_curie, snp)
            elif snp_curie[0] == '_':  # arrived an unlabeled blanknode
                model.addLabel(snp_curie, snp)

            graph.addTriple(hap_id, self.globaltt['has_variant_part'],
                            snp_curie)
            snp_curies.append(snp_curie)

        # courtesy http://stackoverflow.com/a/16720915
        # check lengths of multiple lists
        length = len(snp_curies)
        if not all(
                len(lst) == length for lst in
            [snp_labels, chrom_nums, chrom_positions, context_list]):
            LOG.warning(
                "Incongruous data field(s) for haplotype %s \n "
                "will not add snp details", hap_label)
        else:
            # Genes are matched to SNPs by position, which only works when
            # the two lists line up.  Neither list changes inside the loop,
            # so decide (and warn) once instead of once per SNP.
            genes_match_snps = len(mapped_genes) == len(snp_labels)
            if not genes_match_snps:
                LOG.warning("More mapped genes than snps,"
                            " cannot disambiguate for\n%s\n%s",
                            mapped_genes, snp_labels)  # hap_label)

            variant_in_gene_count = 0
            for index, snp_curie in enumerate(snp_curies):
                self._add_snp_to_graph(snp_curie, snp_labels[index],
                                       chrom_nums[index],
                                       chrom_positions[index],
                                       context_list[index])

                if genes_match_snps:
                    so_class = self.resolve(context_list[index])
                    so_query = """
        SELECT ?variant_label
        WHERE {{
            {0} rdfs:subClassOf+ {1} ;
            rdfs:label ?variant_label .
        }}
                    """.format(so_class, self.globaltt['gene_variant'])

                    query_result = so_ontology.query(so_query)

                    gene_id = DipperUtil.get_hgnc_id_from_symbol(
                        mapped_genes[index])

                    # link only when the gene resolves and the context is
                    # confirmed (exactly one hit) as a kind of gene variant
                    if gene_id is not None and len(list(query_result)) == 1:
                        if context_list[index] in [
                                'upstream_gene_variant',
                                'downstream_gene_variant'
                        ]:
                            graph.addTriple(snp_curie,
                                            self.resolve(context_list[index]),
                                            gene_id)
                        else:
                            geno.addAffectedLocus(snp_curie, gene_id)
                            variant_in_gene_count += 1

            # Separate in case we want to apply a different relation
            # If not this is redundant with triples added above
            if len(mapped_genes) == variant_in_gene_count and \
                    len(set(mapped_genes)) == 1:
                gene_id = DipperUtil.get_hgnc_id_from_symbol(mapped_genes[0])
                geno.addAffectedLocus(hap_id, gene_id)
Exemplo n.º 30
0
    def _process_data(self, raw, limit=None):
        LOG.info("Processing Data from %s", raw)

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        geno = Genotype(graph)

        # Add the taxon as a class
        taxon_id = self.globaltt['Mus musculus']
        model.addClassToGraph(taxon_id, None)

        # with open(raw, 'r', encoding="utf8") as csvfile:
        col = self.files['all']['columns']
        with gzip.open(raw, 'rt') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            row = next(reader)  # presumed header
            if not self.check_fileheader(col, row):
                pass

            for row in reader:
                # | head -1 | tr ',' '\n' | sed "s|\(.*\)|# \1 = row[col.index('\1')]|g"
                marker_accession_id = row[col.index('marker_accession_id')].strip()
                marker_symbol = row[col.index('marker_symbol')].strip()
                phenotyping_center = row[col.index('phenotyping_center')].strip()
                colony_raw = row[col.index('colony_id')].strip()
                sex = row[col.index('sex')].strip()
                zygosity = row[col.index('zygosity')].strip()
                allele_accession_id = row[col.index('allele_accession_id')].strip()
                allele_symbol = row[col.index('allele_symbol')].strip()
                # allele_name = row[col.index('allele_name')]
                strain_accession_id = row[col.index('strain_accession_id')].strip()
                strain_name = row[col.index('strain_name')].strip()
                # project_name = row[col.index('project_name')]
                project_fullname = row[col.index('project_fullname')].strip()
                pipeline_name = row[col.index('pipeline_name')].strip()
                pipeline_stable_id = row[col.index('pipeline_stable_id')].strip()
                procedure_stable_id = row[col.index('procedure_stable_id')].strip()
                procedure_name = row[col.index('procedure_name')].strip()
                parameter_stable_id = row[col.index('parameter_stable_id')].strip()
                parameter_name = row[col.index('parameter_name')].strip()
                # top_level_mp_term_id = row[col.index('top_level_mp_term_id')]
                # top_level_mp_term_name = row[col.index('top_level_mp_term_name')]
                mp_term_id = row[col.index('mp_term_id')].strip()
                mp_term_name = row[col.index('mp_term_name')].strip()
                p_value = row[col.index('p_value')].strip()
                percentage_change = row[col.index('percentage_change')].strip()
                effect_size = row[col.index('effect_size')].strip()
                statistical_method = row[col.index('statistical_method')].strip()
                resource_name = row[col.index('resource_name')].strip()

                if self.test_mode and marker_accession_id not in self.gene_ids:
                    continue

                # ##### cleanup some of the identifiers ######
                zygosity = zygosity.strip()
                zygosity_id = self.resolve(zygosity)
                if zygosity_id == zygosity:
                    LOG.warning(
                        "Zygosity '%s' unmapped. detting to indeterminate", zygosity)
                    zygosity_id = self.globaltt['indeterminate']

                # colony ids sometimes have <> in them, spaces,
                # or other non-alphanumerics and break our system;
                # replace these with underscores
                colony_id = '_:' + re.sub(r'\W+', '_', colony_raw)

                if not re.match(r'MGI', allele_accession_id):
                    allele_accession_id = '_:IMPC-'+re.sub(
                        r':', '', allele_accession_id)

                if re.search(r'EUROCURATE', strain_accession_id):
                    # the eurocurate links don't resolve at IMPC
                    # TODO blank nodes do not maintain identifiers
                    strain_accession_id = '_:' + strain_accession_id

                elif not re.match(r'MGI', strain_accession_id):
                    LOG.info(
                        "Found a strange strain accession...%s", strain_accession_id)
                    strain_accession_id = 'IMPC:'+strain_accession_id

                ######################
                # first, add the marker and variant to the graph as with MGI,
                # the allele is the variant locus.  IF the marker is not known,
                # we will call it a sequence alteration.  otherwise,
                # we will create a BNode for the sequence alteration.
                sequence_alteration_id = variant_locus_id = None
                variant_locus_name = sequence_alteration_name = None

                # extract out what's within the <> to get the symbol
                if re.match(r'.*<.*>', allele_symbol):
                    sequence_alteration_name = re.match(
                        r'.*<(.*)>', allele_symbol)
                    if sequence_alteration_name is not None:
                        sequence_alteration_name = sequence_alteration_name.group(1)
                else:
                    sequence_alteration_name = allele_symbol

                if marker_accession_id is not None and marker_accession_id == '':
                    LOG.warning("Marker unspecified on row %d", reader.line_num)
                    marker_accession_id = None

                if marker_accession_id is not None:
                    variant_locus_id = allele_accession_id
                    variant_locus_name = allele_symbol
                    variant_locus_type = self.globaltt['variant_locus']
                    geno.addGene(
                        marker_accession_id, marker_symbol, self.globaltt['gene'])

                    geno.addAllele(
                        variant_locus_id, variant_locus_name, variant_locus_type, None)
                    geno.addAlleleOfGene(variant_locus_id, marker_accession_id)

                    # TAG bnode
                    sequence_alteration_id = '_:seqalt' + re.sub(
                        r':', '', allele_accession_id)
                    geno.addSequenceAlterationToVariantLocus(
                        sequence_alteration_id, variant_locus_id)

                else:
                    sequence_alteration_id = allele_accession_id

                # IMPC contains targeted mutations with either gene traps,
                # knockouts, insertion/intragenic deletions.
                # but I don't really know what the SeqAlt is here,
                # so I don't add it.
                geno.addSequenceAlteration(
                    sequence_alteration_id, sequence_alteration_name)

                # #############    BUILD THE COLONY    #############
                # First, let's describe the colony that the animals come from
                # The Colony ID refers to the ES cell clone
                #   used to generate a mouse strain.
                # Terry sez: we use this clone ID to track
                #   ES cell -> mouse strain -> mouse phenotyping.
                # The same ES clone maybe used at multiple centers,
                # so we have to concatenate the two to have a unique ID.
                # some useful reading about generating mice from ES cells:
                # http://ki.mit.edu/sbc/escell/services/details

                # here, we'll make a genotype
                # that derives from an ES cell with a given allele.
                # the strain is not really attached to the colony.

                # the colony/clone is reflective of the allele,  with unknown zygosity

                stem_cell_class = self.globaltt['embryonic stem cell line']

                if colony_id is None:
                    print(colony_raw, stem_cell_class, "\nline:\t", reader.line_num)
                model.addIndividualToGraph(colony_id, colony_raw, stem_cell_class)

                # vslc of the colony has unknown zygosity
                # note that we will define the allele
                # (and it's relationship to the marker, etc.) later
                # FIXME is it really necessary to create this vslc
                # when we always know it's unknown zygosity?
                vslc_colony = '_:'+re.sub(
                    r':', '', allele_accession_id + self.globaltt['indeterminate'])
                vslc_colony_label = allele_symbol + '/<?>'
                # for ease of reading, we make the colony genotype variables.
                # in the future, it might be desired to keep the vslcs
                colony_genotype_id = vslc_colony
                colony_genotype_label = vslc_colony_label
                geno.addGenotype(colony_genotype_id, colony_genotype_label)
                geno.addParts(
                    allele_accession_id, colony_genotype_id,
                    self.globaltt['has_variant_part'])

                geno.addPartsToVSLC(
                    vslc_colony, allele_accession_id, None,
                    self.globaltt['indeterminate'], self.globaltt['has_variant_part'])
                graph.addTriple(
                    colony_id, self.globaltt['has_genotype'], colony_genotype_id)

                # ##########    BUILD THE ANNOTATED GENOTYPE    ##########
                # now, we'll build the genotype of the individual that derives
                # from the colony/clone genotype that is attached to
                # phenotype = colony_id + strain + zygosity + sex
                # (and is derived from a colony)

                # this is a sex-agnostic genotype
                genotype_id = self.make_id(
                    (colony_id + phenotyping_center + zygosity + strain_accession_id))
                geno.addSequenceDerivesFrom(genotype_id, colony_id)

                # build the VSLC of the sex-agnostic genotype
                # based on the zygosity
                allele1_id = allele_accession_id
                allele2_id = allele2_rel = None
                allele1_label = allele_symbol
                allele2_label = '<?>'
                # Making VSLC labels from the various parts,
                # can change later if desired.
                if zygosity == 'heterozygote':
                    allele2_label = re.sub(r'<.*', '<+>', allele1_label)
                    allele2_id = None
                elif zygosity == 'homozygote':
                    allele2_label = allele1_label
                    allele2_id = allele1_id
                    allele2_rel = self.globaltt['has_variant_part']
                elif zygosity == 'hemizygote':
                    allele2_label = re.sub(r'<.*', '<0>', allele1_label)
                    allele2_id = None
                elif zygosity == 'not_applicable':
                    allele2_label = re.sub(r'<.*', '<?>', allele1_label)
                    allele2_id = None
                else:
                    LOG.warning("found unknown zygosity %s", zygosity)
                    break
                vslc_name = '/'.join((allele1_label, allele2_label))

                # Add the VSLC
                vslc_id = '-'.join(
                    (marker_accession_id, allele_accession_id, zygosity))
                vslc_id = re.sub(r':', '', vslc_id)
                vslc_id = '_:'+vslc_id
                model.addIndividualToGraph(
                    vslc_id, vslc_name,
                    self.globaltt['variant single locus complement'])
                geno.addPartsToVSLC(
                    vslc_id, allele1_id, allele2_id, zygosity_id,
                    self.globaltt['has_variant_part'], allele2_rel)

                # add vslc to genotype
                geno.addVSLCtoParent(vslc_id, genotype_id)

                # note that the vslc is also the gvc
                model.addType(vslc_id, self.globaltt['genomic_variation_complement'])

                # Add the genomic background
                # create the genomic background id and name
                if strain_accession_id != '':
                    genomic_background_id = strain_accession_id
                else:
                    genomic_background_id = None

                genotype_name = vslc_name
                if genomic_background_id is not None:
                    geno.addGenotype(
                        genomic_background_id, strain_name,
                        self.globaltt['genomic_background'])

                    # make a phenotyping-center-specific strain
                    # to use as the background
                    pheno_center_strain_label = strain_name + '-' + phenotyping_center \
                        + '-' + colony_raw
                    pheno_center_strain_id = '-'.join((
                        re.sub(r':', '', genomic_background_id),
                        re.sub(r'\s', '_', phenotyping_center),
                        re.sub(r'\W+', '', colony_raw)))
                    if not re.match(r'^_', pheno_center_strain_id):
                        # Tag bnode
                        pheno_center_strain_id = '_:' + pheno_center_strain_id

                    geno.addGenotype(
                        pheno_center_strain_id, pheno_center_strain_label,
                        self.globaltt['genomic_background'])
                    geno.addSequenceDerivesFrom(
                        pheno_center_strain_id, genomic_background_id)

                    # Making genotype labels from the various parts,
                    # can change later if desired.
                    # since the genotype is reflective of the place
                    # it got made, should put that in to disambiguate
                    genotype_name = \
                        genotype_name + ' [' + pheno_center_strain_label + ']'
                    geno.addGenomicBackgroundToGenotype(
                        pheno_center_strain_id, genotype_id)
                    geno.addTaxon(taxon_id, pheno_center_strain_id)
                # this is redundant, but I'll keep it in for now
                geno.addSequenceDerivesFrom(genotype_id, colony_id)
                geno.addGenotype(genotype_id, genotype_name)

                # Make the sex-qualified genotype,
                # which is what the phenotype is associated with
                sex_qualified_genotype_id = \
                    self.make_id((
                        colony_id + phenotyping_center + zygosity +
                        strain_accession_id + sex))
                sex_qualified_genotype_label = genotype_name + ' (' + sex + ')'

                sq_type_id = self.resolve(sex, False)

                if sq_type_id == sex:
                    sq_type_id = self.globaltt['intrinsic_genotype']
                    LOG.warning(
                        "Unknown sex qualifier %s, adding as intrinsic_genotype",
                        sex)

                geno.addGenotype(
                    sex_qualified_genotype_id, sex_qualified_genotype_label, sq_type_id)
                geno.addParts(
                    genotype_id, sex_qualified_genotype_id,
                    self.globaltt['has_variant_part'])

                if genomic_background_id is not None and genomic_background_id != '':
                    # Add the taxon to the genomic_background_id
                    geno.addTaxon(taxon_id, genomic_background_id)
                else:
                    # no genomic background: add the taxon to the genotype itself
                    geno.addTaxon(taxon_id, genotype_id)

                # #############    BUILD THE G2P ASSOC    #############
                # from an old email dated July 23 2014:
                # Phenotypes associations are made to
                # imits colony_id+center+zygosity+gender

                # sometimes phenotype ids are missing.  (about 711 early 2020)
                if mp_term_id is None or mp_term_id == '':
                    LOG.warning(
                        "No phenotype id specified for row %d", reader.line_num)
                    continue
                # hard coded ECO code
                eco_id = self.globaltt['mutant phenotype evidence']

                # the association comes as a result of a g2p from
                # a procedure in a pipeline at a center and parameter tested

                assoc = G2PAssoc(
                    graph, self.name, sex_qualified_genotype_id, mp_term_id)
                assoc.add_evidence(eco_id)
                # assoc.set_score(float(p_value))

                # TODO add evidence instance using
                # pipeline_stable_id +
                # procedure_stable_id +
                # parameter_stable_id

                assoc.add_association_to_graph()
                assoc_id = assoc.get_association_id()

                model._addSexSpecificity(assoc_id, self.resolve(sex))

                # add a free-text description
                try:
                    description = ' '.join((
                        mp_term_name, 'phenotype determined by', phenotyping_center,
                        'in an', procedure_name, 'assay where', parameter_name.strip(),
                        'was measured with an effect_size of',
                        str(round(float(effect_size), 5)),
                        '(p =', "{:.4e}".format(float(p_value)), ').'))
                except ValueError:
                    description = ' '.join((
                        mp_term_name, 'phenotype determined by', phenotyping_center,
                        'in an', procedure_name, 'assay where', parameter_name.strip(),
                        'was measured with an effect_size of', str(effect_size),
                        '(p =', "{0}".format(p_value), ').'))

                study_bnode = self._add_study_provenance(
                    phenotyping_center, colony_raw, project_fullname, pipeline_name,
                    pipeline_stable_id, procedure_stable_id, procedure_name,
                    parameter_stable_id, parameter_name, statistical_method,
                    resource_name)

                evidence_line_bnode = self._add_evidence(
                    assoc_id, eco_id, p_value, percentage_change, effect_size,
                    study_bnode)

                self._add_assertion_provenance(assoc_id, evidence_line_bnode)

                model.addDescription(evidence_line_bnode, description)

                # resource_id = resource_name
                # assoc.addSource(graph, assoc_id, resource_id)

                if not self.test_mode and limit is not None and reader.line_num > limit:
                    break