def _process_phene_gene_row(self, row):
    """
    Link a gene to a phene through an anonymous "some variant of" locus.

    The gene and phene identifiers for ``row`` are looked up in
    ``self.id_hash``.  A blank-node variant locus of the gene is created and
    a genotype-to-phenotype association is added from that locus to the
    phene.  The gene is then recorded in ``self.annotated_genes``.

    Rows whose gene or phene is not in ``self.id_hash`` are skipped; a
    missing phene is logged.  In test mode a row is kept only when both its
    OMIA id and its raw gene id are listed in ``self.test_ids``.

    :param row: mapping with (at least) the keys 'gene_id' and 'phene_id'
    :return: None
    """
    geno = Genotype(self.graph)
    model = Model(self.graph)

    gene_id = self.id_hash['gene'].get(row['gene_id'])
    phene_id = self.id_hash['phene'].get(row['phene_id'])

    # occasionally some phenes are missing!  (ex: 406)
    # This has to be tested before the test-mode filter: folding the None
    # checks into that filter made the warning below unreachable.
    if phene_id is None:
        LOG.warning("Phene id %s is missing", str(row['phene_id']))
        return
    if gene_id is None:
        return

    # only resolve the OMIA id once the phene is known to exist
    omia_id = self._get_omia_id_from_phene_id(phene_id)
    if self.test_mode and not (
            omia_id in self.test_ids['disease']
            and row['gene_id'] in self.test_ids['gene']):
        return

    gene_label = self.label_hash[gene_id]

    # some variant of gene_id has phenotype d
    var = self.make_id(gene_id.split(':')[-1] + 'VL', '_')
    geno.addAllele(var, 'some variant of ' + gene_label)
    geno.addAlleleOfGene(var, gene_id)
    geno.addAffectedLocus(var, gene_id)
    model.addBlankNodeAnnotation(var)
    assoc = G2PAssoc(self.graph, self.name, var, phene_id)
    assoc.add_association_to_graph()

    # add the gene id to the set of annotated genes
    # for later lookup by orthology
    self.annotated_genes.add(gene_id)
def parse(self, limit=None):
    """
    Override Source.parse()

    Parses the chemical-disease associations file and, when it is present
    on disk, the publications batch-query file.

    Args:
        :param limit (int, optional) limit the number of rows processed
    Returns:
        :return None
    """
    if limit is not None:
        LOG.info("Only parsing first %d rows", limit)

    LOG.info("Parsing files...")

    if self.test_only:
        self.test_mode = True

    self.geno = Genotype(self.graph)
    self.pathway = Pathway(self.graph)

    self._parse_ctd_file(limit, 'chemical_disease_associations')

    # self._parse_ctd_file(limit, 'gene_pathway')
    # self._parse_ctd_file(limit, 'gene_disease')

    pubs_path = '/'.join((self.rawdir, self.api_fetch['publications']['file']))
    if not os.path.exists(pubs_path):
        LOG.error('Batch Query file "%s" does not exist', pubs_path)
    else:
        self._parse_curated_chem_disease(pubs_path, limit)

    LOG.info("Done parsing files.")
def _process_phene_gene_row(self, row):
    """
    Link a gene to a phene through an anonymous "some variant of" locus.

    The gene and phene identifiers for ``row`` are looked up in
    ``self.id_hash``.  A blank-node variant locus of the gene is created and
    a genotype-to-phenotype association is added from that locus to the
    phene.  The gene is then recorded in ``self.annotated_genes``.

    Rows whose gene or phene is not in ``self.id_hash`` are skipped; a
    missing phene is logged.  In test mode a row is kept only when both its
    OMIA id and its raw gene id are listed in ``self.test_ids``.

    :param row: mapping with (at least) the keys 'gene_id' and 'phene_id'
    :return: None
    """
    geno = Genotype(self.g)
    model = Model(self.g)

    gene_id = self.id_hash['gene'].get(row['gene_id'])
    phene_id = self.id_hash['phene'].get(row['phene_id'])

    # occasionally some phenes are missing!  (ex: 406)
    # This has to be tested before the test-mode filter: folding the None
    # checks into that filter made the warning below unreachable.
    if phene_id is None:
        logger.warning("Phene id %s is missing", str(row['phene_id']))
        return
    if gene_id is None:
        return

    # only resolve the OMIA id once the phene is known to exist
    omia_id = self._get_omia_id_from_phene_id(phene_id)
    if self.testMode and not (
            omia_id in self.test_ids['disease']
            and row['gene_id'] in self.test_ids['gene']):
        return

    gene_label = self.label_hash[gene_id]

    # some variant of gene_id has phenotype d
    vl = '_:' + re.sub(r'NCBIGene:', '', str(gene_id)) + 'VL'
    geno.addAllele(vl, 'some variant of ' + gene_label)
    geno.addAlleleOfGene(vl, gene_id)
    geno.addAffectedLocus(vl, gene_id)
    model.addBlankNodeAnnotation(vl)
    assoc = G2PAssoc(self.g, self.name, vl, phene_id)
    assoc.add_association_to_graph()

    # add the gene id to the set of annotated genes
    # for later lookup by orthology
    self.annotated_genes.add(gene_id)
def _add_snp_gene_relation(self, snp_id, snp_gene_nums,
                           upstream_gene_num, downstream_gene_num):
    """
    Relate a SNP to the genes it affects and to its flanking genes.

    :param snp_id: identifier of the SNP (subject of every triple added)
    :param snp_gene_nums: comma-separated NCBI gene numbers, or ''
    :param upstream_gene_num: NCBI gene number of the upstream gene, or ''
    :param downstream_gene_num: NCBI gene number of the downstream gene,
        or ''
    :return: None
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    geno = Genotype(g)

    # add the feature as a sequence alteration
    # affecting various genes
    # note that intronic variations don't necessarily list
    # the genes such as for rs10448080  FIXME
    if snp_gene_nums != '':
        for s in re.split(r',', snp_gene_nums):
            s = s.strip()
            # still have to test for this,
            # because sometimes there's a leading comma
            if s != '':
                gene_id = 'NCBIGene:' + s
                geno.addAffectedLocus(snp_id, gene_id)

    # add the up and downstream genes if they are available.
    # Each guard tests the number that is actually used to build the gene
    # id; guarding on the opposite one could emit a triple pointing at the
    # bare prefix 'NCBIGene:'.
    if downstream_gene_num != '':
        # the SNP lies upstream of its downstream gene
        downstream_gene_id = 'NCBIGene:' + downstream_gene_num
        g.addTriple(
            snp_id,
            Feature.object_properties[r'upstream_of_sequence_of'],
            downstream_gene_id)

    if upstream_gene_num != '':
        # the SNP lies downstream of its upstream gene
        upstream_gene_id = 'NCBIGene:' + upstream_gene_num
        g.addTriple(
            snp_id,
            Feature.object_properties['downstream_of_sequence_of'],
            upstream_gene_id)
def __init__(self, graph_type, are_bnodes_skolemized):
    """
    Set up the 'decipher' ingest: register its metadata with the parent
    class, pick up the disease test ids and create the graph helpers.

    :param graph_type: passed through to the parent constructor
    :param are_bnodes_skolemized: passed through to the parent constructor
    """
    super().__init__(
        graph_type,
        are_bnodes_skolemized,
        'decipher',
        ingest_title='Development Disorder Genotype Phenotype Database',
        ingest_url='https://decipher.sanger.ac.uk/',
        license_url='https://decipher.sanger.ac.uk/legal',
        data_rights='https://decipher.sanger.ac.uk/datasharing',
        # file_handle=None
    )

    if 'disease' in self.all_test_ids:
        self.test_ids = self.all_test_ids['disease']
    else:
        LOG.warning("not configured with disease test ids.")
        self.test_ids = []

    # kept from the original; presumably a no-op once the parent has set it
    self.graph = self.graph
    self.geno = Genotype(self.graph)
    self.model = Model(self.graph)
    self.graph_type = graph_type
    self.are_bnodes_skolemized = are_bnodes_skolemized
def __init__(self, graph_type, are_bnodes_skolemized):
    """
    Set up the 'ctd' ingest: register its metadata with the parent class,
    pick up the gene and disease test ids and create the graph helpers.

    :param graph_type: passed through to the parent constructor
    :param are_bnodes_skolemized: passed through to the parent constructor
    """
    super().__init__(
        graph_type,
        are_bnodes_skolemized,
        'ctd',
        ingest_title='Comparative Toxicogenomics Database',
        ingest_url='http://ctdbase.org',
        license_url=None,
        data_rights='http://ctdbase.org/about/legal.jsp'
        # file_handle=None
    )

    if 'gene' in self.all_test_ids:
        self.test_geneids = self.all_test_ids['gene']
    else:
        LOG.warning("not configured with gene test ids.")
        self.test_geneids = []

    if 'disease' in self.all_test_ids:
        self.test_diseaseids = self.all_test_ids['disease']
    else:
        LOG.warning("not configured with disease test ids.")
        self.test_diseaseids = []

    self.geno = Genotype(self.graph)
    self.pathway = Pathway(self.graph)
def _add_snp_gene_relation(self, snp_id, snp_gene_nums,
                           upstream_gene_num, downstream_gene_num):
    """
    Relate a SNP to the genes it affects and to its flanking genes.

    :param snp_id: identifier of the SNP (subject of every triple added)
    :param snp_gene_nums: comma-separated Ensembl gene ids, or ''
    :param upstream_gene_num: Ensembl id of the upstream gene, or ''
    :param downstream_gene_num: Ensembl id of the downstream gene, or ''
    :return: None
    """
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    geno = Genotype(graph)

    # add the feature as a sequence alteration
    # affecting various genes
    # note that intronic variations don't necessarily list
    # the genes such as for rs10448080  FIXME
    if snp_gene_nums != '':
        for geneid in re.split(r',', snp_gene_nums):
            geneid = geneid.strip()
            # still have to test for this,
            # because sometimes there's a leading comma
            if geneid != '':
                geno.addAffectedLocus(snp_id, 'ENSEMBL:' + geneid)

    # add the up and downstream genes if they are available.
    # Each guard tests the id that is actually used to build the gene
    # curie; guarding on the opposite one could emit a triple pointing at
    # the bare prefix 'ENSEMBL:'.
    if downstream_gene_num != '':
        # the SNP lies upstream of its downstream gene
        downstream_gene_id = 'ENSEMBL:' + downstream_gene_num
        graph.addTriple(
            snp_id,
            self.globaltt['is upstream of sequence of'],
            downstream_gene_id)

    if upstream_gene_num != '':
        # the SNP lies downstream of its upstream gene
        upstream_gene_id = 'ENSEMBL:' + upstream_gene_num
        graph.addTriple(
            snp_id,
            self.globaltt['is downstream of sequence of'],
            upstream_gene_id)
def __init__(self, graph_type, are_bnodes_skolemized):
    """
    Set up the 'ctd' ingest: create its Dataset record, read the gene and
    disease test ids from the configuration and create the graph helpers.

    :param graph_type: passed through to the parent constructor
    :param are_bnodes_skolemized: passed through to the parent constructor
    """
    super().__init__(graph_type, are_bnodes_skolemized, 'ctd')
    self.dataset = Dataset(
        'ctd',
        'CTD',
        'http://ctdbase.org',
        None,
        'http://ctdbase.org/about/legal.jsp')

    if 'test_ids' in config.get_config() \
            and 'gene' in config.get_config()['test_ids']:
        self.test_geneids = config.get_config()['test_ids']['gene']
    else:
        logger.warning("not configured with gene test ids.")
        self.test_geneids = []

    if 'test_ids' in config.get_config() \
            and 'disease' in config.get_config()['test_ids']:
        self.test_diseaseids = config.get_config()['test_ids']['disease']
    else:
        logger.warning("not configured with disease test ids.")
        self.test_diseaseids = []

    self.g = self.graph
    self.geno = Genotype(self.graph)
    self.pathway = Pathway(self.graph)
def _build_gene_disease_model(self, gene_id, relation_id, disease_id,
                              variant_label, consequence_predicate=None,
                              consequence_id=None, allelic_requirement=None,
                              pmids=None):
    """
    Builds gene variant disease model

    When both a consequence predicate and a consequence id are supplied,
    the association subject is a blank-node variant locus of the gene;
    otherwise the gene itself is the subject and, if given, the allelic
    requirement is attached to the association.

    :return: None
    """
    model = Model(self.graph)
    geno = Genotype(self.graph)

    sources = pmids if pmids is not None else []
    variant_bnode = self.make_id(variant_label, "_")
    has_consequence = (
        consequence_predicate is not None and consequence_id is not None)

    subject_id = gene_id
    if has_consequence:
        model.addTriple(variant_bnode, consequence_predicate, consequence_id)
        # Hack to add labels to terms that
        # don't exist in an ontology
        if consequence_id.startswith(':'):
            readable = consequence_id.strip(':').replace('_', ' ')
            model.addLabel(consequence_id, readable)

        subject_id = variant_bnode
        # Typically we would type the variant using the
        # molecular consequence, but these are not specific
        # enough for us to make mappings (see translation table)
        model.addIndividualToGraph(
            variant_bnode, variant_label, self.globaltt['variant_locus'])
        geno.addAffectedLocus(variant_bnode, gene_id)
        model.addBlankNodeAnnotation(variant_bnode)

    assoc = G2PAssoc(
        self.graph, self.name, subject_id, disease_id, relation_id)
    assoc.source = sources
    assoc.add_association_to_graph()

    if allelic_requirement is None or has_consequence:
        return

    model.addTriple(
        assoc.assoc_id,
        self.globaltt['has_allelic_requirement'],
        allelic_requirement)
    if allelic_requirement.startswith(':'):
        model.addLabel(
            allelic_requirement,
            allelic_requirement.strip(':').replace('_', ' '))
def process_disease_association(self, limit):
    """
    Turn the tab-separated disease association file into
    "worm model of disease" associations.

    Each kept row produces an anonymous "some variant of <gene>" locus, an
    experimental model built from it, and a G2P association from that
    model to the row's disease id, with the row's reference as source and
    (for IEA rows) an evidence code.

    Header lines (starting with '!') and rows flagged 'NOT' are skipped.
    In test mode only genes listed in ``self.test_ids['gene']`` are kept.

    :param limit: outside test mode, stop once more than this many data
        rows have been read; ``None`` means no limit
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['disease_assoc']['file']))

    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    model = Model(g)
    logger.info("Processing disease models")
    geno = Genotype(g)
    line_counter = 0
    worm_taxon = 'NCBITaxon:6239'
    with open(raw, 'r') as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            if re.match(r'!', ''.join(row)):  # header
                continue
            line_counter += 1
            (db, gene_num, gene_symbol, is_not, disease_id, ref,
             eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
             gene_class, taxon, date, assigned_by, blank, blank2) = row

            if self.testMode and gene_num not in self.test_ids['gene']:
                continue

            # TODO add NOT phenotypes
            if is_not == 'NOT':
                continue

            # WB WBGene00000001 aap-1 DOID:2583 PMID:19029536 IEA
            # ENSEMBL:ENSG00000145675|OMIM:615214 D Y110A7A.10 gene
            # taxon:6239 20150612 WB
            gene_id = 'WormBase:' + gene_num

            # make a variant of the gene
            vl = '_:' + '-'.join((gene_num, 'unspecified'))
            vl_label = 'some variant of ' + gene_symbol
            geno.addAffectedLocus(vl, gene_id)
            model.addBlankNodeAnnotation(vl)
            animal_id = geno.make_experimental_model_with_genotype(
                vl, vl_label, worm_taxon, 'worm')

            assoc = G2PAssoc(
                g, self.name, animal_id,
                disease_id, model.object_properties['model_of'])
            ref = re.sub(r'WB_REF:', 'WormBase:', ref)
            if ref != '':
                assoc.add_source(ref)
            eco_id = None
            if eco_symbol == 'IEA':
                eco_id = 'ECO:0000501'  # IEA is this now
            if eco_id is not None:
                assoc.add_evidence(eco_id)

            assoc.add_association_to_graph()

            # honour the row limit; it was previously accepted and counted
            # but never applied
            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

    return
def setUp(self):
    """
    Build the fixtures shared by the tests: an RDF graph, a Genotype bound
    to it, a curie utility and the URIs of the category predicate and of
    the two category terms checked by the tests.
    """
    self.graph = RDFGraph()
    self.curie_map = curie_map.get()
    self.genotype = Genotype(self.graph)
    self.cutil = CurieUtil(self.curie_map)

    category_term = blv.terms['category']
    self.test_cat_pred = self.cutil.get_uri(category_term)

    genotype_term = blv.terms['Genotype']
    self.test_cat_genotype_category = self.cutil.get_uri(genotype_term)

    population_term = blv.terms['PopulationOfIndividualOrganisms']
    self.test_cat_background_category = self.cutil.get_uri(population_term)
def parse(self, limit=None):
    """
    Process the trait mappings, then, for every organism in a fixed list,
    its genomic-location ('<organism>_bp') and genetic-location
    ('<organism>_cm') QTL files when they are configured in ``self.files``.

    :param limit: row limit handed on to each per-file processor
    :return: None
    """
    if limit is not None:
        logger.info("Only parsing first %s rows fo each file", str(limit))

    logger.info("Parsing files...")

    if self.testOnly:
        self.testMode = True
        g = self.testgraph
    else:
        g = self.graph

    trait_map_path = '/'.join(
        (self.rawdir, self.files['trait_mappings']['file']))
    self._process_trait_mappings(trait_map_path, limit)

    geno = Genotype(g)

    # organisms = ['chicken']
    organisms = [
        'chicken', 'pig', 'horse', 'rainbow_trout', 'sheep', 'cattle']

    for common_name in organisms:
        tax_id = self._get_tax_by_common_name(common_name)
        geno.addGenome(tax_id, common_name)
        build_id = None

        bp_key = common_name + '_bp'
        if bp_key in self.files:
            bp_file = self.files[bp_key]['file']
            # the build abbreviation is embedded in the file name
            found = re.search(r'QTL_([\w\.]+)\.gff.txt.gz', bp_file)
            build = None
            if found is not None:
                build = found.group(1)
                build_id = self._map_build_by_abbrev(build)
                logger.info("Build = %s", build_id)
                geno.addReferenceGenome(build_id, build, tax_id)
            else:
                logger.error("Can't match a gff build")

            if build_id is not None:
                self._process_QTLs_genomic_location(
                    '/'.join((self.rawdir, bp_file)),
                    tax_id, build_id, build, limit)

        cm_key = common_name + '_cm'
        if cm_key in self.files:
            cm_file = self.files[cm_key]['file']
            self._process_QTLs_genetic_location(
                '/'.join((self.rawdir, cm_file)),
                tax_id, common_name, limit)

    logger.info("Finished parsing")
    return
def _make_pheno_assoc(self, g, gene_id, gene_symbol, disorder_num,
                      disorder_label, phene_key):
    """
    Add a gene-to-disorder association to graph ``g``.

    The relationship is chosen from the first character of the disorder
    label: '[' gives "is a marker for", '{' and '?' give "contributes to",
    anything else gives the default "has phenotype" ("causes").  For an
    OMIM gene id the association subject is an anonymous variant locus of
    that gene; any other gene id is used as the subject directly.

    :param g: graph to write to
    :param gene_id: gene curie
    :param gene_symbol: gene symbol, used to label the anonymous locus
    :param disorder_num: OMIM number of the disorder
    :param disorder_label: disorder label, possibly with a leading marker
    :param phene_key: phene mapping code, mapped to an evidence code
    :return: None
    """
    geno = Genotype(g)
    model = Model(g)

    disorder_id = ':'.join(('OMIM', disorder_num))

    # default relationship, overridden by a leading marker character
    rel_id = model.object_properties['has_phenotype']
    rel_label = 'causes'
    marked_relations = (
        (r'\[', 'is_marker_for', 'is a marker for'),
        (r'\{', 'contributes_to', 'contributes to'),
        # this is a questionable mapping!  skip?
        (r'\?', 'contributes_to', 'contributes to'),
    )
    for marker, prop_key, label in marked_relations:
        if re.match(marker, disorder_label):
            rel_id = model.object_properties[prop_key]
            rel_label = label
            break

    evidence = self._map_phene_mapping_code_to_eco(phene_key)

    # we actually want the association between the gene and the disease
    # to be via an alternate locus not the "wildtype" gene itself.
    # so we make an anonymous alternate locus,
    # and put that in the association.
    # but we only need to do that in the cases when it's not an NCBIGene
    # (as that is a sequence feature itself)
    if not re.match(r'OMIM:', gene_id):
        # assume it's already been added
        alt_locus = gene_id
    else:
        alt_locus = \
            '_:' + re.sub(r':', '', gene_id) + '-' + disorder_num + 'VL'
        symbol = gene_symbol.strip()
        if symbol:
            alt_label = ' '.join(
                ('some variant of', symbol, 'that', rel_label,
                 disorder_label))
        else:
            alt_label = None

        model.addIndividualToGraph(
            alt_locus, alt_label, Genotype.genoparts['variant_locus'])
        geno.addAffectedLocus(alt_locus, gene_id)
        model.addBlankNodeAnnotation(alt_locus)

    assoc = G2PAssoc(g, self.name, alt_locus, disorder_id, rel_id)
    assoc.add_evidence(evidence)
    assoc.add_association_to_graph()

    return
def _process_all(self, limit):
    """
    This takes the list of omim identifiers from the omimTitles file,
    excludes those designated as obsolete and iteratively queries the omim
    api in batches of 20 for the json-formatted data.

    This will create OMIM classes, with the label & definition.
    If an entry is "removed", it is added as a deprecated class.
    If an entry is "moved", it is deprecated and consider annotations
    are added.

    Additionally, we extract:
    *phenotypicSeries ids as superclasses
    *equivalent ids for Orphanet and UMLS

    If set to testMode, it will write only those items in the test_ids
    to the testgraph.

    :param limit: passed through to ``self.process_entries``
    :return: None
    """
    omimids = list(self.omim_type.keys() - self.omim_replaced.keys())

    LOG.info(
        'Have %i omim numbers to fetch records from their API', len(omimids))
    LOG.info('Have %i omim types ', len(self.omim_type))

    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph

    geno = Genotype(graph)
    model = Model(graph)

    # The label is also the key into the global translation table, so it
    # must be the exact species name (the literal had been garbled to
    # 'H**o sapiens').
    tax_label = 'Homo sapiens'
    tax_id = self.globaltt[tax_label]

    # add genome and taxon
    geno.addGenome(tax_id, tax_label)
    model.addClassToGraph(tax_id, tax_label)

    includes = set()
    includes.add('all')

    self.process_entries(
        omimids, self._transform_entry, includes, graph, limit)

    # since we are not fetching obsolete records any more
    # add them all in here
    for omim_id in self.omim_replaced:
        model.addDeprecatedClass(
            'OMIM:' + omim_id,
            ['OMIM:' + o for o in self.omim_replaced[omim_id]])
def _process_gene_row(self, row):
    """
    Register one gene row: remember its curie and label in the lookup
    hashes, add the gene as a class when it has a gene type, and attach
    its taxon.

    In test mode, rows whose gene id is not in ``self.test_ids['gene']``
    are ignored.

    :param row: mapping with the keys 'gene_id', 'symbol',
        'gb_species_id' and 'gene_type'
    :return: None
    """
    model = Model(self.graph)
    geno = Genotype(self.graph)

    if self.test_mode and row['gene_id'] not in self.test_ids['gene']:
        return

    raw_gene_id = row['gene_id']
    gene_curie = 'NCBIGene:' + str(raw_gene_id)
    self.id_hash['gene'][raw_gene_id] = gene_curie

    symbol = row['symbol']
    self.label_hash[gene_curie] = symbol

    taxon_curie = 'NCBITaxon:' + str(row['gb_species_id'])

    if row['gene_type'] is not None:
        gene_type_id = self.resolve(row['gene_type'])
        model.addClassToGraph(gene_curie, symbol, gene_type_id)

    geno.addTaxon(taxon_curie, gene_curie)
def process_gene_ids(self, limit):
    """
    Read the gzipped, comma-separated gene id file and add every gene as a
    class with its taxon and synonym; genes flagged 'Dead' are also marked
    as deprecated.

    Rows whose column count differs from the configured column list are
    logged and skipped.

    :param limit: stop once the reader's line number exceeds this value;
        ``None`` means no limit
    :return: None
    """
    src_key = 'gene_ids'
    raw = '/'.join((self.rawdir, self.files[src_key]['file']))
    graph = self.graph
    model = Model(graph)
    geno = Genotype(graph)
    col = self.files[src_key]['columns']
    LOG.info("Processing: %s", self.files[src_key]['file'])

    # resolve the column positions once rather than once per row
    collen = len(col)
    taxon_num_at = col.index('taxon_num')
    gene_num_at = col.index('gene_num')
    gene_symbol_at = col.index('gene_symbol')
    gene_synonym_at = col.index('gene_synonym')
    live_at = col.index('live')

    with gzip.open(raw, 'rb') as csvfile:
        reader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""),
            delimiter=',', quotechar='\"')
        # no header row to check
        for row in reader:
            if len(row) != collen:
                LOG.error(
                    'In %s line %i expected %i colums but got %s.',
                    self.files[src_key]['file'],
                    reader.line_num, collen, row)
                # skip the malformed row; falling through (the former
                # `pass`) would index past the end of a short row
                continue

            taxon_num = row[taxon_num_at]
            gene_num = row[gene_num_at]
            gene_symbol = row[gene_symbol_at]
            gene_synonym = row[gene_synonym_at]
            live = row[live_at]
            # gene_type = row[col.index('gene_type')]

            # 6239,WBGene00000001,aap-1,Y110A7A.10,Live,protein_coding_gene

            taxon_curie = 'NCBITaxon:' + taxon_num
            gene_curie = 'WormBase:' + gene_num

            if gene_symbol == '':
                gene_symbol = gene_synonym  # these are not the same in my book tec.

            if gene_symbol == '':
                gene_symbol = None

            model.addClassToGraph(
                gene_curie, gene_symbol, self.globaltt['gene'])

            if live == 'Dead':
                model.addDeprecatedClass(
                    gene_curie, old_id_category=blv.terms['Gene'])

            geno.addTaxon(taxon_curie, gene_curie)

            if gene_synonym is not None and gene_synonym != '':
                model.addSynonym(gene_curie, gene_synonym)

            if limit is not None and reader.line_num > limit:
                break
def _add_variant_gene_relationship(self, variant_id, hgnc_symbol):
    """
    Declare ``variant_id`` an allele of the gene named by ``hgnc_symbol``.

    The gene id comes from ``self.gene_map``; when the symbol is not in
    the map an id is made up from the variant id and the symbol, and a
    warning is logged.

    :param variant_id: identifier of the variant
    :param hgnc_symbol: gene symbol to resolve
    :return: None
    """
    gu = GraphUtils(curie_map.get())
    geno = Genotype(self.graph)
    if hgnc_symbol in self.gene_map:
        gene_id = self.gene_map[hgnc_symbol]
    else:
        gene_id = self.make_cgd_id("{0}{1}".format(variant_id, hgnc_symbol))
        # Logger.warn is a deprecated alias (removed in Python 3.13);
        # use warning() and let logging do the formatting.
        logger.warning(
            "Can't map gene symbol %s to entrez ID", hgnc_symbol)
    gu.addClassToGraph(self.graph, gene_id, hgnc_symbol)
    geno.addAlleleOfGene(variant_id, gene_id)
    return
def _process_all(self, limit):
    """
    Fetch the omim identifiers, register the human genome and taxon, and
    hand every identifier to ``self.process_entries`` for transformation.

    From the original notes: this will create OMIM classes, with the
    label, definition, and some synonyms.  If an entry is "removed", it is
    added as a deprecated class.  If an entry is "moved", it is deprecated
    and consider annotations are added.  Additionally, we extract:
    *phenotypicSeries ids as superclasses
    *equivalent ids for Orphanet and UMLS

    In test mode the test graph is written to instead of the main graph.

    :param limit: passed through to ``self.process_entries``
    :return: None
    """
    # store the set of omim identifiers
    omimids = self._get_omim_ids()

    g = self.testgraph if self.testMode else self.graph

    geno = Genotype(g)
    model = Model(g)

    # tax_num = '9606'   # TODO PYLINT unused
    human_taxon = 'NCBITaxon:9606'

    # add genome and taxon
    # tax label can get added elsewhere
    geno.addGenome(human_taxon, 'Human')
    # label added elsewhere
    model.addClassToGraph(human_taxon, None)

    includes = {'all'}

    self.process_entries(omimids, self._transform_entry, includes, g, limit)

    return
def process_gene_ids(self, limit):
    """
    Read the gzipped, comma-separated gene id file and add every gene as a
    class with its taxon and synonym; genes flagged 'Dead' are also marked
    as deprecated.

    In test mode only genes in ``self.test_ids['gene']`` are kept and the
    row limit is not applied.

    :param limit: outside test mode, stop once more than this many rows
        have been read; ``None`` means no limit
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['gene_ids']['file']))

    graph = self.testgraph if self.testMode else self.graph

    model = Model(graph)
    logger.info("Processing: %s", self.files['gene_ids']['file'])
    geno = Genotype(graph)

    with gzip.open(raw, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""),
            delimiter=',', quotechar='\"')
        for line_counter, row in enumerate(filereader, start=1):
            # 6239,WBGene00000001,aap-1,Y110A7A.10,Live,protein_coding_gene
            (taxon_num, gene_num, gene_symbol,
             gene_synonym, live, _gene_type) = row

            if self.testMode and gene_num not in self.test_ids['gene']:
                continue

            taxon_id = 'NCBITaxon:' + taxon_num
            gene_id = 'WormBase:' + gene_num

            # prefer the symbol, then the synonym, else no label at all
            gene_symbol = gene_symbol or gene_synonym or None

            model.addClassToGraph(gene_id, gene_symbol, self.globaltt['gene'])
            if live == 'Dead':
                model.addDeprecatedClass(gene_id)
            geno.addTaxon(taxon_id, gene_id)
            if gene_synonym:
                model.addSynonym(gene_id, gene_synonym)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

    return
def _create_genome_builds(self):
    """
    Various resources will map variations to either UCSC (hg*)
    or to NCBI assemblies. Here we create the equivalences between them.
    Data taken from:
    https://genome.ucsc.edu/FAQ/FAQreleases.html#release1

    For every assembly key configured for a species, both the UCSC
    assembly and its mapped identifier from ``self.localtt`` are added as
    reference genomes and declared the same individual.  Keys without a
    ':' or without an entry in the local translation table are logged and
    skipped.

    :return: None
    """
    # TODO add more species

    graph = self.graph
    geno = Genotype(graph)
    model = Model(graph)
    logger.info("Adding equivalent assembly identifiers")
    for sp in self.species:
        tax_id = self.resolve(sp)
        txid_num = tax_id.split(':')[1]
        for key in self.files[txid_num]['assembly']:
            ucsc_id = key
            try:
                ucsc_label = ucsc_id.split(':')[1]
            except IndexError:
                logger.error(
                    '%s Assembly id:  "%s" is problematic', sp, key)
                continue
            if key in self.localtt:
                mapped_id = self.localtt[key]
            else:
                logger.error(
                    '%s Assembly id:  "%s" is not in local translation table',
                    sp, key)
                # without a mapping there is nothing to equate; carrying
                # on would use an unbound (or stale) mapped_id
                continue

            mapped_label = mapped_id.split(':')[1]

            mapped_label = 'NCBI build ' + str(mapped_label)
            geno.addReferenceGenome(ucsc_id, ucsc_label, tax_id)
            geno.addReferenceGenome(mapped_id, mapped_label, tax_id)
            model.addSameIndividual(ucsc_id, mapped_id)

    return
def parse(self, limit=None):
    """
    Process the DDG2P annotations, writing to the test graph when the
    source is configured as test-only.

    :param limit: row limit passed on to the annotation processor
    :return: None
    """
    if limit is not None:
        LOG.info("Only parsing first %s rows", limit)

    LOG.info("Parsing files...")

    test_only = self.test_only
    if test_only:
        self.test_mode = True
    self.graph = self.testgraph if test_only else self.graph

    self.geno = Genotype(self.graph)

    # rare disease-phenotype associations
    self._process_ddg2p_annotations(limit)

    LOG.info("Finished parsing.")

    return
def __init__(self, graph_type, are_bnodes_skolemized):
    """
    Set up the 'decipher' ingest: create its Dataset record, read the
    disease test ids from the configuration and create the graph helpers.

    :param graph_type: passed through to the parent constructor
    :param are_bnodes_skolemized: passed through to the parent constructor
    """
    super().__init__(graph_type, are_bnodes_skolemized, 'decipher')

    self.dataset = Dataset(
        'decipher',
        'Development Disorder Genotype – Phenotype Database',
        'https://decipher.sanger.ac.uk/',
        None,
        'https://decipher.sanger.ac.uk/legal')

    if 'test_ids' in config.get_config() \
            and 'disease' in config.get_config()['test_ids']:
        self.test_ids = config.get_config()['test_ids']['disease']
    else:
        logger.warning("not configured with disease test ids.")
        self.test_ids = []

    self.g = self.graph
    self.geno = Genotype(self.g)
    self.model = Model(self.g)

    return
def _parse_genepage2gene(self, limit) -> Dict[str, List[str]]:
    """
    Read the tab-separated GenePage-to-Gene file, add the tropicalis,
    laevis L and laevis S genes of every row to the graph, and collect
    their curies per gene page.

    :param limit: outside test mode, stop once the reader's line number
        exceeds this value; ``None`` means no limit
    :return: gene page id -> [tropicalis, laevis L, laevis S] curies
    """
    src_key = 'genepage2gene'
    columns = self.files[src_key]['columns']
    raw = '/'.join((self.rawdir, self.files[src_key]['file']))
    geno = Genotype(self.graph)
    genepage2gene = {}

    wanted = (
        'gene_page_id',
        'tropicalis_id', 'tropicalis_label',
        'laevis_l_id', 'laevis_l_label',
        'laevis_s_id', 'laevis_s_label',
    )
    # gene_page_label is present in the file but not used

    LOG.info("Processing GenePage to Gene file")
    with open(raw, 'r', encoding="utf8") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t')
        for row in reader:
            cell = {name: row[columns.index(name)] for name in wanted}

            curies = [
                'Xenbase:' + cell['tropicalis_id'],
                'Xenbase:' + cell['laevis_l_id'],
                'Xenbase:' + cell['laevis_s_id'],
            ]
            labels = [
                cell['tropicalis_label'],
                cell['laevis_l_label'],
                cell['laevis_s_label'],
            ]

            genepage2gene[cell['gene_page_id']] = curies

            for gene_curie, gene_label in zip(curies, labels):
                geno.addGene(gene_curie, gene_label)

            if not self.test_mode \
                    and limit is not None and reader.line_num > limit:
                break

    return genepage2gene
def parse(self, limit=None):
    """
    Override Source.parse()

    Parses the chemical-disease interactions, gene-pathway and
    gene-disease files, then the curated chemical-disease data, writing to
    the test graph when in test mode.

    Args:
        :param limit (int, optional) limit the number of rows processed
    Returns:
        :return None
    """
    if limit is not None:
        logger.info("Only parsing first %d rows", limit)

    logger.info("Parsing files...")
    # pub_map = dict()
    # file_path = '/'.join((self.rawdir,
    #                       self.static_files['publications']['file']))
    # if os.path.exists(file_path) is True:
    #     pub_map = self._parse_publication_file(
    #         self.static_files['publications']['file']
    #     )

    if self.testOnly:
        self.testMode = True

    self.g = self.testgraph if self.testMode else self.graph

    self.geno = Genotype(self.g)
    self.pathway = Pathway(self.g)

    for src_key in (
            'chemical_disease_interactions', 'gene_pathway', 'gene_disease'):
        self._parse_ctd_file(limit, self.files[src_key]['file'])

    self._parse_curated_chem_disease(limit)

    logger.info("Done parsing files.")

    return
def parse(self, limit=None):
    """
    Override Source.parse()

    Parses the chemical-disease associations file.

    Args:
        :param limit (int, optional) limit the number of rows processed
    Returns:
        :return None
    """
    if limit is not None:
        LOG.info("Only parsing first %d rows", limit)

    LOG.info("Parsing files...")

    if self.test_only:
        self.test_mode = True

    graph = self.graph
    self.geno = Genotype(graph)
    self.pathway = Pathway(self.graph)

    self._parse_ctd_file(limit, 'chemical_disease_associations')
def _add_gene_anatomy_association(self, gene_id, anatomy_curie, rank):
    """
    Add an "expressed in" association between a gene and an anatomy term,
    with the rank attached to the association as a float quantifier.

    :param gene_id: str Non curified ID
    :param anatomy_curie: str curified anatomy term
    :param rank: str rank; commas are removed before conversion to float
    :return: None
    """
    association = Assoc(self.graph, self.name)
    genotype = Genotype(self.graph)
    model = Model(self.graph)

    gene_curie = "ENSEMBL:{}".format(gene_id)
    rank_text = re.sub(r',', '', rank)

    model.addIndividualToGraph(
        ind_id=gene_curie, label=None, ind_type=genotype.genoparts['gene'])

    association.sub = gene_curie
    association.obj = anatomy_curie
    association.rel = Assoc.object_properties['expressed_in']
    association.add_association_to_graph()

    # the quantifier is attached once the association is in the graph
    quantifier_property = Assoc.datatype_properties['has_quantifier']
    association.add_predicate_object(
        quantifier_property, float(rank_text), 'Literal', 'xsd:float')

    return
def _add_g2p_assoc(self, graph, strain_id, sex, assay_id, phenotypes, comment):
    """
    Create an association between a sex-specific strain id
    and each of the phenotypes.
    Here, we create a genotype from the strain,
    and a sex-specific genotype.
    Each of those genotypes are created as anonymous nodes.

    The evidence code is hardcoded to be:
        ECO:experimental_phenotypic_evidence.

    When the strain has no label in ``self.idlabel_hash`` the strain id is
    used in the genotype labels instead.

    :param graph: graph to write to
    :param strain_id: curie of the strain
    :param sex: sex code; 'm' and 'f' select the male/female genotype
        type, anything else the generic sex-qualified type
    :param assay_id: added as evidence to every association
    :param phenotypes: a list of phenotypes to association with the
        strain, or None
    :param comment: comment attached to every association
    :return: None
    """
    geno = Genotype(graph)
    model = Model(graph)
    eco_id = self.globaltt['experimental phenotypic evidence']
    strain_label = self.idlabel_hash.get(strain_id)

    # strain genotype
    genotype_id = '_' + '-'.join((re.sub(r':', '', strain_id), 'genotype'))
    # the label lookup can come back empty; concatenating None would raise
    # TypeError, so fall back to the strain id as the label below does
    if strain_label is not None:
        genotype_label = '[' + strain_label + ']'
    else:
        genotype_label = '[' + strain_id + ']'

    sex_specific_genotype_id = '_' + '-'.join(
        (re.sub(r':', '', strain_id), sex, 'genotype'))
    if strain_label is not None:
        sex_specific_genotype_label = strain_label + ' (' + sex + ')'
    else:
        sex_specific_genotype_label = strain_id + '(' + sex + ')'

    genotype_type = self.globaltt['sex_qualified_genotype']
    if sex == 'm':
        genotype_type = self.globaltt['male_genotype']
    elif sex == 'f':
        genotype_type = self.globaltt['female_genotype']

    # add the genotype to strain connection
    geno.addGenotype(
        genotype_id, genotype_label, self.globaltt['genomic_background'])
    graph.addTriple(strain_id, self.globaltt['has_genotype'], genotype_id)

    geno.addGenotype(
        sex_specific_genotype_id, sex_specific_genotype_label, genotype_type)

    # add the strain as the background for the genotype
    graph.addTriple(
        sex_specific_genotype_id,
        self.globaltt['has_sex_agnostic_part'],
        genotype_id)

    # #############    BUILD THE G2P ASSOC    #############
    # TODO add more provenance info when that model is completed

    if phenotypes is not None:
        for phenotype_id in phenotypes:
            assoc = G2PAssoc(
                graph, self.name, sex_specific_genotype_id, phenotype_id)
            assoc.add_evidence(assay_id)
            assoc.add_evidence(eco_id)
            assoc.add_association_to_graph()
            assoc_id = assoc.get_association_id()
            model.addComment(assoc_id, comment)
            model._addSexSpecificity(assoc_id, self.resolve(sex))

    return
def process_gaf(self, file, limit, id_map=None, eco_map=None):
    """
    Read a gzipped, tab-separated gene association file and add its
    annotations to the graph.

    For each accepted row the gene is added as a class (with description,
    synonyms and taxon), and an association from the gene to the GO id is
    built with its evidence code and references.  Rows with evidence 'IMP'
    and a non-empty with/from column additionally produce G2P associations
    to a phenotype id derived from the GO id.

    Rows are skipped when they are comments, have fewer than 15 or more
    than 17 columns, lack a required field, carry a 'NOT' qualifier, or
    are UniProtKB rows without an entry in ``id_map``.

    :param file: path of the gzipped association file
    :param limit: outside test mode, stop once more than this many lines
        have been read; ``None`` means no limit
    :param id_map: optional mapping from UniProtKB accession to a gene
        curie of the form 'prefix:number'
    :param eco_map: mapping from evidence symbol to evidence id
        NOTE(review): indexed without a None check, so the default of None
        would raise TypeError (only KeyError is caught) -- confirm callers
        always supply it
    :return: None
    """
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph

    model = Model(graph)
    geno = Genotype(graph)
    LOG.info("Processing Gene Associations from %s", file)
    line_counter = 0
    uniprot_hit = 0
    uniprot_miss = 0
    # NOTE(review): zfin / wbase are only bound when the matching taxon is
    # configured, yet they are used below whenever a with/from item matches
    # their reagent patterns -- confirm that cannot happen for other taxa
    if '7955' in self.tax_ids:
        zfin = ZFIN(self.graph_type, self.are_bnodes_skized)
    if '6239' in self.tax_ids:
        wbase = WormBase(self.graph_type, self.are_bnodes_skized)

    with gzip.open(file, 'rb') as csvfile:
        filereader = csv.reader(io.TextIOWrapper(csvfile, newline=""), delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            # comments start with exclamation
            if re.match(r'!', ''.join(row)):
                continue

            if len(row) > 17 or len(row) < 15:
                LOG.warning(
                    "Wrong number of columns %i, expected 15 or 17\n%s", len(row), row)
                continue

            # pad 15- and 16-column rows with empty strings up to 17 columns
            if 17 > len(row) >= 15:
                row += [""] * (17 - len(row))

            (dbase, gene_num, gene_symbol, qualifier, go_id, ref, eco_symbol, with_or_from, aspect, gene_name, gene_synonym, object_type, taxon, date, assigned_by, annotation_extension, gene_product_form_id) = row

            # test for required fields
            if (dbase == '' or gene_num == '' or gene_symbol == '' or go_id == '' or ref == '' or eco_symbol == '' or aspect == '' or object_type == '' or taxon == '' or date == '' or assigned_by == ''):
                LOG.error(
                    "Missing required part of annotation on row %d:\n" + '\t'.join(row), line_counter)
                continue

            # deal with qualifier NOT, contributes_to, colocalizes_with
            if re.search(r'NOT', qualifier):
                continue

            if dbase in self.localtt:
                dbase = self.localtt[dbase]
            uniprotid = None
            gene_id = None
            if dbase == 'UniProtKB':
                if id_map is not None and gene_num in id_map:
                    # swap the UniProt accession for the mapped gene id,
                    # remembering the accession for the description below
                    gene_id = id_map[gene_num]
                    uniprotid = ':'.join((dbase, gene_num))
                    (dbase, gene_num) = gene_id.split(':')
                    uniprot_hit += 1
                else:
                    # LOG.warning(
                    #   "UniProt id %s is without a 1:1 mapping to entrez/ensembl",
                    #   gene_num)
                    uniprot_miss += 1
                    continue
            else:
                gene_num = gene_num.split(':')[-1]  # last
                gene_id = ':'.join((dbase, gene_num))

            if self.test_mode and not (re.match(r'NCBIGene', gene_id) and int(gene_num) in self.test_ids):
                continue

            model.addClassToGraph(gene_id, gene_symbol)
            if gene_name != '':
                model.addDescription(gene_id, gene_name)
            if gene_synonym != '':
                for syn in re.split(r'\|', gene_synonym):
                    model.addSynonym(gene_id, syn.strip())
            if re.search(r'\|', taxon):
                # TODO add annotations with >1 taxon
                LOG.info(">1 taxon (%s) on line %d. skipping", taxon, line_counter)
            else:
                tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
                geno.addTaxon(tax_id, gene_id)

            assoc = Assoc(graph, self.name)
            assoc.set_subject(gene_id)
            assoc.set_object(go_id)

            try:
                eco_id = eco_map[eco_symbol]
                assoc.add_evidence(eco_id)
            except KeyError:
                LOG.error("Evidence code (%s) not mapped", eco_symbol)

            # the reference column may hold several pipe-separated ids
            refs = re.split(r'\|', ref)
            for ref in refs:
                ref = ref.strip()
                if ref != '':
                    prefix = ref.split(':')[0]  # sidestep 'MGI:MGI:'
                    if prefix in self.localtt:
                        prefix = self.localtt[prefix]
                    ref = ':'.join((prefix, ref.split(':')[-1]))
                    refg = Reference(graph, ref)
                    if prefix == 'PMID':
                        ref_type = self.globaltt['journal article']
                        refg.setType(ref_type)
                    refg.addRefToGraph()
                    assoc.add_source(ref)

            # TODO add the source of the annotations from assigned by?

            rel = self.resolve(aspect, mandatory=False)
            # resolve() handing the aspect back unchanged is treated as
            # "no mapping found" -- NOTE(review): confirm against resolve()
            if rel is not None and aspect == rel:
                if aspect == 'F' and re.search(r'contributes_to', qualifier):
                    assoc.set_relationship(self.globaltt['contributes to'])
                else:
                    LOG.error(
                        "Aspect: %s with qualifier: %s is not recognized", aspect, qualifier)
            elif rel is not None:
                assoc.set_relationship(rel)
                assoc.add_association_to_graph()
            else:
                LOG.warning("No predicate for association \n%s\n", str(assoc))

            if uniprotid is not None:
                assoc.set_description('Mapped from ' + uniprotid)
            # object_type should be one of:
            # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
            # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
            # If the precise product type is unknown,
            # gene_product should be used
            #######################################################################

            # Derive G2P Associations from IMP annotations
            # in version 2.1 Pipe will indicate 'OR'
            # and Comma will indicate 'AND'.
            # in version 2.0, multiple values are separated by pipes
            # where the pipe has been used to mean 'AND'
            if eco_symbol == 'IMP' and with_or_from != '':
                withitems = re.split(r'\|', with_or_from)
                phenotypeid = go_id + 'PHENOTYPE'
                # create phenotype associations
                for i in withitems:
                    if i == '' or re.match(
                            r'(UniProtKB|WBPhenotype|InterPro|HGNC)', i):
                        LOG.warning(
                            "Don't know what having a uniprot id " +
                            "in the 'with' column means of %s", uniprotid)
                        continue
                    i = re.sub(r'MGI\:MGI\:', 'MGI:', i)
                    i = re.sub(r'WB:', 'WormBase:', i)

                    # for worms and fish, they might give a RNAi or MORPH
                    # in these cases make a reagent-targeted gene
                    if re.search('MRPHLNO|CRISPR|TALEN', i):
                        targeted_gene_id = zfin.make_targeted_gene_id(
                            gene_id, i)
                        geno.addReagentTargetedGene(
                            i, gene_id, targeted_gene_id)
                        # TODO PYLINT why is this needed?
                        # Redefinition of assoc type from
                        # dipper.models.assoc.Association.Assoc to
                        # dipper.models.assoc.G2PAssoc.G2PAssoc
                        assoc = G2PAssoc(graph, self.name, targeted_gene_id, phenotypeid)
                    elif re.search(r'WBRNAi', i):
                        targeted_gene_id = wbase.make_reagent_targeted_gene_id(
                            gene_id, i)
                        geno.addReagentTargetedGene(
                            i, gene_id, targeted_gene_id)
                        assoc = G2PAssoc(graph, self.name, targeted_gene_id, phenotypeid)
                    else:
                        assoc = G2PAssoc(graph, self.name, i, phenotypeid)
                    for ref in refs:
                        ref = ref.strip()
                        if ref != '':
                            prefix = ref.split(':')[0]
                            if prefix in self.localtt:
                                prefix = self.localtt[prefix]
                            ref = ':'.join((prefix, ref.split(':')[-1]))
                            assoc.add_source(ref)
                            # experimental phenotypic evidence
                            assoc.add_evidence(self.globaltt[
                                'experimental phenotypic evidence'])
                    assoc.add_association_to_graph()
                    # TODO should the G2PAssoc be
                    # the evidence for the GO assoc?

            if not self.test_mode and limit is not None and line_counter > limit:
                break
        # report how many UniProtKB rows could be mapped to a gene id
        uniprot_tot = (uniprot_hit + uniprot_miss)
        uniprot_per = 0.0
        if uniprot_tot != 0:
            uniprot_per = 100.0 * uniprot_hit / uniprot_tot
        LOG.info(
            "Uniprot: %.2f%% of %i benefited from the 1/4 day id mapping download", uniprot_per, uniprot_tot)
    return
def _process_haplotype(
        self, hap_id, hap_label, chrom_num, chrom_pos, context,
        risk_allele_frequency, mapped_gene, so_ontology):
    """
    Add a haplotype and its component SNPs to the graph.

    The haplotype is added as an individual typed as a haplotype, and every
    SNP named in ``hap_label`` is attached to it with ``has_variant_part``.
    When the per-SNP fields have matching lengths, each SNP is also passed
    to ``_add_snp_to_graph`` and, where its mapped gene can be resolved,
    linked to that gene.

    :param hap_id: identifier of the haplotype node
    :param hap_label: haplotype label; a ';'-separated list of SNP labels
    :param chrom_num: ';'-separated chromosomes, expected one per SNP
    :param chrom_pos: ';'-separated positions, expected one per SNP
    :param context: ';'-separated variant contexts, expected one per SNP
    :param risk_allele_frequency: becomes the haplotype description
        unless it is '' or 'NR'
    :param mapped_gene: ';'-separated gene symbols
    :param so_ontology: object whose ``query()`` accepts the SPARQL string
        built here (presumably the Sequence Ontology loaded as an RDF
        graph -- TODO confirm against the caller)
    :return: None
    """
    graph = self.testgraph if self.test_mode else self.graph
    geno = Genotype(graph)
    model = Model(graph)

    # add the feature to the graph
    hap_description = None
    if risk_allele_frequency not in ['', 'NR']:
        hap_description = str(
            risk_allele_frequency) + ' [risk allele frequency]'

    model.addIndividualToGraph(
        hap_id, hap_label.strip(), self.globaltt['haplotype'],
        hap_description)
    # the key must be the real taxon label; a masked spelling cannot be
    # found in the translation table
    geno.addTaxon(self.globaltt["Homo sapiens"], hap_id)

    separator = r';\s?'
    snp_labels = re.split(separator, hap_label)
    chrom_nums = re.split(separator, chrom_num)
    chrom_positions = re.split(separator, chrom_pos)
    context_list = re.split(separator, context)
    mapped_genes = re.split(separator, mapped_gene)

    # Not having four "PAX5" as a list might be better,
    # but it breaks unit tests
    # mapped_genes = list(set(mapped_genes))  # make uniq
    # snp_labels = list(set(snp_labels))  # make uniq

    snp_curies = []
    for snp in snp_labels:
        snp_curie, snp_type = self._get_curie_and_type_from_id(snp)
        if snp_type is None:
            LOG.info('cant find type for SNP in %s', snp)
            # make blank node
            snp_curie = self.make_id(snp, "_")
            model.addLabel(snp_curie, snp)
        elif snp_curie.startswith('_'):
            # arrived an unlabeled blanknode
            model.addLabel(snp_curie, snp)

        graph.addTriple(
            hap_id, self.globaltt['has_variant_part'], snp_curie)
        snp_curies.append(snp_curie)

    # check that the parallel per-SNP lists all have the same length
    # courtesy http://stackoverflow.com/a/16720915
    length = len(snp_curies)
    if not all(
            len(lst) == length
            for lst in [
                snp_labels, chrom_nums, chrom_positions, context_list]):
        LOG.warning(
            "Incongruous data field(s) for haplotype %s \n "
            "will not add snp details", hap_label)
        return

    # Genes can only be paired with SNPs positionally when the two lists
    # have the same length.  This does not change per SNP, so test it
    # (and warn about it) once rather than on every iteration.
    genes_ambiguous = bool(mapped_genes) and \
        len(mapped_genes) != len(snp_labels)
    if genes_ambiguous:
        LOG.warning(
            "More mapped genes than snps,"
            " cannot disambiguate for\n%s\n%s",
            mapped_genes, snp_labels)  # hap_label)

    variant_in_gene_count = 0
    per_snp = zip(
        snp_curies, snp_labels, chrom_nums, chrom_positions, context_list)
    for index, (snp_curie, snp_label, chrom, position, snp_context) \
            in enumerate(per_snp):
        self._add_snp_to_graph(
            snp_curie, snp_label, chrom, position, snp_context)

        if genes_ambiguous:
            continue

        so_class = self.resolve(snp_context)
        # doubled braces are literal braces after str.format()
        so_query = (
            " SELECT ?variant_label WHERE {{ {0} rdfs:subClassOf+ {1} ;"
            " rdfs:label ?variant_label . }} "
        ).format(so_class, self.globaltt['gene_variant'])
        query_result = so_ontology.query(so_query)

        gene_id = DipperUtil.get_hgnc_id_from_symbol(mapped_genes[index])

        # only link when the gene resolved and the query returned
        # exactly one row
        if gene_id is None or len(list(query_result)) != 1:
            continue

        if snp_context in [
                'upstream_gene_variant', 'downstream_gene_variant']:
            graph.addTriple(snp_curie, so_class, gene_id)
        else:
            geno.addAffectedLocus(snp_curie, gene_id)
            variant_in_gene_count += 1

    # Seperate in case we want to apply a different relation
    # If not this is redundant with triples added above
    if len(mapped_genes) == variant_in_gene_count and \
            len(set(mapped_genes)) == 1:
        gene_id = DipperUtil.get_hgnc_id_from_symbol(mapped_genes[0])
        geno.addAffectedLocus(hap_id, gene_id)
def _process_data(self, raw, limit=None):
    """
    Turn the gzipped, comma-separated genotype-phenotype file into triples.

    For every data row this adds the gene/allele (when a marker is given),
    the colony, the colony genotype, a sex-agnostic genotype, a
    sex-qualified genotype, and finally a genotype-to-phenotype
    association with its evidence and provenance.

    Rows are skipped when:
      * running in test mode and the marker is not in ``self.gene_ids``
      * the zygosity label is not one of the four handled here
      * the phenotype (MP) term id is empty; the genotype triples for
        such a row are still added, only the association is omitted

    :param raw: path of the gzip-compressed CSV file to read
    :param limit: when not None (and not in test mode), stop once the
        reader's line number exceeds this value
    :return: None
    """
    LOG.info("Processing Data from %s", raw)

    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)
    geno = Genotype(graph)

    # Add the taxon as a class
    taxon_id = self.globaltt['Mus musculus']
    model.addClassToGraph(taxon_id, None)

    col = self.files['all']['columns']

    # Column positions do not change between rows: look them up once
    # instead of calling col.index() for every field of every row.
    wanted = (
        'marker_accession_id', 'marker_symbol', 'phenotyping_center',
        'colony_id', 'sex', 'zygosity', 'allele_accession_id',
        'allele_symbol', 'strain_accession_id', 'strain_name',
        'project_fullname', 'pipeline_name', 'pipeline_stable_id',
        'procedure_stable_id', 'procedure_name', 'parameter_stable_id',
        'parameter_name', 'mp_term_id', 'mp_term_name', 'p_value',
        'percentage_change', 'effect_size', 'statistical_method',
        'resource_name')
    position = {name: col.index(name) for name in wanted}

    has_variant_part = self.globaltt['has_variant_part']

    # with open(raw, 'r', encoding="utf8") as csvfile:
    with gzip.open(raw, 'rt') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='\"')
        row = next(reader)  # presumed header
        # The result is deliberately ignored, parsing goes on regardless.
        # NOTE(review): presumably check_fileheader reports any mismatch
        # itself -- confirm.
        self.check_fileheader(col, row)

        for row in reader:
            rec = {name: row[pos].strip() for name, pos in position.items()}

            marker_accession_id = rec['marker_accession_id']
            marker_symbol = rec['marker_symbol']
            phenotyping_center = rec['phenotyping_center']
            colony_raw = rec['colony_id']
            sex = rec['sex']
            zygosity = rec['zygosity']
            allele_accession_id = rec['allele_accession_id']
            allele_symbol = rec['allele_symbol']
            strain_accession_id = rec['strain_accession_id']
            strain_name = rec['strain_name']
            project_fullname = rec['project_fullname']
            pipeline_name = rec['pipeline_name']
            pipeline_stable_id = rec['pipeline_stable_id']
            procedure_stable_id = rec['procedure_stable_id']
            procedure_name = rec['procedure_name']
            parameter_stable_id = rec['parameter_stable_id']
            parameter_name = rec['parameter_name']
            mp_term_id = rec['mp_term_id']
            mp_term_name = rec['mp_term_name']
            p_value = rec['p_value']
            percentage_change = rec['percentage_change']
            effect_size = rec['effect_size']
            statistical_method = rec['statistical_method']
            resource_name = rec['resource_name']

            if self.test_mode and marker_accession_id not in self.gene_ids:
                continue

            # ##### cleanup some of the identifiers ######

            zygosity_id = self.resolve(zygosity)
            if zygosity_id == zygosity:
                LOG.warning(
                    "Zygosity '%s' unmapped. detting to indeterminate",
                    zygosity)
                zygosity_id = self.globaltt['indeterminate']

            # colony ids sometimes have <> in them, spaces,
            # or other non-alphanumerics and break our system;
            # replace these with underscores
            colony_id = '_:' + re.sub(r'\W+', '_', colony_raw)

            if not re.match(r'MGI', allele_accession_id):
                allele_accession_id = '_:IMPC-' + re.sub(
                    r':', '', allele_accession_id)

            if re.search(r'EUROCURATE', strain_accession_id):
                # the eurocurate links don't resolve at IMPC
                # TODO blank nodes do not maintain identifiers
                strain_accession_id = '_:' + strain_accession_id
            elif not re.match(r'MGI', strain_accession_id):
                # NOTE(review): an empty strain accession also lands here
                # and becomes 'IMPC:', so the "no genomic background"
                # branch further down looks unreachable -- confirm intent.
                LOG.info(
                    "Found a strange strain accession...%s",
                    strain_accession_id)
                strain_accession_id = 'IMPC:' + strain_accession_id

            ######################
            # first, add the marker and variant to the graph as with MGI,
            # the allele is the variant locus.  IF the marker is not known,
            # we will call it a sequence alteration.  otherwise,
            # we will create a BNode for the sequence alteration.

            # extract out what's within the <> to get the symbol
            bracketed = re.match(r'.*<(.*)>', allele_symbol)
            if bracketed is not None:
                sequence_alteration_name = bracketed.group(1)
            else:
                sequence_alteration_name = allele_symbol

            if marker_accession_id == '':
                LOG.warning("Marker unspecified on row %d", reader.line_num)
                marker_accession_id = None

            if marker_accession_id is not None:
                variant_locus_id = allele_accession_id
                geno.addGene(
                    marker_accession_id, marker_symbol,
                    self.globaltt['gene'])
                geno.addAllele(
                    variant_locus_id, allele_symbol,
                    self.globaltt['variant_locus'], None)
                geno.addAlleleOfGene(variant_locus_id, marker_accession_id)

                # TAG bnode
                sequence_alteration_id = '_:seqalt' + re.sub(
                    r':', '', allele_accession_id)
                geno.addSequenceAlterationToVariantLocus(
                    sequence_alteration_id, variant_locus_id)
            else:
                sequence_alteration_id = allele_accession_id

            # IMPC contains targeted mutations with either gene traps,
            # knockouts, insertion/intragenic deletions.
            # but I don't really know what the SeqAlt is here,
            # so I don't add it.
            geno.addSequenceAlteration(
                sequence_alteration_id, sequence_alteration_name)

            # #############    BUILD THE COLONY    #############
            # First, let's describe the colony that the animals come from
            # The Colony ID refers to the ES cell clone
            #   used to generate a mouse strain.
            # Terry sez: we use this clone ID to track
            #   ES cell -> mouse strain -> mouse phenotyping.
            # The same ES clone maybe used at multiple centers,
            # so we have to concatenate the two to have a unique ID.
            # some useful reading about generating mice from ES cells:
            # http://ki.mit.edu/sbc/escell/services/details

            # here, we'll make a genotype
            # that derives from an ES cell with a given allele.
            # the strain is not really attached to the colony.

            # the colony/clone is reflective of the allele,
            # with unknown zygosity
            model.addIndividualToGraph(
                colony_id, colony_raw,
                self.globaltt['embryonic stem cell line'])

            # vslc of the colony has unknown zygosity
            # note that we will define the allele
            # (and it's relationship to the marker, etc.) later
            # FIXME is it really necessary to create this vslc
            # when we always know it's unknown zygosity?
            vslc_colony = '_:' + re.sub(
                r':', '',
                allele_accession_id + self.globaltt['indeterminate'])
            vslc_colony_label = allele_symbol + '/<?>'

            # for ease of reading, we make the colony genotype variables.
            # in the future, it might be desired to keep the vslcs
            colony_genotype_id = vslc_colony
            colony_genotype_label = vslc_colony_label
            geno.addGenotype(colony_genotype_id, colony_genotype_label)
            geno.addParts(
                allele_accession_id, colony_genotype_id, has_variant_part)
            geno.addPartsToVSLC(
                vslc_colony, allele_accession_id, None,
                self.globaltt['indeterminate'], has_variant_part)
            graph.addTriple(
                colony_id, self.globaltt['has_genotype'],
                colony_genotype_id)

            # ##########    BUILD THE ANNOTATED GENOTYPE    ##########
            # now, we'll build the genotype of the individual that derives
            # from the colony/clone genotype that is attached to
            # phenotype = colony_id + strain + zygosity + sex
            # (and is derived from a colony)

            # this is a sex-agnostic genotype
            genotype_id = self.make_id(
                colony_id + phenotyping_center + zygosity
                + strain_accession_id)
            geno.addSequenceDerivesFrom(genotype_id, colony_id)

            # build the VSLC of the sex-agnostic genotype
            # based on the zygosity
            allele1_id = allele_accession_id
            allele2_id = allele2_rel = None
            allele1_label = allele_symbol

            # Making VSLC labels from the various parts,
            # can change later if desired.
            if zygosity == 'heterozygote':
                allele2_label = re.sub(r'<.*', '<+>', allele1_label)
            elif zygosity == 'homozygote':
                allele2_label = allele1_label
                allele2_id = allele1_id
                allele2_rel = has_variant_part
            elif zygosity == 'hemizygote':
                allele2_label = re.sub(r'<.*', '<0>', allele1_label)
            elif zygosity == 'not_applicable':
                allele2_label = re.sub(r'<.*', '<?>', allele1_label)
            else:
                LOG.warning("found unknown zygosity %s", zygosity)
                # skip only this row; leaving the loop here would silently
                # discard every remaining row of the file
                continue
            vslc_name = '/'.join((allele1_label, allele2_label))

            # Add the VSLC
            # the marker is None when the row had no marker accession;
            # leave it out rather than let str.join() raise a TypeError
            vslc_id = '-'.join(
                part for part in (
                    marker_accession_id, allele_accession_id, zygosity)
                if part is not None)
            vslc_id = '_:' + re.sub(r':', '', vslc_id)
            model.addIndividualToGraph(
                vslc_id, vslc_name,
                self.globaltt['variant single locus complement'])
            geno.addPartsToVSLC(
                vslc_id, allele1_id, allele2_id, zygosity_id,
                has_variant_part, allele2_rel)

            # add vslc to genotype
            geno.addVSLCtoParent(vslc_id, genotype_id)

            # note that the vslc is also the gvc
            model.addType(
                vslc_id, self.globaltt['genomic_variation_complement'])

            # Add the genomic background
            # create the genomic background id and name
            if strain_accession_id != '':
                genomic_background_id = strain_accession_id
            else:
                genomic_background_id = None

            genotype_name = vslc_name
            if genomic_background_id is not None:
                geno.addGenotype(
                    genomic_background_id, strain_name,
                    self.globaltt['genomic_background'])

                # make a phenotyping-center-specific strain
                # to use as the background
                pheno_center_strain_label = '-'.join(
                    (strain_name, phenotyping_center, colony_raw))
                pheno_center_strain_id = '-'.join((
                    re.sub(r':', '', genomic_background_id),
                    re.sub(r'\s', '_', phenotyping_center),
                    re.sub(r'\W+', '', colony_raw)))
                if not re.match(r'^_', pheno_center_strain_id):
                    # Tag bnode
                    pheno_center_strain_id = '_:' + pheno_center_strain_id

                geno.addGenotype(
                    pheno_center_strain_id, pheno_center_strain_label,
                    self.globaltt['genomic_background'])
                geno.addSequenceDerivesFrom(
                    pheno_center_strain_id, genomic_background_id)

                # Making genotype labels from the various parts,
                # can change later if desired.
                # since the genotype is reflective of the place
                # it got made, should put that in to disambiguate
                genotype_name = \
                    genotype_name + ' [' + pheno_center_strain_label + ']'
                geno.addGenomicBackgroundToGenotype(
                    pheno_center_strain_id, genotype_id)
                geno.addTaxon(taxon_id, pheno_center_strain_id)

            # this is redundant, but i'll keep in in for now
            geno.addSequenceDerivesFrom(genotype_id, colony_id)
            geno.addGenotype(genotype_id, genotype_name)

            # Make the sex-qualified genotype,
            # which is what the phenotype is associated with
            sex_qualified_genotype_id = self.make_id(
                colony_id + phenotyping_center + zygosity
                + strain_accession_id + sex)
            sex_qualified_genotype_label = genotype_name + ' (' + sex + ')'

            sq_type_id = self.resolve(sex, False)
            if sq_type_id == sex:
                sq_type_id = self.globaltt['intrinsic_genotype']
                LOG.warning(
                    "Unknown sex qualifier %s, adding as intrinsic_genotype",
                    sex)

            geno.addGenotype(
                sex_qualified_genotype_id, sex_qualified_genotype_label,
                sq_type_id)
            geno.addParts(
                genotype_id, sex_qualified_genotype_id, has_variant_part)

            if genomic_background_id is not None:
                # Add the taxon to the genomic_background_id
                geno.addTaxon(taxon_id, genomic_background_id)
            else:
                # add it as the genomic background
                geno.addTaxon(taxon_id, genotype_id)

            # #############    BUILD THE G2P ASSOC    #############
            # from an old email dated July 23 2014:
            # Phenotypes associations are made to
            # imits colony_id+center+zygosity+gender

            # sometimes phenotype ids are missing.  (about 711 early 2020)
            if mp_term_id == '':
                LOG.warning(
                    "No phenotype id specified for row %d", reader.line_num)
                continue

            # hard coded ECO code
            eco_id = self.globaltt['mutant phenotype evidence']

            # the association comes as a result of a g2p from
            # a procedure in a pipeline at a center and parameter tested
            assoc = G2PAssoc(
                graph, self.name, sex_qualified_genotype_id, mp_term_id)
            assoc.add_evidence(eco_id)
            # assoc.set_score(float(p_value))

            # TODO add evidence instance using
            # pipeline_stable_id +
            # procedure_stable_id +
            # parameter_stable_id

            assoc.add_association_to_graph()
            assoc_id = assoc.get_association_id()

            model._addSexSpecificity(assoc_id, self.resolve(sex))

            # add a free-text description; if either number does not
            # parse as a float, both are written exactly as they came in
            try:
                effect_text = str(round(float(effect_size), 5))
                p_text = "{:.4e}".format(float(p_value))
            except ValueError:
                effect_text = str(effect_size)
                p_text = "{0}".format(p_value)

            description = ' '.join((
                mp_term_name, 'phenotype determined by', phenotyping_center,
                'in an', procedure_name, 'assay where', parameter_name,
                'was measured with an effect_size of', effect_text,
                '(p =', p_text, ').'))

            study_bnode = self._add_study_provenance(
                phenotyping_center, colony_raw, project_fullname,
                pipeline_name, pipeline_stable_id, procedure_stable_id,
                procedure_name, parameter_stable_id, parameter_name,
                statistical_method, resource_name)

            evidence_line_bnode = self._add_evidence(
                assoc_id, eco_id, p_value, percentage_change, effect_size,
                study_bnode)

            self._add_assertion_provenance(assoc_id, evidence_line_bnode)

            model.addDescription(evidence_line_bnode, description)

            # resource_id = resource_name
            # assoc.addSource(graph, assoc_id, resource_id)

            if not self.test_mode and limit is not None \
                    and reader.line_num > limit:
                break