def _parse_genepage2gene(self, limit) -> Dict[str, List[str]]: """ :return: """ src_key = 'genepage2gene' columns = self.files[src_key]['columns'] raw = '/'.join((self.rawdir, self.files[src_key]['file'])) geno = Genotype(self.graph) genepage2gene = {} LOG.info("Processing GenePage to Gene file") with open(raw, 'r', encoding="utf8") as csvfile: reader = csv.reader(csvfile, delimiter='\t') for row in reader: gene_page = row[columns.index('gene_page_id')] # gene_page_label = row[columns.index('gene_page_label')] tropicalis_id = row[columns.index('tropicalis_id')] tropicalis_label = row[columns.index('tropicalis_label')] laevis_l_id = row[columns.index('laevis_l_id')] laevis_l_label = row[columns.index('laevis_l_label')] laevis_s_id = row[columns.index('laevis_s_id')] laevis_s_label = row[columns.index('laevis_s_label')] tropicalis_curie = 'Xenbase:' + tropicalis_id laevis_l_curie = 'Xenbase:' + laevis_l_id laevis_s_curie = 'Xenbase:' + laevis_s_id genepage2gene[gene_page] = [tropicalis_curie, laevis_l_curie, laevis_s_curie] geno.addGene(tropicalis_curie, tropicalis_label) geno.addGene(laevis_l_curie, laevis_l_label) geno.addGene(laevis_s_curie, laevis_s_label) if not self.test_mode and limit is not None and reader.line_num > limit: break return genepage2gene
def _process_qtls_genetic_location( self, raw, txid, common_name, limit=None): """ This function processes Triples created: :param limit: :return: """ aql_curie = self.files[common_name + '_cm']['curie'] if self.test_mode: graph = self.testgraph else: graph = self.graph line_counter = 0 geno = Genotype(graph) model = Model(graph) eco_id = self.globaltt['quantitative trait analysis evidence'] taxon_curie = 'NCBITaxon:' + txid LOG.info("Processing genetic location for %s from %s", taxon_curie, raw) with open(raw, 'r', encoding="iso-8859-1") as csvfile: filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"') for row in filereader: line_counter += 1 (qtl_id, qtl_symbol, trait_name, assotype, empty, chromosome, position_cm, range_cm, flankmark_a2, flankmark_a1, peak_mark, flankmark_b1, flankmark_b2, exp_id, model_id, test_base, sig_level, lod_score, ls_mean, p_values, f_statistics, variance, bayes_value, likelihood_ratio, trait_id, dom_effect, add_effect, pubmed_id, gene_id, gene_id_src, gene_id_type, empty2) = row if self.test_mode and int(qtl_id) not in self.test_ids: continue qtl_id = common_name + 'QTL:' + qtl_id.strip() trait_id = ':'.join((aql_curie, trait_id.strip())) # Add QTL to graph feature = Feature(graph, qtl_id, qtl_symbol, self.globaltt['QTL']) feature.addTaxonToFeature(taxon_curie) # deal with the chromosome chrom_id = makeChromID(chromosome, taxon_curie, 'CHR') # add a version of the chromosome which is defined as # the genetic map build_id = 'MONARCH:'+common_name.strip()+'-linkage' build_label = common_name+' genetic map' geno.addReferenceGenome(build_id, build_label, taxon_curie) chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH') geno.addChromosomeInstance( chromosome, build_id, build_label, chrom_id) start = stop = None # range_cm sometimes ends in "(Mb)" (i.e pig 2016 Nov) range_mb = re.split(r'\(', range_cm) if range_mb is not None: range_cm = range_mb[0] if re.search(r'[0-9].*-.*[0-9]', range_cm): range_parts = re.split(r'-', range_cm) # check for poorly formed ranges if len(range_parts) == 2 and\ range_parts[0] != '' and range_parts[1] != '': (start, stop) = [ int(float(x.strip())) for x in re.split(r'-', range_cm)] else: LOG.info( "A cM range we can't handle for QTL %s: %s", qtl_id, range_cm) elif position_cm != '': match = re.match(r'([0-9]*\.[0-9]*)', position_cm) if match is not None: position_cm = match.group() start = stop = int(float(position_cm)) # FIXME remove converion to int for start/stop # when schema can handle floats add in the genetic location # based on the range feature.addFeatureStartLocation( start, chrom_in_build_id, None, [self.globaltt['FuzzyPosition']]) feature.addFeatureEndLocation( stop, chrom_in_build_id, None, [self.globaltt['FuzzyPosition']]) feature.addFeatureToGraph() # sometimes there's a peak marker, like a rsid. # we want to add that as a variant of the gene, # and xref it to the qtl. dbsnp_id = None if peak_mark != '' and peak_mark != '.' and \ re.match(r'rs', peak_mark.strip()): dbsnp_id = 'dbSNP:'+peak_mark.strip() model.addIndividualToGraph( dbsnp_id, None, self.globaltt['sequence_alteration']) model.addXref(qtl_id, dbsnp_id) gene_id = gene_id.replace('uncharacterized ', '').strip() if gene_id is not None and gene_id != '' and gene_id != '.'\ and re.fullmatch(r'[^ ]*', gene_id) is not None: # we assume if no src is provided and gene_id is an integer, # then it is an NCBI gene ... (okay, lets crank that back a notch) if gene_id_src == '' and gene_id.isdigit() and \ gene_id in self.gene_info: # LOG.info( # 'Warm & Fuzzy saying %s is a NCBI gene for %s', # gene_id, common_name) gene_id_src = 'NCBIgene' elif gene_id_src == '' and gene_id.isdigit(): LOG.warning( 'Cold & Prickely saying %s is a NCBI gene for %s', gene_id, common_name) gene_id_src = 'NCBIgene' elif gene_id_src == '': LOG.error( ' "%s" is a NOT NCBI gene for %s', gene_id, common_name) gene_id_src = None if gene_id_src == 'NCBIgene': gene_id = 'NCBIGene:' + gene_id # we will expect that these will get labels elsewhere geno.addGene(gene_id, None) # FIXME what is the right relationship here? geno.addAffectedLocus(qtl_id, gene_id) if dbsnp_id is not None: # add the rsid as a seq alt of the gene_id vl_id = '_:' + re.sub( r':', '', gene_id) + '-' + peak_mark.strip() geno.addSequenceAlterationToVariantLocus( dbsnp_id, vl_id) geno.addAffectedLocus(vl_id, gene_id) # add the trait model.addClassToGraph(trait_id, trait_name) # Add publication reference = None if re.match(r'ISU.*', pubmed_id): pub_id = 'AQTLPub:'+pubmed_id.strip() reference = Reference(graph, pub_id) elif pubmed_id != '': pub_id = 'PMID:' + pubmed_id.strip() reference = Reference( graph, pub_id, self.globaltt['journal article']) if reference is not None: reference.addRefToGraph() # make the association to the QTL assoc = G2PAssoc( graph, self.name, qtl_id, trait_id, self.globaltt['is marker for']) assoc.add_evidence(eco_id) assoc.add_source(pub_id) # create a description from the contents of the file # desc = '' # assoc.addDescription(g, assoc_id, desc) # TODO add exp_id as evidence # if exp_id != '': # exp_id = 'AQTLExp:'+exp_id # gu.addIndividualToGraph(g, exp_id, None, eco_id) if p_values != '': scr = re.sub(r'<', '', p_values) scr = re.sub(r',', '.', scr) # international notation if scr.isnumeric(): score = float(scr) assoc.set_score(score) # todo add score type # TODO add LOD score? assoc.add_association_to_graph() # make the association to the dbsnp_id, if found if dbsnp_id is not None: # make the association to the dbsnp_id assoc = G2PAssoc( graph, self.name, dbsnp_id, trait_id, self.globaltt['is marker for']) assoc.add_evidence(eco_id) assoc.add_source(pub_id) # create a description from the contents of the file # desc = '' # assoc.addDescription(g, assoc_id, desc) # TODO add exp_id # if exp_id != '': # exp_id = 'AQTLExp:'+exp_id # gu.addIndividualToGraph(g, exp_id, None, eco_id) if p_values != '': scr = re.sub(r'<', '', p_values) scr = re.sub(r',', '.', scr) if scr.isnumeric(): score = float(scr) assoc.set_score(score) # todo add score type # TODO add LOD score? assoc.add_association_to_graph() if not self.test_mode and limit is not None and line_counter > limit: break LOG.info("Done with QTL genetic info") return
def _process_qtls_genetic_location( self, raw, src_key, txid, common_name, limit=None): """ This function processes Triples created: :param limit: :return: """ aql_curie = self.files[src_key]['curie'] common_name = common_name.strip() if self.test_mode: graph = self.testgraph else: graph = self.graph geno = Genotype(graph) model = Model(graph) eco_id = self.globaltt['quantitative trait analysis evidence'] taxon_curie = 'NCBITaxon:' + txid LOG.info("Processing genetic location for %s from %s", taxon_curie, raw) with open(raw, 'r', encoding="iso-8859-1") as csvfile: reader = csv.reader(csvfile, delimiter='\t', quotechar='\"') # no header in these files, so no header checking col = self.files[src_key]['columns'] col_len = len(col) for row in reader: if len(row) != col_len and ''.join(row[col_len:]) != '': LOG.warning( "Problem parsing %s line %i containing: \n%s\n" "got %i cols but expected %i", raw, reader.line_num, row, len(row), col_len) # LOG.info(row) continue qtl_id = row[col.index('QTL_ID')].strip() qtl_symbol = row[col.index('QTL_symbol')].strip() trait_name = row[col.index('Trait_name')].strip() # assotype = row[col.index('assotype')].strip() chromosome = row[col.index('Chromosome')].strip() position_cm = row[col.index('Position_cm')].strip() range_cm = row[col.index('range_cm')].strip() # flankmark_a2 = row[col.index('FlankMark_A2')].strip() # flankmark_a1 = row[col.index('FlankMark_A1')].strip() peak_mark = row[col.index('Peak_Mark')].strip() # flankmark_b1 = row[col.index('FlankMark_B1')].strip() # flankmark_b2 = row[col.index('FlankMark_B2')].strip() # exp_id = row[col.index('Exp_ID')].strip() # model_id = row[col.index('Model')].strip() # test_base = row[col.index('testbase')].strip() # sig_level = row[col.index('siglevel')].strip() # lod_score = row[col.index('LOD_score')].strip() # ls_mean = row[col.index('LS_mean')].strip() p_values = row[col.index('P_values')].strip() # f_statistics = row[col.index('F_Statistics')].strip() # variance = row[col.index('VARIANCE')].strip() # bayes_value = row[col.index('Bayes_value')].strip() # likelihood_ratio = row[col.index('LikelihoodR')].strip() trait_id = row[col.index('TRAIT_ID')].strip() # dom_effect = row[col.index('Dom_effect')].strip() # add_effect = row[col.index('Add_effect')].strip() pubmed_id = row[col.index('PUBMED_ID')].strip() gene_id = row[col.index('geneID')].strip() gene_id_src = row[col.index('geneIDsrc')].strip() # gene_id_type = row[col.index('geneIDtype')].strip() if self.test_mode and int(qtl_id) not in self.test_ids: continue qtl_id = common_name + 'QTL:' + qtl_id.strip() trait_id = ':'.join((aql_curie, trait_id.strip())) # Add QTL to graph feature = Feature(graph, qtl_id, qtl_symbol, self.globaltt['QTL']) feature.addTaxonToFeature(taxon_curie) # deal with the chromosome chrom_id = makeChromID(chromosome, taxon_curie, 'CHR') # add a version of the chromosome which is defined as # the genetic map build_id = 'MONARCH:' + common_name + '-linkage' build_label = common_name + ' genetic map' geno.addReferenceGenome(build_id, build_label, taxon_curie) chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH') geno.addChromosomeInstance( chromosome, build_id, build_label, chrom_id) start = stop = None # range_cm sometimes ends in "(Mb)" (i.e pig 2016 Nov) range_mb = re.split(r'\(', range_cm) if range_mb is not None: range_cm = range_mb[0] if re.search(r'[0-9].*-.*[0-9]', range_cm): range_parts = re.split(r'-', range_cm) # check for poorly formed ranges if len(range_parts) == 2 and\ range_parts[0] != '' and range_parts[1] != '': (start, stop) = [ int(float(x.strip())) for x in re.split(r'-', range_cm)] else: LOG.info( "A cM range we can't handle for QTL %s: %s", qtl_id, range_cm) elif position_cm != '': match = re.match(r'([0-9]*\.[0-9]*)', position_cm) if match is not None: position_cm = match.group() start = stop = int(float(position_cm)) # FIXME remove converion to int for start/stop # when schema can handle floats add in the genetic location # based on the range feature.addFeatureStartLocation( start, chrom_in_build_id, None, [self.globaltt['FuzzyPosition']]) feature.addFeatureEndLocation( stop, chrom_in_build_id, None, [self.globaltt['FuzzyPosition']]) feature.addFeatureToGraph() # sometimes there's a peak marker, like a rsid. # we want to add that as a variant of the gene, # and xref it to the qtl. dbsnp_id = None if peak_mark != '' and peak_mark != '.' and \ re.match(r'rs', peak_mark.strip()): dbsnp_id = 'dbSNP:' + peak_mark.strip() model.addIndividualToGraph( dbsnp_id, None, self.globaltt['sequence_alteration']) model.addXref( qtl_id, dbsnp_id, xref_category=blv.terms['SequenceVariant']) gene_id = gene_id.replace('uncharacterized ', '').strip() gene_id = gene_id.strip(',') # for "100157483," in pig_QTLdata.txt if gene_id is not None and gene_id != '' and gene_id != '.'\ and re.fullmatch(r'[^ ]*', gene_id) is not None: # we assume if no src is provided and gene_id is an integer, # then it is an NCBI gene ... (okay, lets crank that back a notch) if gene_id_src == '' and gene_id.isdigit() and \ gene_id in self.gene_info: # LOG.info( # 'Warm & Fuzzy saying %s is a NCBI gene for %s', # gene_id, common_name) gene_id_src = 'NCBIgene' elif gene_id_src == '' and gene_id.isdigit(): LOG.warning( 'Cold & Prickely saying %s is a NCBI gene for %s', gene_id, common_name) gene_id_src = 'NCBIgene' elif gene_id_src == '': LOG.error( ' "%s" is a NOT NCBI gene for %s', gene_id, common_name) gene_id_src = None if gene_id_src == 'NCBIgene': gene_id = 'NCBIGene:' + gene_id # we will expect that these will get labels elsewhere geno.addGene(gene_id, None) # FIXME what is the right relationship here? geno.addAffectedLocus(qtl_id, gene_id) if dbsnp_id is not None: # add the rsid as a seq alt of the gene_id as a bnode vl_id = self.make_id(re.sub( r':', '', gene_id) + '-' + peak_mark.strip(), '_') geno.addSequenceAlterationToVariantLocus(dbsnp_id, vl_id) geno.addAffectedLocus(vl_id, gene_id) # add the trait model.addClassToGraph( trait_id, trait_name, class_category=blv.terms['PhenotypicFeature']) # Add publication reference = None if re.match(r'ISU.*', pubmed_id): pub_id = 'AQTLPub:' + pubmed_id.strip() reference = Reference(graph, pub_id) elif pubmed_id != '': pub_id = 'PMID:' + pubmed_id.strip() reference = Reference( graph, pub_id, self.globaltt['journal article']) if reference is not None: reference.addRefToGraph() # make the association to the QTL assoc = G2PAssoc( graph, self.name, qtl_id, trait_id, self.globaltt['is marker for']) assoc.add_evidence(eco_id) assoc.add_source(pub_id) # create a description from the contents of the file # desc = '' # assoc.addDescription(g, assoc_id, desc) # TODO add exp_id as evidence # if exp_id != '': # exp_id = 'AQTLExp:'+exp_id # gu.addIndividualToGraph(g, exp_id, None, eco_id) if p_values != '': scr = re.sub(r'<', '', p_values) scr = re.sub(r',', '.', scr) # international notation if scr.isnumeric(): score = float(scr) assoc.set_score(score) # todo add score type # TODO add LOD score? assoc.add_association_to_graph() # make the association to the dbsnp_id, if found if dbsnp_id is not None: # make the association to the dbsnp_id assoc = G2PAssoc( graph, self.name, dbsnp_id, trait_id, self.globaltt['is marker for']) assoc.add_evidence(eco_id) assoc.add_source(pub_id) # create a description from the contents of the file # desc = '' # assoc.addDescription(g, assoc_id, desc) # TODO add exp_id # if exp_id != '': # exp_id = 'AQTLExp:'+exp_id # gu.addIndividualToGraph(g, exp_id, None, eco_id) if p_values != '': scr = re.sub(r'<', '', p_values) scr = re.sub(r',', '.', scr) if scr.isnumeric(): score = float(scr) assoc.set_score(score) # todo add score type # TODO add LOD score? assoc.add_association_to_graph() # off by one - the following actually gives us (limit + 1) records if not self.test_mode and limit is not None and reader.line_num > limit: break LOG.info("Done with QTL genetic info")
def _parse_g2p_file(self, limit=None): """ Parse gene to XPO file, currently custom for Monarch :param limit: :return: """ src_key = 'g2p_assertions' geno = Genotype(self.graph) model = Model(self.graph) columns = self.files[src_key]['columns'] raw = '/'.join((self.rawdir, self.files[src_key]['file'])) LOG.info("Processing Gene to XPO associations") with open(raw, 'r', encoding="utf8") as csvfile: reader = csv.reader(csvfile) # File has headers row = next(reader) if not self.check_fileheader(columns, row): pass for row in reader: gene = row[columns.index('SUBJECT')] gene_label = row[columns.index('SUBJECT_LABEL')] gene_taxon = row[columns.index('SUBJECT_TAXON')] #gene_taxon_label = row[columns.index('SUBJECT_TAXON_LABEL')] phenotype_curie = row[columns.index('OBJECT')] #phenotype_label = row[columns.index('OBJECT_LABEL')] relation = row[columns.index('RELATION')] #relation_label = row[columns.index('RELATION_LABEL')] evidence = row[columns.index('EVIDENCE')] #evidence_label = row[columns.index('EVIDENCE_LABEL')] source = row[columns.index('SOURCE')] #is_defined_by = row[columns.index('IS_DEFINED_BY')] #qualifier = row[columns.index('QUALIFIER')] gene_curie = 'Xenbase:' + gene relation_curie = relation.replace('_', ':') geno.addGene(gene_curie, gene_label) geno.addTaxon(gene_taxon, gene_curie) assoc = G2PAssoc( self.graph, self.name, entity_id=gene_curie, phenotype_id=phenotype_curie, rel=relation_curie ) if evidence: assoc.add_evidence(evidence) if source: model.addType(source, self.globaltt['journal article']) assoc.add_source(source) assoc.add_association_to_graph() if not self.test_mode and limit is not None and reader.line_num > limit: break
def _process_allele_gene(self, limit): """ Make associations between an allele and a gene Adds triples to self.graph Approach is to use the label nomenclature and species map to determine taxon. Foreign Transgenes are filtered out. :param limit: number of rows to process :return: None """ geno = Genotype(self.graph) species_map = self._species_to_ncbi_tax() src_key = 'allele_gene' raw = '/'.join((self.rawdir, self.files[src_key]['file'])) LOG.info("processing allele to gene") col = self.files[src_key]['columns'] with gzip.open(raw, 'rt') as tsvfile: reader = csv.reader(tsvfile, delimiter='\t') # skip first line, version info next(reader) row = next(reader) # headers # header line starts with a hash and tab ?? row = row[1:] self.check_fileheader(col, row) for row in reader: allele_id = row[col.index('AlleleID')] allele_label = row[col.index('AlleleSymbol')] gene_id = row[col.index('GeneID')] gene_label = row[col.index('GeneSymbol')] allele_curie = 'FlyBase:' + allele_id gene_curie = 'FlyBase:' + gene_id # Add Allele and taxon, skip anything that's not drosophila allele_prefix = re.findall(r'^(\w*)\\', allele_label) if len(allele_prefix) == 1: try: if species_map[allele_prefix[0]][0] == 'drosophilid': geno.addAllele(allele_curie, allele_label) geno.addTaxon(species_map[allele_prefix[0]][1], allele_curie) else: # If it's a foreign transgenic allele, skip continue except KeyError: LOG.info("%s not in species prefix file", allele_prefix[0]) note = ''' list of unincluded species prefixes include: Aace,Afun,Agos,Ahyp,Amil,Aobl,Apim,Apol,Aque,Asam,AspBV3L6, Avin,Baen,Bant,Bcen,Bdor,Beme,Besp,Bger,Blan,Bovi,Brsp, Bsp240B1,Bsub,Btab,Bter,Bxb1,BYV,CABYV,Cbeta,Ccaj,Cdif, Cfum,Cgri,Cint,Clsp,Cmar,Cnoc,Cpip,Cprd,Cqui,Crub,Csal, CsIV,D6,Dano,Dcaa,Dcol,Dcub,Ddun,DENV,Dflo,Dful,Dmas,Dnep, Drad,Ecab,Efae,Egra,Epos,Equa,EspSC22,Fmer,Gfas,Gint,Gmax, Gmor,Gthe,gypsy,Harm,hobo,HPV18,Hpyl,Hsod,HspTP009,Htur, Hver,Isca,jockey,Klac,Kpne,Lcup,Ldis,Lhem,Lmal,Lmon,Lser, Mani,Mbre,Mosp,Mper,Mril,NDV,Nlug,Npha,Nvec,Nvit,Oari, Obic,Osat,Paer,Pchi,PCV,Penelope,Pgur,Phum,Pime,Pmat,Pshi, Pvin,PVX,Pxyl,Rfla,Rhsp,Rpal,Rsph,Shel,Slit,Soce,Spou, Spyo,Tadh,TBSV,TCV,TEV,Tgeo,Tgon,Tmer,TNPV,TspX513,Tthe, Vcon,Vdes,Vpar,VV,WSSV,Xvas,Zbai,Zbis,ZIKV,Zrou,ZYMV ''' continue elif not allele_prefix: geno.addAllele(allele_curie, allele_label) geno.addTaxon(self.globaltt['Drosophila melanogaster'], allele_curie) else: raise ValueError( "Did not correctly parse allele label {}".format( allele_label)) # Process genes gene_prefix = re.findall(r'^(\w*)\\', gene_label) if len(gene_prefix) == 1: try: geno.addTaxon(species_map[gene_prefix[0]][1], gene_curie) if species_map[gene_prefix[0]][0] == 'drosophilid': geno.addGene(gene_curie, gene_label) else: # Don't create labels for non drosophila genes geno.addGene(gene_curie) except KeyError: LOG.info("%s not in species prefix file", gene_prefix[0]) geno.addGene(gene_curie) elif not gene_prefix: geno.addGene(gene_curie, gene_label) geno.addTaxon(self.globaltt['Drosophila melanogaster'], allele_curie) else: raise ValueError( "Did not correct parse gene label {}".format( gene_label)) # Connect allele and gene with geno.addAffectedLocus() if allele_prefix and gene_prefix: if allele_prefix[0] == gene_prefix[0]: geno.addAffectedLocus(allele_curie, gene_curie) else: raise ValueError( "Found allele and gene with different " "prefixes: {}, {}".format(allele_id, gene_id)) elif not allele_prefix and gene_prefix: raise ValueError("Found allele and gene with different " "prefixes: {}, {}".format( allele_id, gene_id)) else: # Both are melanogaster geno.addAffectedLocus(allele_curie, gene_curie) if limit is not None and reader.line_num > limit: break
def _process_phenotype_data(self, limit): """ NOTE: If a Strain carries more than one mutation, then each Mutation description, i.e., the set: ( Mutation Type - Chromosome - Gene Symbol - Gene Name - Allele Symbol - Allele Name) will require a separate line. Note that MMRRC curates phenotypes to alleles, even though they distribute only one file with the phenotypes appearing to be associated with a strain. So, here we process the allele-to-phenotype relationships separately from the strain-to-allele relationships. :param limit: :return: """ src_key = 'catalog' if self.test_mode: graph = self.testgraph else: graph = self.graph model = Model(graph) fname = '/'.join((self.rawdir, self.files[src_key]['file'])) self.strain_hash = {} self.id_label_hash = {} genes_with_no_ids = set() stem_cell_class = self.globaltt['stem cell'] mouse_taxon = self.globaltt['Mus musculus'] geno = Genotype(graph) with open(fname, 'r', encoding="utf8") as csvfile: reader = csv.reader(csvfile, delimiter=',', quotechar='\"') # This MMRRC catalog data file was generated on YYYY-MM-DD # insert or check date w/dataset line = next(reader) # gen_date = line[-10:] line = next(reader) col = self.files['catalog']['columns'] if col != line: LOG.error( '%s\nExpected Headers:\t%s\nRecived Headers:\t%s\n', src_key, col, line) LOG.info(set(col) - set(line)) line = next(reader) if line != []: LOG.warning('Expected third line to be blank. got "%s" instead', line) for row in reader: strain_id = row[col.index('STRAIN/STOCK_ID')].strip() strain_label = row[col.index('STRAIN/STOCK_DESIGNATION')] # strain_type_symbol = row[col.index('STRAIN_TYPE')] strain_state = row[col.index('STATE')] mgi_allele_id = row[col.index('MGI_ALLELE_ACCESSION_ID')].strip() mgi_allele_symbol = row[col.index('ALLELE_SYMBOL')] # mgi_allele_name = row[col.index('ALLELE_NAME')] # mutation_type = row[col.index('MUTATION_TYPE')] # chrom = row[col.index('CHROMOSOME')] mgi_gene_id = row[col.index('MGI_GENE_ACCESSION_ID')].strip() mgi_gene_symbol = row[col.index('GENE_SYMBOL')].strip() mgi_gene_name = row[col.index('GENE_NAME')] # sds_url = row[col.index('SDS_URL')] # accepted_date = row[col.index('ACCEPTED_DATE')] mpt_ids = row[col.index('MPT_IDS')].strip() pubmed_nums = row[col.index('PUBMED_IDS')].strip() research_areas = row[col.index('RESEARCH_AREAS')].strip() if self.test_mode and (strain_id not in self.test_ids) \ or mgi_gene_name == 'withdrawn': continue # strip off stuff after the dash - # is the holding center important? # MMRRC:00001-UNC --> MMRRC:00001 strain_id = re.sub(r'-\w+$', '', strain_id) self.id_label_hash[strain_id] = strain_label # get the variant or gene to save for later building of # the genotype if strain_id not in self.strain_hash: self.strain_hash[strain_id] = { 'variants': set(), 'genes': set()} # flag bad ones if mgi_allele_id[:4] != 'MGI:' and mgi_allele_id != '': LOG.error("Erroneous MGI allele id: %s", mgi_allele_id) if mgi_allele_id[:3] == 'MG:': mgi_allele_id = 'MGI:' + mgi_allele_id[3:] else: mgi_allele_id = '' if mgi_allele_id != '': self.strain_hash[strain_id]['variants'].add(mgi_allele_id) self.id_label_hash[mgi_allele_id] = mgi_allele_symbol # use the following if needing to add the sequence alteration types # var_type = self.localtt[mutation_type] # make a sequence alteration for this variant locus, # and link the variation type to it # sa_id = '_'+re.sub(r':','',mgi_allele_id)+'SA' # if self.nobnodes: # sa_id = ':'+sa_id # gu.addIndividualToGraph(g, sa_id, None, var_type) # geno.addSequenceAlterationToVariantLocus(sa_id, mgi_allele_id) # scrub out any spaces, fix known issues mgi_gene_id = re.sub(r'\s+', '', mgi_gene_id) if mgi_gene_id == 'NULL': mgi_gene_id = '' elif mgi_gene_id[:7] == 'GeneID:': mgi_gene_id = 'NCBIGene:' + mgi_gene_id[7:] if mgi_gene_id != '': [curie, localid] = mgi_gene_id.split(':') if curie not in ['MGI', 'NCBIGene']: LOG.info("MGI Gene id not recognized: %s", mgi_gene_id) self.strain_hash[strain_id]['genes'].add(mgi_gene_id) self.id_label_hash[mgi_gene_id] = mgi_gene_symbol # catch some errors - too many. report summary at the end # some things have gene labels, but no identifiers - report if mgi_gene_symbol != '' and mgi_gene_id == '': # LOG.error( # "Gene label with no MGI identifier for strain %s: %s", # strain_id, mgi_gene_symbol) genes_with_no_ids.add(mgi_gene_symbol) # make a temp id for genes that aren't identified ... err wow. # tmp_gene_id = '_' + mgi_gene_symbol # self.id_label_hash[tmp_gene_id.strip()] = mgi_gene_symbol # self.strain_hash[strain_id]['genes'].add(tmp_gene_id) # split apart the mp ids # ataxia [MP:0001393] ,hypoactivity [MP:0001402] ... # mpt_ids are a comma delimited list # labels with MP terms following in brackets phenotype_ids = [] if mpt_ids != '': for lb_mp in mpt_ids.split(r','): lb_mp = lb_mp.strip() if lb_mp[-1:] == ']' and lb_mp[-12:-8] == '[MP:': phenotype_ids.append(lb_mp[-11:-2]) # pubmed ids are space delimited pubmed_ids = [] if pubmed_nums != '': for pm_num in re.split(r'\s+', pubmed_nums): pmid = 'PMID:' + pm_num.strip() pubmed_ids.append(pmid) ref = Reference(graph, pmid, self.globaltt['journal article']) ref.addRefToGraph() # https://www.mmrrc.org/catalog/sds.php?mmrrc_id=00001 # is a good example of 4 genotype parts model.addClassToGraph(mouse_taxon, None) if research_areas == '': research_areas = None else: research_areas = 'Research Areas: ' + research_areas strain_type = mouse_taxon if strain_state == 'ES': strain_type = stem_cell_class model.addIndividualToGraph( # an inst of mouse?? strain_id, strain_label, strain_type, research_areas) model.makeLeader(strain_id) # phenotypes are associated with the alleles for pid in phenotype_ids: # assume the phenotype label is in some ontology model.addClassToGraph(pid, None) if mgi_allele_id is not None and mgi_allele_id != '': assoc = G2PAssoc( graph, self.name, mgi_allele_id, pid, self.globaltt['has phenotype']) for p in pubmed_ids: assoc.add_source(p) assoc.add_association_to_graph() else: LOG.info("Phenotypes and no allele for %s", strain_id) if not self.test_mode and ( limit is not None and reader.line_num > limit): break # now that we've collected all of the variant information, build it # we don't know their zygosities for s in self.strain_hash: h = self.strain_hash.get(s) variants = h['variants'] genes = h['genes'] vl_set = set() # make variant loci for each gene if len(variants) > 0: for var in variants: vl_id = var.strip() vl_symbol = self.id_label_hash[vl_id] geno.addAllele( vl_id, vl_symbol, self.globaltt['variant_locus']) vl_set.add(vl_id) if len(variants) == 1 and len(genes) == 1: for gene in genes: geno.addAlleleOfGene(vl_id, gene) else: geno.addAllele(vl_id, vl_symbol) else: # len(vars) == 0 # it's just anonymous variants in some gene for gene in genes: vl_id = '_:' + re.sub(r':', '', gene) + '-VL' vl_symbol = self.id_label_hash[gene]+'<?>' self.id_label_hash[vl_id] = vl_symbol geno.addAllele( vl_id, vl_symbol, self.globaltt['variant_locus']) geno.addGene(gene, self.id_label_hash[gene]) geno.addAlleleOfGene(vl_id, gene) vl_set.add(vl_id) # make the vslcs vl_list = sorted(vl_set) vslc_list = [] for vl in vl_list: # for unknown zygosity vslc_id = re.sub(r'^_', '', vl)+'U' vslc_id = re.sub(r':', '', vslc_id) vslc_id = '_:' + vslc_id vslc_label = self.id_label_hash[vl] + '/?' self.id_label_hash[vslc_id] = vslc_label vslc_list.append(vslc_id) geno.addPartsToVSLC( vslc_id, vl, None, self.globaltt['indeterminate'], self.globaltt['has_variant_part'], None) model.addIndividualToGraph( vslc_id, vslc_label, self.globaltt['variant single locus complement']) if len(vslc_list) > 0: if len(vslc_list) > 1: gvc_id = '-'.join(vslc_list) gvc_id = re.sub(r'_|:', '', gvc_id) gvc_id = '_:'+gvc_id gvc_label = '; '.join(self.id_label_hash[v] for v in vslc_list) model.addIndividualToGraph( gvc_id, gvc_label, self.globaltt['genomic_variation_complement']) for vslc_id in vslc_list: geno.addVSLCtoParent(vslc_id, gvc_id) else: # the GVC == VSLC, so don't have to make an extra piece gvc_id = vslc_list.pop() gvc_label = self.id_label_hash[gvc_id] genotype_label = gvc_label + ' [n.s.]' bkgd_id = re.sub( r':', '', '-'.join(( self.globaltt['unspecified_genomic_background'], s))) genotype_id = '-'.join((gvc_id, bkgd_id)) bkgd_id = '_:' + bkgd_id geno.addTaxon(mouse_taxon, bkgd_id) geno.addGenomicBackground( bkgd_id, 'unspecified (' + s + ')', self.globaltt['unspecified_genomic_background'], "A placeholder for the unspecified genetic background for " + s) geno.addGenomicBackgroundToGenotype( bkgd_id, genotype_id, self.globaltt['unspecified_genomic_background']) geno.addParts( gvc_id, genotype_id, self.globaltt['has_variant_part']) geno.addGenotype(genotype_id, genotype_label) graph.addTriple( s, self.globaltt['has_genotype'], genotype_id) else: # LOG.debug( # "Strain %s is not making a proper genotype.", s) pass LOG.warning( "The following gene symbols did not list identifiers: %s", str(sorted(list(genes_with_no_ids)))) LOG.error( '%i symbols given are missing their gene identifiers', len(genes_with_no_ids)) return
def _process_data(self, raw, limit=None): logger.info("Processing Data from %s", raw) gu = GraphUtils(curie_map.get()) if self.testMode: g = self.testgraph else: g = self.graph geno = Genotype(g) line_counter = 0 gu.loadAllProperties(g) gu.loadObjectProperties(g, geno.object_properties) # Add the taxon as a class taxon_id = 'NCBITaxon:10090' # map to Mus musculus gu.addClassToGraph(g, taxon_id, None) # with open(raw, 'r', encoding="utf8") as csvfile: with gzip.open(raw, 'rt') as csvfile: filereader = csv.reader(csvfile, delimiter=',', quotechar='\"') next(filereader, None) # skip the header row for row in filereader: line_counter += 1 (marker_accession_id, marker_symbol, phenotyping_center, colony, sex, zygosity, allele_accession_id, allele_symbol, allele_name, strain_accession_id, strain_name, project_name, project_fullname, pipeline_name, pipeline_stable_id, procedure_stable_id, procedure_name, parameter_stable_id, parameter_name, top_level_mp_term_id, top_level_mp_term_name, mp_term_id, mp_term_name, p_value, percentage_change, effect_size, statistical_method, resource_name) = row if self.testMode and marker_accession_id not in self.test_ids: continue # ##### cleanup some of the identifiers ###### zygosity_id = self._map_zygosity(zygosity) # colony ids sometimes have <> in them, spaces, # or other non-alphanumerics and break our system; # replace these with underscores colony_id = '_'+re.sub(r'\W+', '_', colony) if self.nobnodes: colony_id = ':'+colony_id if not re.match(r'MGI', allele_accession_id): allele_accession_id = \ '_IMPC-'+re.sub(r':', '', allele_accession_id) if self.nobnodes: allele_accession_id = ':'+allele_accession_id if re.search(r'EUROCURATE', strain_accession_id): # the eurocurate links don't resolve at IMPC strain_accession_id = '_'+strain_accession_id if self.nobnodes: strain_accession_id = ':'+strain_accession_id elif not re.match(r'MGI', strain_accession_id): logger.info( "Found a strange strain accession...%s", strain_accession_id) strain_accession_id = 'IMPC:'+strain_accession_id ###################### # first, add the marker and variant to the graph as with MGI, # the allele is the variant locus. IF the marker is not known, # we will call it a sequence alteration. otherwise, # we will create a BNode for the sequence alteration. sequence_alteration_id = variant_locus_id = None variant_locus_name = sequence_alteration_name = None # extract out what's within the <> to get the symbol if re.match(r'.*<.*>', allele_symbol): sequence_alteration_name = \ re.match(r'.*<(.*)>', allele_symbol).group(1) else: sequence_alteration_name = allele_symbol if marker_accession_id is not None and \ marker_accession_id == '': logger.warning( "Marker unspecified on row %d", line_counter) marker_accession_id = None if marker_accession_id is not None: variant_locus_id = allele_accession_id variant_locus_name = allele_symbol variant_locus_type = geno.genoparts['variant_locus'] geno.addGene(marker_accession_id, marker_symbol, geno.genoparts['gene']) geno.addAllele(variant_locus_id, variant_locus_name, variant_locus_type, None) geno.addAlleleOfGene(variant_locus_id, marker_accession_id) sequence_alteration_id = \ '_seqalt'+re.sub(r':', '', allele_accession_id) if self.nobnodes: sequence_alteration_id = ':'+sequence_alteration_id geno.addSequenceAlterationToVariantLocus( sequence_alteration_id, variant_locus_id) else: sequence_alteration_id = allele_accession_id # IMPC contains targeted mutations with either gene traps, # knockouts, insertion/intragenic deletions. # but I don't really know what the SeqAlt is here, # so I don't add it. geno.addSequenceAlteration(sequence_alteration_id, sequence_alteration_name) # ############# BUILD THE COLONY ############# # First, let's describe the colony that the animals come from # The Colony ID refers to the ES cell clone # used to generate a mouse strain. # Terry sez: we use this clone ID to track # ES cell -> mouse strain -> mouse phenotyping. # The same ES clone maybe used at multiple centers, # so we have to concatenate the two to have a unique ID. # some useful reading about generating mice from ES cells: # http://ki.mit.edu/sbc/escell/services/details # here, we'll make a genotype # that derives from an ES cell with a given allele. # the strain is not really attached to the colony. # the colony/clone is reflective of the allele, # with unknown zygosity stem_cell_class = 'ERO:0002002' gu.addIndividualToGraph(g, colony_id, colony, stem_cell_class) # vslc of the colony has unknown zygosity # note that we will define the allele # (and it's relationship to the marker, etc.) later # FIXME is it really necessary to create this vslc # when we always know it's unknown zygosity? vslc_colony = \ '_'+allele_accession_id+geno.zygosity['indeterminate'] vslc_colony = re.sub(r':', '', vslc_colony) if self.nobnodes: vslc_colony = ':'+vslc_colony vslc_colony_label = allele_symbol+'/<?>' # for ease of reading, we make the colony genotype variables. # in the future, it might be desired to keep the vslcs colony_genotype_id = vslc_colony colony_genotype_label = vslc_colony_label geno.addGenotype(colony_genotype_id, colony_genotype_label) geno.addParts(allele_accession_id, colony_genotype_id, geno.object_properties['has_alternate_part']) geno.addPartsToVSLC( vslc_colony, allele_accession_id, None, geno.zygosity['indeterminate'], geno.object_properties['has_alternate_part']) gu.addTriple( g, colony_id, geno.object_properties['has_genotype'], colony_genotype_id) # ########## BUILD THE ANNOTATED GENOTYPE ########## # now, we'll build the genotype of the individual that derives # from the colony/clone genotype that is attached to # phenotype = colony_id + strain + zygosity + sex # (and is derived from a colony) # this is a sex-agnostic genotype genotype_id = \ self.make_id( (colony_id + phenotyping_center + zygosity + strain_accession_id)) geno.addSequenceDerivesFrom(genotype_id, colony_id) # build the VSLC of the sex-agnostic genotype # based on the zygosity allele1_id = allele_accession_id allele2_id = allele2_rel = None allele1_label = allele_symbol allele2_label = '<?>' # Making VSLC labels from the various parts, # can change later if desired. if zygosity == 'heterozygote': allele2_label = re.sub(r'<.*', '<+>', allele1_label) allele2_id = None elif zygosity == 'homozygote': allele2_label = allele1_label allele2_id = allele1_id allele2_rel = geno.object_properties['has_alternate_part'] elif zygosity == 'hemizygote': allele2_label = re.sub(r'<.*', '<0>', allele1_label) allele2_id = None elif zygosity == 'not_applicable': allele2_label = re.sub(r'<.*', '<?>', allele1_label) allele2_id = None else: logger.warning("found unknown zygosity %s", zygosity) break vslc_name = '/'.join((allele1_label, allele2_label)) # Add the VSLC vslc_id = '_' + '-'.join((marker_accession_id, allele_accession_id, zygosity)) vslc_id = re.sub(r':', '', vslc_id) if self.nobnodes: vslc_id = ':'+vslc_id gu.addIndividualToGraph( g, vslc_id, vslc_name, geno.genoparts['variant_single_locus_complement']) geno.addPartsToVSLC( vslc_id, allele1_id, allele2_id, zygosity_id, geno.object_properties['has_alternate_part'], allele2_rel) # add vslc to genotype geno.addVSLCtoParent(vslc_id, genotype_id) # note that the vslc is also the gvc gu.addType( g, vslc_id, Genotype.genoparts['genomic_variation_complement']) # Add the genomic background # create the genomic background id and name if strain_accession_id != '': genomic_background_id = strain_accession_id else: genomic_background_id = None genotype_name = vslc_name if genomic_background_id is not None: geno.addGenotype( genomic_background_id, strain_name, geno.genoparts['genomic_background']) # make a phenotyping-center-specific strain # to use as the background pheno_center_strain_label = \ strain_name + '/' + phenotyping_center pheno_center_strain_id = \ '-'.join((re.sub(r':', '', genomic_background_id), re.sub(r'\s', '_', phenotyping_center))) if not re.match(r'^_', pheno_center_strain_id): pheno_center_strain_id = '_'+pheno_center_strain_id if self.nobnodes: pheno_center_strain_id = ':'+pheno_center_strain_id geno.addGenotype(pheno_center_strain_id, pheno_center_strain_label, geno.genoparts['genomic_background']) geno.addSequenceDerivesFrom(pheno_center_strain_id, genomic_background_id) # Making genotype labels from the various parts, # can change later if desired. # since the genotype is reflective of the place # it got made, should put that in to disambiguate genotype_name = \ genotype_name+' ['+pheno_center_strain_label+']' geno.addGenomicBackgroundToGenotype( pheno_center_strain_id, genotype_id) geno.addTaxon(pheno_center_strain_id, taxon_id) # this is redundant, but i'll keep in in for now geno.addSequenceDerivesFrom(genotype_id, colony_id) genotype_name += '['+colony+']' geno.addGenotype(genotype_id, genotype_name) # Make the sex-qualified genotype, # which is what the phenotype is associated with sex_qualified_genotype_id = \ self.make_id( (colony_id + phenotyping_center + zygosity + strain_accession_id+sex)) sex_qualified_genotype_label = genotype_name+' ('+sex+')' if sex == 'male': sq_type_id = geno.genoparts['male_genotype'] elif sex == 'female': sq_type_id = geno.genoparts['female_genotype'] else: sq_type_id = geno.genoparts['sex_qualified_genotype'] geno.addGenotype( sex_qualified_genotype_id, sex_qualified_genotype_label, sq_type_id) geno.addParts( genotype_id, sex_qualified_genotype_id, geno.object_properties['has_alternate_part']) if genomic_background_id is not None and \ genomic_background_id != '': # Add the taxon to the genomic_background_id geno.addTaxon(taxon_id, genomic_background_id) else: # add it as the genomic background geno.addTaxon(taxon_id, genotype_id) # ############# BUILD THE G2P ASSOC ############# # from an old email dated July 23 2014: # Phenotypes associations are made to # imits colony_id+center+zygosity+gender phenotype_id = mp_term_id # it seems that sometimes phenotype ids are missing. # indicate here if phenotype_id is None or phenotype_id == '': logger.warning( "No phenotype id specified for row %d: %s", line_counter, str(row)) continue # experimental_phenotypic_evidence This was used in ZFIN eco_id = "ECO:0000059" # the association comes as a result of a g2p from # a procedure in a pipeline at a center and parameter tested assoc = G2PAssoc(self.name, sex_qualified_genotype_id, phenotype_id) assoc.add_evidence(eco_id) # assoc.set_score(float(p_value)) # TODO add evidence instance using # pipeline_stable_id + # procedure_stable_id + # parameter_stable_id assoc.add_association_to_graph(g) assoc_id = assoc.get_association_id() # add a free-text description description = \ ' '.join((mp_term_name, 'phenotype determined by', phenotyping_center, 'in an', procedure_name, 'assay where', parameter_name.strip(), 'was measured with an effect_size of', str(round(float(effect_size), 5)), '(p =', "{:.4e}".format(float(p_value)), ').')) gu.addDescription(g, assoc_id, description) # TODO add provenance information # resource_id = resource_name # assoc.addSource(g, assoc_id, resource_id) if not self.testMode and \ limit is not None and line_counter > limit: break gu.loadProperties(g, G2PAssoc.object_properties, gu.OBJPROP) gu.loadProperties(g, G2PAssoc.annotation_properties, gu.ANNOTPROP) gu.loadProperties(g, G2PAssoc.datatype_properties, gu.DATAPROP) return
def _process_omim2gene(self, limit=None): """ This method maps the OMIM IDs and KEGG gene ID. Currently split based on the link_type field. Equivalent link types are mapped as gene XRefs. Reverse link types are mapped as disease to gene associations. Original link types are currently skipped. Triples created: <kegg_gene_id> is a Gene <omim_gene_id> is a Gene <kegg_gene_id>> hasXref <omim_gene_id> <assoc_id> has subject <omim_disease_id> <assoc_id> has object <kegg_gene_id> :param limit: :return: """ LOG.info("Processing OMIM to KEGG gene") if self.test_mode: graph = self.testgraph else: graph = self.graph model = Model(graph) line_counter = 0 geno = Genotype(graph) raw = '/'.join((self.rawdir, self.files['omim2gene']['file'])) with open(raw, 'r', encoding="iso-8859-1") as csvfile: filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"') for row in filereader: line_counter += 1 (kegg_gene_id, omim_id, link_type) = row if self.test_mode and kegg_gene_id not in self.test_ids['genes']: continue kegg_gene_id = 'KEGG-' + kegg_gene_id.strip() omim_id = re.sub(r'omim', 'OMIM', omim_id) if link_type == 'equivalent': # these are genes! # so add them as a class then make equivalence model.addClassToGraph(omim_id, None) geno.addGene(kegg_gene_id, None) if not DipperUtil.is_omim_disease(omim_id): model.addEquivalentClass(kegg_gene_id, omim_id) elif link_type == 'reverse': # make an association between an OMIM ID & the KEGG gene ID # we do this with omim ids because # they are more atomic than KEGG ids alt_locus_id = self._make_variant_locus_id(kegg_gene_id, omim_id) alt_label = self.label_hash[alt_locus_id] model.addIndividualToGraph( alt_locus_id, alt_label, self.globaltt['variant_locus']) geno.addAffectedLocus(alt_locus_id, kegg_gene_id) model.addBlankNodeAnnotation(alt_locus_id) # Add the disease to gene relationship. rel = self.globaltt['is marker for'] assoc = G2PAssoc(graph, self.name, alt_locus_id, omim_id, rel) assoc.add_association_to_graph() elif link_type == 'original': # these are sometimes a gene, and sometimes a disease LOG.info( 'Unable to handle original link for %s-%s', kegg_gene_id, omim_id) else: # don't know what these are LOG.warning( 'Unhandled link type for %s-%s: %s', kegg_gene_id, omim_id, link_type) if (not self.test_mode) and ( limit is not None and line_counter > limit): break LOG.info("Done with OMIM to KEGG gene") return
def _process_phenotype_data(self, limit): """ NOTE: If a Strain carries more than one mutation, then each Mutation description, i.e., the set: ( Mutation Type - Chromosome - Gene Symbol - Gene Name - Allele Symbol - Allele Name) will require a separate line. Note that MMRRC curates phenotypes to alleles, even though they distribute only one file with the phenotypes appearing to be associated with a strain. So, here we process the allele-to-phenotype relationships separately from the strain-to-allele relationships. :param limit: :return: """ if self.testMode: g = self.testgraph else: g = self.graph model = Model(g) line_counter = 0 fname = '/'.join((self.rawdir, self.files['catalog']['file'])) self.strain_hash = {} self.id_label_hash = {} genes_with_no_ids = set() stem_cell_class = 'CL:0000034' mouse_taxon = 'NCBITaxon:10090' geno = Genotype(g) with open(fname, 'r', encoding="utf8") as csvfile: filereader = csv.reader(csvfile, delimiter=',', quotechar='\"') for row in filereader: line_counter += 1 # skip the first 3 lines which are header, etc. if line_counter < 4: continue (strain_id, strain_label, strain_type_symbol, strain_state, mgi_allele_id, mgi_allele_symbol, mgi_allele_name, mutation_type, chrom, mgi_gene_id, mgi_gene_symbol, mgi_gene_name, sds_url, accepted_date, mp_ids, pubmed_nums, research_areas) = row if self.testMode and (strain_id not in self.test_ids) \ or mgi_gene_name == 'withdrawn': continue # strip off stuff after the dash - # is the holding center important? # MMRRC:00001-UNC --> MMRRC:00001 strain_id = re.sub(r'-\w+$', '', strain_id) self.id_label_hash[strain_id] = strain_label # get the variant or gene to save for later building of # the genotype if strain_id not in self.strain_hash: self.strain_hash[strain_id] = { 'variants': set(), 'genes': set() } # clean up the bad one if mgi_allele_id == 'multiple mutation': logger.error("Erroneous gene id: %s", mgi_allele_id) mgi_allele_id = '' if mgi_allele_id != '': self.strain_hash[strain_id]['variants'].add(mgi_allele_id) self.id_label_hash[mgi_allele_id] = mgi_allele_symbol # use the following if needing to add the # sequence alteration types # var_type = # self._get_variant_type_from_abbrev(mutation_type) # make a sequence alteration for this variant locus, # and link the variation type to it # sa_id = '_'+re.sub(r':','',mgi_allele_id)+'SA' # if self.nobnodes: # sa_id = ':'+sa_id # gu.addIndividualToGraph(g, sa_id, None, var_type) # geno.addSequenceAlterationToVariantLocus(sa_id, # mgi_allele_id) # scrub out any spaces mgi_gene_id = re.sub(r'\s+', '', mgi_gene_id) if mgi_gene_id.strip() != '': if re.match(r'Gene\s*ID:', mgi_gene_id, re.I): mgi_gene_id = re.sub(r'Gene\s*ID:\s*', 'NCBIGene:', mgi_gene_id) elif not re.match(r'MGI', mgi_gene_id): logger.info("Gene id not recognized: %s", mgi_gene_id) if re.match(r'\d+$', mgi_gene_id): # assume that if it's all numbers, then it's MGI mgi_gene_id = 'MGI:' + str(mgi_gene_id) logger.info("Assuming numerics are MGI.") self.strain_hash[strain_id]['genes'].add(mgi_gene_id) self.id_label_hash[mgi_gene_id] = mgi_gene_symbol # catch some errors - # some things have gene labels, but no identifiers - report if mgi_gene_symbol.strip() != '' and mgi_gene_id == '': logger.error( "Gene label with no identifier for strain %s: %s", strain_id, mgi_gene_symbol) genes_with_no_ids.add(mgi_gene_symbol.strip()) # make a temp id for genes that aren't identified # tmp_gene_id = '_'+mgi_gene_symbol # self.id_label_hash[tmp_gene_id] = mgi_gene_symbol # self.strain_hash[strain_id]['genes'].add(tmp_gene_id) # split apart the mp ids # ataxia [MP:0001393] ,hypoactivity [MP:0001402] ... # mp_ids are now a comma delimited list # with MP terms in brackets phenotype_ids = [] if mp_ids != '': for i in re.split(r',', mp_ids): i = i.strip() mps = re.search(r'\[(.*)\]', i) if mps is not None: mp_id = mps.group(1).strip() phenotype_ids.append(mp_id) # pubmed ids are space delimited pubmed_ids = [] if pubmed_nums.strip() != '': for i in re.split(r'\s+', pubmed_nums): pmid = 'PMID:' + i.strip() pubmed_ids.append(pmid) r = Reference(g, pmid, Reference.ref_types['journal_article']) r.addRefToGraph() # https://www.mmrrc.org/catalog/sds.php?mmrrc_id=00001 # is a good example of 4 genotype parts model.addClassToGraph(mouse_taxon, None) if research_areas.strip() == '': research_areas = None else: research_areas = 'Research Areas: ' + research_areas strain_type = mouse_taxon if strain_state == 'ES': strain_type = stem_cell_class model.addIndividualToGraph( strain_id, strain_label, strain_type, research_areas) # an inst of mouse?? model.makeLeader(strain_id) # phenotypes are associated with the alleles for pid in phenotype_ids: # assume the phenotype label is in the ontology model.addClassToGraph(pid, None) if mgi_allele_id is not None and mgi_allele_id != '': assoc = G2PAssoc( g, self.name, mgi_allele_id, pid, model.object_properties['has_phenotype']) for p in pubmed_ids: assoc.add_source(p) assoc.add_association_to_graph() else: logger.info("Phenotypes and no allele for %s", strain_id) if not self.testMode and (limit is not None and line_counter > limit): break # now that we've collected all of the variant information, build it # we don't know their zygosities for s in self.strain_hash: h = self.strain_hash.get(s) variants = h['variants'] genes = h['genes'] vl_set = set() # make variant loci for each gene if len(variants) > 0: for v in variants: vl_id = v vl_symbol = self.id_label_hash[vl_id] geno.addAllele(vl_id, vl_symbol, geno.genoparts['variant_locus']) vl_set.add(vl_id) if len(variants) == 1 and len(genes) == 1: for gene in genes: geno.addAlleleOfGene(vl_id, gene) else: geno.addAllele(vl_id, vl_symbol) else: # len(vars) == 0 # it's just anonymous variants in some gene for gene in genes: vl_id = '_:' + re.sub(r':', '', gene) + '-VL' vl_symbol = self.id_label_hash[gene] + '<?>' self.id_label_hash[vl_id] = vl_symbol geno.addAllele(vl_id, vl_symbol, geno.genoparts['variant_locus']) geno.addGene(gene, self.id_label_hash[gene]) geno.addAlleleOfGene(vl_id, gene) vl_set.add(vl_id) # make the vslcs vl_list = sorted(vl_set) vslc_list = [] for vl in vl_list: # for unknown zygosity vslc_id = re.sub(r'^_', '', vl) + 'U' vslc_id = re.sub(r':', '', vslc_id) vslc_id = '_:' + vslc_id vslc_label = self.id_label_hash[vl] + '/?' self.id_label_hash[vslc_id] = vslc_label vslc_list.append(vslc_id) geno.addPartsToVSLC( vslc_id, vl, None, geno.zygosity['indeterminate'], geno.object_properties['has_alternate_part'], None) model.addIndividualToGraph( vslc_id, vslc_label, geno.genoparts['variant_single_locus_complement']) if len(vslc_list) > 0: if len(vslc_list) > 1: gvc_id = '-'.join(vslc_list) gvc_id = re.sub(r'_|:', '', gvc_id) gvc_id = '_:' + gvc_id gvc_label = \ '; '.join(self.id_label_hash[v] for v in vslc_list) model.addIndividualToGraph( gvc_id, gvc_label, geno.genoparts['genomic_variation_complement']) for vslc_id in vslc_list: geno.addVSLCtoParent(vslc_id, gvc_id) else: # the GVC == VSLC, so don't have to make an extra piece gvc_id = vslc_list.pop() gvc_label = self.id_label_hash[gvc_id] genotype_label = gvc_label + ' [n.s.]' bkgd_id = \ re.sub(r':', '', '-'.join( (geno.genoparts['unspecified_genomic_background'], s))) genotype_id = '-'.join((gvc_id, bkgd_id)) bkgd_id = '_:' + bkgd_id geno.addTaxon(mouse_taxon, bkgd_id) geno.addGenomicBackground( bkgd_id, 'unspecified (' + s + ')', geno.genoparts['unspecified_genomic_background'], "A placeholder for the " + "unspecified genetic background for " + s) geno.addGenomicBackgroundToGenotype( bkgd_id, genotype_id, geno.genoparts['unspecified_genomic_background']) geno.addParts(gvc_id, genotype_id, geno.object_properties['has_alternate_part']) geno.addGenotype(genotype_id, genotype_label) g.addTriple(s, geno.object_properties['has_genotype'], genotype_id) else: # logger.debug( # "Strain %s is not making a proper genotype.", s) pass logger.warning( "The following gene symbols did not list identifiers: %s", str(sorted(list(genes_with_no_ids)))) return
def _process_qtls_genetic_location( self, raw, txid, common_name, limit=None): """ This function processes Triples created: :param limit: :return: """ if self.testMode: graph = self.testgraph else: graph = self.graph line_counter = 0 geno = Genotype(graph) model = Model(graph) eco_id = self.globaltt['quantitative trait analysis evidence'] taxon_curie = 'NCBITaxon:' + txid LOG.info("Processing genetic location for %s from %s", taxon_curie, raw) with open(raw, 'r', encoding="iso-8859-1") as csvfile: filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"') for row in filereader: line_counter += 1 (qtl_id, qtl_symbol, trait_name, assotype, empty, chromosome, position_cm, range_cm, flankmark_a2, flankmark_a1, peak_mark, flankmark_b1, flankmark_b2, exp_id, model_id, test_base, sig_level, lod_score, ls_mean, p_values, f_statistics, variance, bayes_value, likelihood_ratio, trait_id, dom_effect, add_effect, pubmed_id, gene_id, gene_id_src, gene_id_type, empty2) = row if self.testMode and int(qtl_id) not in self.test_ids: continue qtl_id = common_name + 'QTL:' + qtl_id.strip() trait_id = 'AQTLTrait:' + trait_id.strip() # Add QTL to graph feature = Feature(graph, qtl_id, qtl_symbol, self.globaltt['QTL']) feature.addTaxonToFeature(taxon_curie) # deal with the chromosome chrom_id = makeChromID(chromosome, taxon_curie, 'CHR') # add a version of the chromosome which is defined as # the genetic map build_id = 'MONARCH:'+common_name.strip()+'-linkage' build_label = common_name+' genetic map' geno.addReferenceGenome(build_id, build_label, taxon_curie) chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH') geno.addChromosomeInstance( chromosome, build_id, build_label, chrom_id) start = stop = None # range_cm sometimes ends in "(Mb)" (i.e pig 2016 Nov) range_mb = re.split(r'\(', range_cm) if range_mb is not None: range_cm = range_mb[0] if re.search(r'[0-9].*-.*[0-9]', range_cm): range_parts = re.split(r'-', range_cm) # check for poorly formed ranges if len(range_parts) == 2 and\ range_parts[0] != '' and range_parts[1] != '': (start, stop) = [ int(float(x.strip())) for x in re.split(r'-', range_cm)] else: LOG.info( "A cM range we can't handle for QTL %s: %s", qtl_id, range_cm) elif position_cm != '': match = re.match(r'([0-9]*\.[0-9]*)', position_cm) if match is not None: position_cm = match.group() start = stop = int(float(position_cm)) # FIXME remove converion to int for start/stop # when schema can handle floats add in the genetic location # based on the range feature.addFeatureStartLocation( start, chrom_in_build_id, None, [self.globaltt['FuzzyPosition']]) feature.addFeatureEndLocation( stop, chrom_in_build_id, None, [self.globaltt['FuzzyPosition']]) feature.addFeatureToGraph() # sometimes there's a peak marker, like a rsid. # we want to add that as a variant of the gene, # and xref it to the qtl. dbsnp_id = None if peak_mark != '' and peak_mark != '.' and \ re.match(r'rs', peak_mark.strip()): dbsnp_id = 'dbSNP:'+peak_mark.strip() model.addIndividualToGraph( dbsnp_id, None, self.globaltt['sequence_alteration']) model.addXref(qtl_id, dbsnp_id) gene_id = gene_id.replace('uncharacterized ', '').strip() if gene_id is not None and gene_id != '' and gene_id != '.'\ and re.fullmatch(r'[^ ]*', gene_id) is not None: # we assume if no src is provided and gene_id is an integer, # then it is an NCBI gene ... (okay, lets crank that back a notch) if gene_id_src == '' and gene_id.isdigit() and \ gene_id in self.gene_info: # LOG.info( # 'Warm & Fuzzy saying %s is a NCBI gene for %s', # gene_id, common_name) gene_id_src = 'NCBIgene' elif gene_id_src == '' and gene_id.isdigit(): LOG.warning( 'Cold & Prickely saying %s is a NCBI gene for %s', gene_id, common_name) gene_id_src = 'NCBIgene' elif gene_id_src == '': LOG.error( ' "%s" is a NOT NCBI gene for %s', gene_id, common_name) gene_id_src = None if gene_id_src == 'NCBIgene': gene_id = 'NCBIGene:' + gene_id # we will expect that these will get labels elsewhere geno.addGene(gene_id, None) # FIXME what is the right relationship here? geno.addAffectedLocus(qtl_id, gene_id) if dbsnp_id is not None: # add the rsid as a seq alt of the gene_id vl_id = '_:' + re.sub( r':', '', gene_id) + '-' + peak_mark.strip() geno.addSequenceAlterationToVariantLocus( dbsnp_id, vl_id) geno.addAffectedLocus(vl_id, gene_id) # add the trait model.addClassToGraph(trait_id, trait_name) # Add publication reference = None if re.match(r'ISU.*', pubmed_id): pub_id = 'AQTLPub:'+pubmed_id.strip() reference = Reference(graph, pub_id) elif pubmed_id != '': pub_id = 'PMID:' + pubmed_id.strip() reference = Reference( graph, pub_id, self.globaltt['journal article']) if reference is not None: reference.addRefToGraph() # make the association to the QTL assoc = G2PAssoc( graph, self.name, qtl_id, trait_id, self.globaltt['is marker for']) assoc.add_evidence(eco_id) assoc.add_source(pub_id) # create a description from the contents of the file # desc = '' # assoc.addDescription(g, assoc_id, desc) # TODO add exp_id as evidence # if exp_id != '': # exp_id = 'AQTLExp:'+exp_id # gu.addIndividualToGraph(g, exp_id, None, eco_id) if p_values != '': scr = re.sub(r'<', '', p_values) scr = re.sub(r',', '.', scr) # international notation if scr.isnumeric(): score = float(scr) assoc.set_score(score) # todo add score type # TODO add LOD score? assoc.add_association_to_graph() # make the association to the dbsnp_id, if found if dbsnp_id is not None: # make the association to the dbsnp_id assoc = G2PAssoc( graph, self.name, dbsnp_id, trait_id, self.globaltt['is marker for']) assoc.add_evidence(eco_id) assoc.add_source(pub_id) # create a description from the contents of the file # desc = '' # assoc.addDescription(g, assoc_id, desc) # TODO add exp_id # if exp_id != '': # exp_id = 'AQTLExp:'+exp_id # gu.addIndividualToGraph(g, exp_id, None, eco_id) if p_values != '': scr = re.sub(r'<', '', p_values) scr = re.sub(r',', '.', scr) if scr.isnumeric(): score = float(scr) assoc.set_score(score) # todo add score type # TODO add LOD score? assoc.add_association_to_graph() if not self.testMode and limit is not None and line_counter > limit: break LOG.info("Done with QTL genetic info") return
def _process_omim2gene(self, limit=None): """ This method maps the OMIM IDs and KEGG gene ID. Currently split based on the link_type field. Equivalent link types are mapped as gene XRefs. Reverse link types are mapped as disease to gene associations. Original link types are currently skipped. Triples created: <kegg_gene_id> is a Gene <omim_gene_id> is a Gene <kegg_gene_id>> hasXref <omim_gene_id> <assoc_id> has subject <omim_disease_id> <assoc_id> has object <kegg_gene_id> :param limit: :return: """ LOG.info("Processing OMIM to KEGG gene") if self.test_mode: graph = self.testgraph else: graph = self.graph model = Model(graph) geno = Genotype(graph) raw = '/'.join((self.rawdir, self.files['omim2gene']['file'])) with open(raw, 'r', encoding="iso-8859-1") as csvfile: reader = csv.reader(csvfile, delimiter='\t', quotechar='\"') for row in reader: (kegg_gene_id, omim_id, link_type) = row if self.test_mode and kegg_gene_id not in self.test_ids['genes']: continue kegg_gene_id = 'KEGG-' + kegg_gene_id.strip() omim_id = re.sub(r'omim', 'OMIM', omim_id) if link_type == 'equivalent': # these are genes! # so add them as a class then make equivalence model.addClassToGraph(omim_id, None) geno.addGene(kegg_gene_id, None) # previous: if omim type is not disease-ish then use # now is: if omim type is gene then use if omim_id in self.omim_replaced: repl = self.omim_replaced[omim_id] for omim in repl: if omim in self.omim_type and \ self.omim_type[omim] == self.globaltt['gene']: omim_id = omim if omim_id in self.omim_type and \ self.omim_type[omim_id] == self.globaltt['gene']: model.addEquivalentClass(kegg_gene_id, omim_id) elif link_type == 'reverse': # make an association between an OMIM ID & the KEGG gene ID # we do this with omim ids because # they are more atomic than KEGG ids alt_locus_id = self._make_variant_locus_id(kegg_gene_id, omim_id) alt_label = self.label_hash[alt_locus_id] model.addIndividualToGraph( alt_locus_id, alt_label, self.globaltt['variant_locus']) geno.addAffectedLocus(alt_locus_id, kegg_gene_id) model.addBlankNodeAnnotation(alt_locus_id) # Add the disease to gene relationship. rel = self.globaltt['is marker for'] assoc = G2PAssoc(graph, self.name, alt_locus_id, omim_id, rel) assoc.add_association_to_graph() elif link_type == 'original': # these are sometimes a gene, and sometimes a disease LOG.info( 'Unable to handle original link for %s-%s', kegg_gene_id, omim_id) else: # don't know what these are LOG.warning( 'Unhandled link type for %s-%s: %s', kegg_gene_id, omim_id, link_type) if (not self.test_mode) and ( limit is not None and reader.line_num > limit): break LOG.info("Done with OMIM to KEGG gene")
def _process_genes(self, limit=None): """ This method processes the KEGG gene IDs. The label for the gene is pulled as the first symbol in the list of gene symbols; the rest are added as synonyms. The long-form of the gene name is added as a definition. This is hardcoded to just processes human genes. Triples created: <gene_id> is a SO:gene <gene_id> rdfs:label <gene_name> :param limit: :return: """ LOG.info("Processing genes") if self.test_mode: graph = self.testgraph else: graph = self.graph model = Model(graph) family = Family(graph) geno = Genotype(graph) raw = '/'.join((self.rawdir, self.files['hsa_genes']['file'])) with open(raw, 'r', encoding="iso-8859-1") as csvfile: reader = csv.reader(csvfile, delimiter='\t', quotechar='\"') for row in reader: (gene_id, gene_name) = row gene_id = 'KEGG-'+gene_id.strip() # the gene listing has a bunch of labels # that are delimited, as: # DST, BP240, BPA, BPAG1, CATX-15, CATX15, D6S1101, DMH, DT, # EBSB2, HSAN6, MACF2; dystonin; K10382 dystonin # it looks like the list is semicolon delimited # (symbol, name, gene_class) # where the symbol is a comma-delimited list # here, we split them up. # we will take the first abbreviation and make it the symbol # then take the rest as synonyms gene_stuff = re.split('r;', gene_name) symbollist = re.split(r',', gene_stuff[0]) first_symbol = symbollist[0].strip() if gene_id not in self.label_hash: self.label_hash[gene_id] = first_symbol if self.test_mode and gene_id not in self.test_ids['genes']: continue # Add the gene as a class. geno.addGene(gene_id, first_symbol) # add the long name as the description if len(gene_stuff) > 1: description = gene_stuff[1].strip() model.addDefinition(gene_id, description) # add the rest of the symbols as synonyms for i in enumerate(symbollist, start=1): model.addSynonym(gene_id, i[1].strip()) if len(gene_stuff) > 2: ko_part = gene_stuff[2] ko_match = re.search(r'K\d+', ko_part) if ko_match is not None and len(ko_match.groups()) == 1: ko = 'KEGG-ko:'+ko_match.group(1) family.addMemberOf(gene_id, ko) if not self.test_mode and limit is not None and reader.line_num > limit: break LOG.info("Done with genes")
def _process_data(self, raw, limit=None): logger.info("Processing Data from %s", raw) if self.testMode: g = self.testgraph else: g = self.graph model = Model(g) geno = Genotype(g) line_counter = 0 impc_map = self.open_and_parse_yaml(self.map_files['impc_map']) impress_map = json.loads( self.fetch_from_url( self.map_files['impress_map']).read().decode('utf-8')) # Add the taxon as a class taxon_id = 'NCBITaxon:10090' # map to Mus musculus model.addClassToGraph(taxon_id, None) # with open(raw, 'r', encoding="utf8") as csvfile: with gzip.open(raw, 'rt') as csvfile: filereader = csv.reader(csvfile, delimiter=',', quotechar='\"') next(filereader, None) # skip the header row for row in filereader: line_counter += 1 (marker_accession_id, marker_symbol, phenotyping_center, colony, sex, zygosity, allele_accession_id, allele_symbol, allele_name, strain_accession_id, strain_name, project_name, project_fullname, pipeline_name, pipeline_stable_id, procedure_stable_id, procedure_name, parameter_stable_id, parameter_name, top_level_mp_term_id, top_level_mp_term_name, mp_term_id, mp_term_name, p_value, percentage_change, effect_size, statistical_method, resource_name) = row if self.testMode and marker_accession_id not in self.test_ids: continue # ##### cleanup some of the identifiers ###### zygosity_id = self._map_zygosity(zygosity) # colony ids sometimes have <> in them, spaces, # or other non-alphanumerics and break our system; # replace these with underscores colony_id = '_:' + re.sub(r'\W+', '_', colony) if not re.match(r'MGI', allele_accession_id): allele_accession_id = \ '_:IMPC-'+re.sub(r':', '', allele_accession_id) if re.search(r'EUROCURATE', strain_accession_id): # the eurocurate links don't resolve at IMPC strain_accession_id = '_:' + strain_accession_id elif not re.match(r'MGI', strain_accession_id): logger.info("Found a strange strain accession...%s", strain_accession_id) strain_accession_id = 'IMPC:' + strain_accession_id ###################### # first, add the marker and variant to the graph as with MGI, # the allele is the variant locus. IF the marker is not known, # we will call it a sequence alteration. otherwise, # we will create a BNode for the sequence alteration. sequence_alteration_id = variant_locus_id = None variant_locus_name = sequence_alteration_name = None # extract out what's within the <> to get the symbol if re.match(r'.*<.*>', allele_symbol): sequence_alteration_name = \ re.match(r'.*<(.*)>', allele_symbol).group(1) else: sequence_alteration_name = allele_symbol if marker_accession_id is not None and \ marker_accession_id == '': logger.warning("Marker unspecified on row %d", line_counter) marker_accession_id = None if marker_accession_id is not None: variant_locus_id = allele_accession_id variant_locus_name = allele_symbol variant_locus_type = geno.genoparts['variant_locus'] geno.addGene(marker_accession_id, marker_symbol, geno.genoparts['gene']) geno.addAllele(variant_locus_id, variant_locus_name, variant_locus_type, None) geno.addAlleleOfGene(variant_locus_id, marker_accession_id) sequence_alteration_id = \ '_:seqalt'+re.sub(r':', '', allele_accession_id) geno.addSequenceAlterationToVariantLocus( sequence_alteration_id, variant_locus_id) else: sequence_alteration_id = allele_accession_id # IMPC contains targeted mutations with either gene traps, # knockouts, insertion/intragenic deletions. # but I don't really know what the SeqAlt is here, # so I don't add it. geno.addSequenceAlteration(sequence_alteration_id, sequence_alteration_name) # ############# BUILD THE COLONY ############# # First, let's describe the colony that the animals come from # The Colony ID refers to the ES cell clone # used to generate a mouse strain. # Terry sez: we use this clone ID to track # ES cell -> mouse strain -> mouse phenotyping. # The same ES clone maybe used at multiple centers, # so we have to concatenate the two to have a unique ID. # some useful reading about generating mice from ES cells: # http://ki.mit.edu/sbc/escell/services/details # here, we'll make a genotype # that derives from an ES cell with a given allele. # the strain is not really attached to the colony. # the colony/clone is reflective of the allele, # with unknown zygosity stem_cell_class = 'ERO:0002002' model.addIndividualToGraph(colony_id, colony, stem_cell_class) # vslc of the colony has unknown zygosity # note that we will define the allele # (and it's relationship to the marker, etc.) later # FIXME is it really necessary to create this vslc # when we always know it's unknown zygosity? vslc_colony = \ '_:'+re.sub(r':', '', allele_accession_id+geno.zygosity['indeterminate']) vslc_colony_label = allele_symbol + '/<?>' # for ease of reading, we make the colony genotype variables. # in the future, it might be desired to keep the vslcs colony_genotype_id = vslc_colony colony_genotype_label = vslc_colony_label geno.addGenotype(colony_genotype_id, colony_genotype_label) geno.addParts(allele_accession_id, colony_genotype_id, geno.object_properties['has_alternate_part']) geno.addPartsToVSLC( vslc_colony, allele_accession_id, None, geno.zygosity['indeterminate'], geno.object_properties['has_alternate_part']) g.addTriple(colony_id, geno.object_properties['has_genotype'], colony_genotype_id) # ########## BUILD THE ANNOTATED GENOTYPE ########## # now, we'll build the genotype of the individual that derives # from the colony/clone genotype that is attached to # phenotype = colony_id + strain + zygosity + sex # (and is derived from a colony) # this is a sex-agnostic genotype genotype_id = \ self.make_id( (colony_id + phenotyping_center + zygosity + strain_accession_id)) geno.addSequenceDerivesFrom(genotype_id, colony_id) # build the VSLC of the sex-agnostic genotype # based on the zygosity allele1_id = allele_accession_id allele2_id = allele2_rel = None allele1_label = allele_symbol allele2_label = '<?>' # Making VSLC labels from the various parts, # can change later if desired. if zygosity == 'heterozygote': allele2_label = re.sub(r'<.*', '<+>', allele1_label) allele2_id = None elif zygosity == 'homozygote': allele2_label = allele1_label allele2_id = allele1_id allele2_rel = geno.object_properties['has_alternate_part'] elif zygosity == 'hemizygote': allele2_label = re.sub(r'<.*', '<0>', allele1_label) allele2_id = None elif zygosity == 'not_applicable': allele2_label = re.sub(r'<.*', '<?>', allele1_label) allele2_id = None else: logger.warning("found unknown zygosity %s", zygosity) break vslc_name = '/'.join((allele1_label, allele2_label)) # Add the VSLC vslc_id = '-'.join( (marker_accession_id, allele_accession_id, zygosity)) vslc_id = re.sub(r':', '', vslc_id) vslc_id = '_:' + vslc_id model.addIndividualToGraph( vslc_id, vslc_name, geno.genoparts['variant_single_locus_complement']) geno.addPartsToVSLC( vslc_id, allele1_id, allele2_id, zygosity_id, geno.object_properties['has_alternate_part'], allele2_rel) # add vslc to genotype geno.addVSLCtoParent(vslc_id, genotype_id) # note that the vslc is also the gvc model.addType( vslc_id, Genotype.genoparts['genomic_variation_complement']) # Add the genomic background # create the genomic background id and name if strain_accession_id != '': genomic_background_id = strain_accession_id else: genomic_background_id = None genotype_name = vslc_name if genomic_background_id is not None: geno.addGenotype(genomic_background_id, strain_name, geno.genoparts['genomic_background']) # make a phenotyping-center-specific strain # to use as the background pheno_center_strain_label = \ strain_name + '-' + phenotyping_center + '-' + colony pheno_center_strain_id = \ '-'.join((re.sub(r':', '', genomic_background_id), re.sub(r'\s', '_', phenotyping_center), re.sub(r'\W+', '', colony))) if not re.match(r'^_', pheno_center_strain_id): pheno_center_strain_id = '_:' + pheno_center_strain_id geno.addGenotype(pheno_center_strain_id, pheno_center_strain_label, geno.genoparts['genomic_background']) geno.addSequenceDerivesFrom(pheno_center_strain_id, genomic_background_id) # Making genotype labels from the various parts, # can change later if desired. # since the genotype is reflective of the place # it got made, should put that in to disambiguate genotype_name = \ genotype_name+' ['+pheno_center_strain_label+']' geno.addGenomicBackgroundToGenotype( pheno_center_strain_id, genotype_id) geno.addTaxon(taxon_id, pheno_center_strain_id) # this is redundant, but i'll keep in in for now geno.addSequenceDerivesFrom(genotype_id, colony_id) geno.addGenotype(genotype_id, genotype_name) # Make the sex-qualified genotype, # which is what the phenotype is associated with sex_qualified_genotype_id = \ self.make_id( (colony_id + phenotyping_center + zygosity + strain_accession_id+sex)) sex_qualified_genotype_label = genotype_name + ' (' + sex + ')' if sex == 'male': sq_type_id = geno.genoparts['male_genotype'] elif sex == 'female': sq_type_id = geno.genoparts['female_genotype'] else: sq_type_id = geno.genoparts['sex_qualified_genotype'] geno.addGenotype(sex_qualified_genotype_id, sex_qualified_genotype_label, sq_type_id) geno.addParts(genotype_id, sex_qualified_genotype_id, geno.object_properties['has_alternate_part']) if genomic_background_id is not None and \ genomic_background_id != '': # Add the taxon to the genomic_background_id geno.addTaxon(taxon_id, genomic_background_id) else: # add it as the genomic background geno.addTaxon(taxon_id, genotype_id) # ############# BUILD THE G2P ASSOC ############# # from an old email dated July 23 2014: # Phenotypes associations are made to # imits colony_id+center+zygosity+gender phenotype_id = mp_term_id # it seems that sometimes phenotype ids are missing. # indicate here if phenotype_id is None or phenotype_id == '': logger.warning("No phenotype id specified for row %d: %s", line_counter, str(row)) continue # hard coded ECO code eco_id = "ECO:0000015" # the association comes as a result of a g2p from # a procedure in a pipeline at a center and parameter tested assoc = G2PAssoc(g, self.name, sex_qualified_genotype_id, phenotype_id) assoc.add_evidence(eco_id) # assoc.set_score(float(p_value)) # TODO add evidence instance using # pipeline_stable_id + # procedure_stable_id + # parameter_stable_id assoc.add_association_to_graph() assoc_id = assoc.get_association_id() # add a free-text description try: description = \ ' '.join((mp_term_name, 'phenotype determined by', phenotyping_center, 'in an', procedure_name, 'assay where', parameter_name.strip(), 'was measured with an effect_size of', str(round(float(effect_size), 5)), '(p =', "{:.4e}".format(float(p_value)), ').')) except ValueError: description = \ ' '.join((mp_term_name, 'phenotype determined by', phenotyping_center, 'in an', procedure_name, 'assay where', parameter_name.strip(), 'was measured with an effect_size of', str(effect_size), '(p =', "{0}".format(p_value), ').')) study_bnode = \ self._add_study_provenance( impc_map, impress_map, phenotyping_center, colony, project_fullname, pipeline_name, pipeline_stable_id, procedure_stable_id, procedure_name, parameter_stable_id, parameter_name, statistical_method, resource_name) evidence_line_bnode = \ self._add_evidence( assoc_id, eco_id, impc_map, p_value, percentage_change, effect_size, study_bnode) self._add_assertion_provenance(assoc_id, evidence_line_bnode, impc_map) model.addDescription(evidence_line_bnode, description) # resource_id = resource_name # assoc.addSource(g, assoc_id, resource_id) if not self.testMode and \ limit is not None and line_counter > limit: break return
def _process_phenotype_data(self, limit): """ NOTE: If a Strain carries more than one mutation, then each Mutation description, i.e., the set: ( Mutation Type - Chromosome - Gene Symbol - Gene Name - Allele Symbol - Allele Name) will require a separate line. Note that MMRRC curates phenotypes to alleles, even though they distribute only one file with the phenotypes appearing to be associated with a strain. So, here we process the allele-to-phenotype relationships separately from the strain-to-allele relationships. :param limit: :return: """ src_key = 'catalog' if self.test_mode: graph = self.testgraph else: graph = self.graph model = Model(graph) fname = '/'.join((self.rawdir, self.files[src_key]['file'])) self.strain_hash = {} self.id_label_hash = {} genes_with_no_ids = set() stem_cell_class = self.globaltt['stem cell'] mouse_taxon = self.globaltt['Mus musculus'] geno = Genotype(graph) with open(fname, 'r', encoding="utf8") as csvfile: reader = csv.reader(csvfile, delimiter=',', quotechar='\"') # First line is header not date/version info. This changed recently, # apparently as of Sep 2019. Also, 3rd line is no longer blank. row = [x.strip() for x in next(reader)] # messy messy col = self.files['catalog']['columns'] strain_missing_allele = [] # to count the ones w/insufficent info if not self.check_fileheader(col, row): pass for row in reader: strain_id = row[col.index('STRAIN/STOCK_ID')].strip() strain_label = row[col.index('STRAIN/STOCK_DESIGNATION')] # strain_type_symbol = row[col.index('STRAIN_TYPE')] strain_state = row[col.index('STATE')] mgi_allele_id = row[col.index( 'MGI_ALLELE_ACCESSION_ID')].strip() mgi_allele_symbol = row[col.index('ALLELE_SYMBOL')] # mgi_allele_name = row[col.index('ALLELE_NAME')] # mutation_type = row[col.index('MUTATION_TYPE')] # chrom = row[col.index('CHROMOSOME')] mgi_gene_id = row[col.index('MGI_GENE_ACCESSION_ID')].strip() mgi_gene_symbol = row[col.index('GENE_SYMBOL')].strip() mgi_gene_name = row[col.index('GENE_NAME')] # sds_url = row[col.index('SDS_URL')] # accepted_date = row[col.index('ACCEPTED_DATE')] mpt_ids = row[col.index('MPT_IDS')].strip() pubmed_nums = row[col.index('PUBMED_IDS')].strip() research_areas = row[col.index('RESEARCH_AREAS')].strip() if self.test_mode and (strain_id not in self.test_ids) \ or mgi_gene_name == 'withdrawn': continue # strip off stuff after the dash - # is the holding center important? # MMRRC:00001-UNC --> MMRRC:00001 strain_id = re.sub(r'-\w+$', '', strain_id) self.id_label_hash[strain_id] = strain_label # get the variant or gene to save for later building of # the genotype if strain_id not in self.strain_hash: self.strain_hash[strain_id] = { 'variants': set(), 'genes': set() } # flag bad ones if mgi_allele_id[:4] != 'MGI:' and mgi_allele_id != '': LOG.error("Erroneous MGI allele id: %s", mgi_allele_id) if mgi_allele_id[:3] == 'MG:': mgi_allele_id = 'MGI:' + mgi_allele_id[3:] else: mgi_allele_id = '' if mgi_allele_id != '': self.strain_hash[strain_id]['variants'].add(mgi_allele_id) self.id_label_hash[mgi_allele_id] = mgi_allele_symbol # use the following if needing to add the sequence alteration types # var_type = self.localtt[mutation_type] # make a sequence alteration for this variant locus, # and link the variation type to it # sa_id = '_'+re.sub(r':','',mgi_allele_id)+'SA' # if self.nobnodes: # sa_id = ':'+sa_id # gu.addIndividualToGraph(g, sa_id, None, var_type) # geno.addSequenceAlterationToVariantLocus(sa_id, mgi_allele_id) # scrub out any spaces, fix known issues mgi_gene_id = re.sub(r'\s+', '', mgi_gene_id) if mgi_gene_id == 'NULL': mgi_gene_id = '' elif mgi_gene_id[:7] == 'GeneID:': mgi_gene_id = 'NCBIGene:' + mgi_gene_id[7:] if mgi_gene_id != '': try: [curie, localid] = mgi_gene_id.split(':') except ValueError as verror: LOG.warning( "Problem parsing mgi_gene_id %s from file %s: %s", mgi_gene_id, fname, verror) if curie not in ['MGI', 'NCBIGene']: LOG.info("MGI Gene id not recognized: %s", mgi_gene_id) self.strain_hash[strain_id]['genes'].add(mgi_gene_id) self.id_label_hash[mgi_gene_id] = mgi_gene_symbol # catch some errors - too many. report summary at the end # some things have gene labels, but no identifiers - report if mgi_gene_symbol != '' and mgi_gene_id == '': # LOG.error( # "Gene label with no MGI identifier for strain %s: %s", # strain_id, mgi_gene_symbol) genes_with_no_ids.add(mgi_gene_symbol) # make a temp id for genes that aren't identified ... err wow. # tmp_gene_id = '_' + mgi_gene_symbol # self.id_label_hash[tmp_gene_id.strip()] = mgi_gene_symbol # self.strain_hash[strain_id]['genes'].add(tmp_gene_id) # split apart the mp ids # ataxia [MP:0001393] ,hypoactivity [MP:0001402] ... # mpt_ids are a comma delimited list # labels with MP terms following in brackets phenotype_ids = [] if mpt_ids != '': for lb_mp in mpt_ids.split(r','): lb_mp = lb_mp.strip() if lb_mp[-1:] == ']' and lb_mp[-12:-8] == '[MP:': phenotype_ids.append(lb_mp[-11:-2]) # pubmed ids are space delimited pubmed_ids = [] if pubmed_nums != '': for pm_num in re.split(r'\s+', pubmed_nums): pmid = 'PMID:' + pm_num.strip() pubmed_ids.append(pmid) ref = Reference(graph, pmid, self.globaltt['journal article']) ref.addRefToGraph() # https://www.mmrrc.org/catalog/sds.php?mmrrc_id=00001 # is a good example of 4 genotype parts model.addClassToGraph(mouse_taxon, None) if research_areas == '': research_areas = None else: research_areas = 'Research Areas: ' + research_areas strain_type = mouse_taxon if strain_state == 'ES': strain_type = stem_cell_class model.addIndividualToGraph( # an inst of mouse?? strain_id, strain_label, strain_type, research_areas) model.makeLeader(strain_id) # phenotypes are associated with the alleles for pid in phenotype_ids: # assume the phenotype label is in some ontology model.addClassToGraph(pid, None) if mgi_allele_id is not None and mgi_allele_id != '': assoc = G2PAssoc(graph, self.name, mgi_allele_id, pid, self.globaltt['has phenotype']) for p in pubmed_ids: assoc.add_source(p) assoc.add_association_to_graph() else: # too chatty here. report aggregate # LOG.info("Phenotypes and no allele for %s", strain_id) strain_missing_allele.append(strain_id) if not self.test_mode and (limit is not None and reader.line_num > limit): break # report misses if strain_missing_allele: LOG.info("Phenotypes and no allele for %i strains", len(strain_missing_allele)) # now that we've collected all of the variant information, build it # we don't know their zygosities for s in self.strain_hash: h = self.strain_hash.get(s) variants = h['variants'] genes = h['genes'] vl_set = set() # make variant loci for each gene if variants: for var in variants: vl_id = var.strip() vl_symbol = self.id_label_hash[vl_id] geno.addAllele(vl_id, vl_symbol, self.globaltt['variant_locus']) vl_set.add(vl_id) if len(variants) == 1 and len(genes) == 1: for gene in genes: geno.addAlleleOfGene(vl_id, gene) else: geno.addAllele(vl_id, vl_symbol) else: # len(vars) == 0 # it's just anonymous variants in some gene for gene in genes: vl_id = '_:' + re.sub(r':', '', gene) + '-VL' vl_symbol = self.id_label_hash[gene] + '<?>' self.id_label_hash[vl_id] = vl_symbol geno.addAllele(vl_id, vl_symbol, self.globaltt['variant_locus']) geno.addGene(gene, self.id_label_hash[gene]) geno.addAlleleOfGene(vl_id, gene) vl_set.add(vl_id) # make the vslcs vl_list = sorted(vl_set) vslc_list = [] for vl in vl_list: # for unknown zygosity vslc_id = re.sub(r'^_', '', vl) + 'U' vslc_id = re.sub(r':', '', vslc_id) vslc_id = '_:' + vslc_id vslc_label = self.id_label_hash[vl] + '/?' self.id_label_hash[vslc_id] = vslc_label vslc_list.append(vslc_id) geno.addPartsToVSLC(vslc_id, vl, None, self.globaltt['indeterminate'], self.globaltt['has_variant_part'], None) model.addIndividualToGraph( vslc_id, vslc_label, self.globaltt['variant single locus complement']) if vslc_list: if len(vslc_list) > 1: gvc_id = '-'.join(vslc_list) gvc_id = re.sub(r'_|:', '', gvc_id) gvc_id = '_:' + gvc_id gvc_label = '; '.join(self.id_label_hash[v] for v in vslc_list) model.addIndividualToGraph( gvc_id, gvc_label, self.globaltt['genomic_variation_complement']) for vslc_id in vslc_list: geno.addVSLCtoParent(vslc_id, gvc_id) else: # the GVC == VSLC, so don't have to make an extra piece gvc_id = vslc_list.pop() gvc_label = self.id_label_hash[gvc_id] genotype_label = gvc_label + ' [n.s.]' bkgd_id = re.sub( r':', '', '-'.join( (self.globaltt['unspecified_genomic_background'], s))) genotype_id = '-'.join((gvc_id, bkgd_id)) bkgd_id = '_:' + bkgd_id geno.addTaxon(mouse_taxon, bkgd_id) geno.addGenomicBackground( bkgd_id, 'unspecified (' + s + ')', self.globaltt['unspecified_genomic_background'], "A placeholder for the unspecified genetic background for " + s) geno.addGenomicBackgroundToGenotype( bkgd_id, genotype_id, self.globaltt['unspecified_genomic_background']) geno.addParts(gvc_id, genotype_id, self.globaltt['has_variant_part']) geno.addGenotype(genotype_id, genotype_label) graph.addTriple(s, self.globaltt['has_genotype'], genotype_id) else: # LOG.debug( # "Strain %s is not making a proper genotype.", s) pass LOG.warning( "The following gene symbols did not list identifiers: %s", str(sorted(list(genes_with_no_ids)))) LOG.error('%i symbols given are missing their gene identifiers', len(genes_with_no_ids)) return
def _process_allele_gene(self, limit): """ Make associations between an allele and a gene Adds triples to self.graph Approach is to use the label nomenclature and species map to determine taxon. Foreign Transgenes are filtered out. :param limit: number of rows to process :return: None """ geno = Genotype(self.graph) species_map = self._species_to_ncbi_tax() src_key = 'allele_gene' raw = '/'.join((self.rawdir, self.files[src_key]['file'])) LOG.info("processing allele to gene") col = self.files[src_key]['columns'] with gzip.open(raw, 'rt') as tsvfile: reader = csv.reader(tsvfile, delimiter='\t') # skip first line, version info next(reader) row = next(reader) # headers # header line starts with a hash and tab ?? row = row[1:] self.check_fileheader(col, row) for row in reader: allele_id = row[col.index('AlleleID')] allele_label = row[col.index('AlleleSymbol')] gene_id = row[col.index('GeneID')] gene_label = row[col.index('GeneSymbol')] allele_curie = 'FlyBase:' + allele_id gene_curie = 'FlyBase:' + gene_id # Add Allele and taxon, skip anything that's not drosophila allele_prefix = re.findall(r'^(\w*)\\', allele_label) if len(allele_prefix) == 1: try: if species_map[allele_prefix[0]][0] == 'drosophilid': geno.addAllele(allele_curie, allele_label) geno.addTaxon(species_map[allele_prefix[0]][1], allele_curie) else: # If it's a foreign transgenic allele, skip continue except KeyError: LOG.info("%s not in species prefix file", allele_prefix[0]) continue elif not allele_prefix: geno.addAllele(allele_curie, allele_label) geno.addTaxon(self.globaltt['Drosophila melanogaster'], allele_curie) else: raise ValueError( "Did not correctly parse allele label {}".format( allele_label)) # Process genes gene_prefix = re.findall(r'^(\w*)\\', gene_label) if len(gene_prefix) == 1: try: geno.addTaxon(species_map[gene_prefix[0]][1], gene_curie) if species_map[gene_prefix[0]][0] == 'drosophilid': geno.addGene(gene_curie, gene_label) else: # Don't create labels for non drosophila genes geno.addGene(gene_curie) except KeyError: LOG.info("%s not in species prefix file", gene_prefix[0]) geno.addGene(gene_curie) elif not gene_prefix: geno.addGene(gene_curie, gene_label) geno.addTaxon(self.globaltt['Drosophila melanogaster'], allele_curie) else: raise ValueError( "Did not correct parse gene label {}".format( gene_label)) # Connect allele and gene with geno.addAffectedLocus() if allele_prefix and gene_prefix: if allele_prefix[0] == gene_prefix[0]: geno.addAffectedLocus(allele_curie, gene_curie) else: raise ValueError( "Found allele and gene with different " "prefixes: {}, {}".format(allele_id, gene_id)) elif not allele_prefix and gene_prefix: raise ValueError("Found allele and gene with different " "prefixes: {}, {}".format( allele_id, gene_id)) else: # Both are melanogaster geno.addAffectedLocus(allele_curie, gene_curie) if limit is not None and reader.line_num > limit: break
def _process_genes(self, limit=None): """ This method processes the KEGG gene IDs. The label for the gene is pulled as the first symbol in the list of gene symbols; the rest are added as synonyms. The long-form of the gene name is added as a definition. This is hardcoded to just processes human genes. Triples created: <gene_id> is a SO:gene <gene_id> rdfs:label <gene_name> :param limit: :return: """ LOG.info("Processing genes") if self.test_mode: graph = self.testgraph else: graph = self.graph model = Model(graph) line_counter = 0 family = Family(graph) geno = Genotype(graph) raw = '/'.join((self.rawdir, self.files['hsa_genes']['file'])) with open(raw, 'r', encoding="iso-8859-1") as csvfile: filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"') for row in filereader: line_counter += 1 (gene_id, gene_name) = row gene_id = 'KEGG-'+gene_id.strip() # the gene listing has a bunch of labels # that are delimited, as: # DST, BP240, BPA, BPAG1, CATX-15, CATX15, D6S1101, DMH, DT, # EBSB2, HSAN6, MACF2; dystonin; K10382 dystonin # it looks like the list is semicolon delimited # (symbol, name, gene_class) # where the symbol is a comma-delimited list # here, we split them up. # we will take the first abbreviation and make it the symbol # then take the rest as synonyms gene_stuff = re.split('r;', gene_name) symbollist = re.split(r',', gene_stuff[0]) first_symbol = symbollist[0].strip() if gene_id not in self.label_hash: self.label_hash[gene_id] = first_symbol if self.test_mode and gene_id not in self.test_ids['genes']: continue # Add the gene as a class. geno.addGene(gene_id, first_symbol) # add the long name as the description if len(gene_stuff) > 1: description = gene_stuff[1].strip() model.addDefinition(gene_id, description) # add the rest of the symbols as synonyms for i in enumerate(symbollist, start=1): model.addSynonym(gene_id, i[1].strip()) if len(gene_stuff) > 2: ko_part = gene_stuff[2] ko_match = re.search(r'K\d+', ko_part) if ko_match is not None and len(ko_match.groups()) == 1: ko = 'KEGG-ko:'+ko_match.group(1) family.addMemberOf(gene_id, ko) if not self.test_mode and limit is not None and line_counter > limit: break LOG.info("Done with genes") return
def _process_genes(self, limit=None): """ This method processes the KEGG gene IDs. The label for the gene is pulled as the first symbol in the list of gene symbols; the rest are added as synonyms. The long-form of the gene name is added as a definition. This is hardcoded to just processes human genes. Triples created: <gene_id> is a SO:gene <gene_id> rdfs:label <gene_name> :param limit: :return: """ logger.info("Processing genes") if self.testMode: g = self.testgraph else: g = self.graph line_counter = 0 gu = GraphUtils(curie_map.get()) geno = Genotype(g) raw = '/'.join((self.rawdir, self.files['hsa_genes']['file'])) with open(raw, 'r', encoding="iso-8859-1") as csvfile: filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"') for row in filereader: line_counter += 1 (gene_id, gene_name) = row gene_id = 'KEGG-'+gene_id.strip() # the gene listing has a bunch of labels that are delimited, like: # DST, BP240, BPA, BPAG1, CATX-15, CATX15, D6S1101, DMH, DT, EBSB2, HSAN6, MACF2; dystonin; K10382 dystonin # it looks like the list is semicolon delimited (symbol, name, gene_class) # where the symbol is a comma-delimited list # here, we split them up. we will take the first abbreviation and make it the symbol # then take the rest as synonyms gene_stuff = re.split(';', gene_name) symbollist = re.split(',', gene_stuff[0]) first_symbol = symbollist[0].strip() if gene_id not in self.label_hash: self.label_hash[gene_id] = first_symbol if self.testMode and gene_id not in self.test_ids['genes']: continue # Add the gene as a class. geno.addGene(gene_id, first_symbol) # add the long name as the description if len(gene_stuff) > 1: description = gene_stuff[1].strip() gu.addDefinition(g, gene_id, description) # add the rest of the symbols as synonyms for i in enumerate(symbollist, start=1): gu.addSynonym(g, gene_id, i[1].strip()) # TODO add the KO here? if (not self.testMode) and (limit is not None and line_counter > limit): break logger.info("Done with genes") return
def _process_phenotype_data(self, limit): """ NOTE: If a Strain carries more than one mutation, then each Mutation description, i.e., the set: ( Mutation Type - Chromosome - Gene Symbol - Gene Name - Allele Symbol - Allele Name) will require a separate line. Note that MMRRC curates phenotypes to alleles, even though they distribute only one file with the phenotypes appearing to be associated with a strain. So, here we process the allele-to-phenotype relationships separately from the strain-to-allele relationships. :param limit: :return: """ if self.testMode: g = self.testgraph else: g = self.graph line_counter = 0 gu = GraphUtils(curie_map.get()) fname = '/'.join((self.rawdir, self.files['catalog']['file'])) self.strain_hash = {} self.id_label_hash = {} genes_with_no_ids = set() stem_cell_class = 'CL:0000034' mouse_taxon = 'NCBITaxon:10090' geno = Genotype(g) with open(fname, 'r', encoding="utf8") as csvfile: filereader = csv.reader(csvfile, delimiter=',', quotechar='\"') for row in filereader: line_counter += 1 # skip the first 3 lines which are header, etc. if line_counter < 4: continue (strain_id, strain_label, strain_type_symbol, strain_state, mgi_allele_id, mgi_allele_symbol, mgi_allele_name, mutation_type, chrom, mgi_gene_id, mgi_gene_symbol, mgi_gene_name, sds_url, accepted_date, mp_ids, pubmed_nums, research_areas) = row if self.testMode and (strain_id not in self.test_ids): continue # strip off stuff after the dash - # is the holding center important? # MMRRC:00001-UNC --> MMRRC:00001 strain_id = re.sub(r'-\w+$', '', strain_id) self.id_label_hash[strain_id] = strain_label # get the variant or gene to save for later building of # the genotype if strain_id not in self.strain_hash: self.strain_hash[strain_id] = {'variants': set(), 'genes': set()} # clean up the bad one if mgi_allele_id == 'multiple mutation': logger.error("Erroneous gene id: %s", mgi_allele_id) mgi_allele_id = '' if mgi_allele_id != '': self.strain_hash[strain_id]['variants'].add(mgi_allele_id) self.id_label_hash[mgi_allele_id] = mgi_allele_symbol # use the following if needing to add the # sequence alteration types # var_type = # self._get_variant_type_from_abbrev(mutation_type) # make a sequence alteration for this variant locus, # and link the variation type to it # sa_id = '_'+re.sub(r':','',mgi_allele_id)+'SA' # if self.nobnodes: # sa_id = ':'+sa_id # gu.addIndividualToGraph(g, sa_id, None, var_type) # geno.addSequenceAlterationToVariantLocus(sa_id, # mgi_allele_id) # scrub out any spaces mgi_gene_id = re.sub(r'\s+', '', mgi_gene_id) if mgi_gene_id.strip() != '': if re.match(r'Gene\s*ID:', mgi_gene_id, re.I): mgi_gene_id = re.sub(r'Gene\s*ID:\s*', 'NCBIGene:', mgi_gene_id) elif not re.match(r'MGI', mgi_gene_id): logger.info("Gene id not recognized: %s", mgi_gene_id) if re.match(r'\d+$', mgi_gene_id): # assume that if it's all numbers, then it's MGI mgi_gene_id = 'MGI:'+str(mgi_gene_id) logger.info("Assuming numerics are MGI.") self.strain_hash[strain_id]['genes'].add(mgi_gene_id) self.id_label_hash[mgi_gene_id] = mgi_gene_symbol # catch some errors - # some things have gene labels, but no identifiers - report if mgi_gene_symbol.strip() != '' and mgi_gene_id == '': logger.error( "Gene label with no identifier for strain %s: %s", strain_id, mgi_gene_symbol) genes_with_no_ids.add(mgi_gene_symbol.strip()) # make a temp id for genes that aren't identified # tmp_gene_id = '_'+mgi_gene_symbol # self.id_label_hash[tmp_gene_id] = mgi_gene_symbol # self.strain_hash[strain_id]['genes'].add(tmp_gene_id) # split apart the mp ids # ataxia [MP:0001393] ,hypoactivity [MP:0001402] ... # mp_ids are now a comma delimited list # with MP terms in brackets phenotype_ids = [] if mp_ids != '': for i in re.split(r',', mp_ids): i = i.strip() mps = re.search(r'\[(.*)\]', i) if mps is not None: mp_id = mps.group(1).strip() phenotype_ids.append(mp_id) # pubmed ids are space delimited pubmed_ids = [] if pubmed_nums.strip() != '': for i in re.split(r'\s+', pubmed_nums): pmid = 'PMID:'+i.strip() pubmed_ids.append(pmid) r = Reference(pmid, Reference.ref_types['journal_article']) r.addRefToGraph(g) # https://www.mmrrc.org/catalog/sds.php?mmrrc_id=00001 # is a good example of 4 genotype parts gu.addClassToGraph(g, mouse_taxon, None) if research_areas.strip() == '': research_areas = None else: research_areas = 'Research Areas: '+research_areas strain_type = mouse_taxon if strain_state == 'ES': strain_type = stem_cell_class gu.addIndividualToGraph( g, strain_id, strain_label, strain_type, research_areas) # an inst of mouse?? gu.makeLeader(g, strain_id) # phenotypes are associated with the alleles for pid in phenotype_ids: # assume the phenotype label is in the ontology gu.addClassToGraph(g, pid, None) if mgi_allele_id is not None and mgi_allele_id != '': assoc = G2PAssoc(self.name, mgi_allele_id, pid, gu.object_properties['has_phenotype']) for p in pubmed_ids: assoc.add_source(p) assoc.add_association_to_graph(g) else: logger.info("Phenotypes and no allele for %s", strain_id) if not self.testMode and ( limit is not None and line_counter > limit): break # now that we've collected all of the variant information, build it # we don't know their zygosities for s in self.strain_hash: h = self.strain_hash.get(s) variants = h['variants'] genes = h['genes'] vl_set = set() # make variant loci for each gene if len(variants) > 0: for v in variants: vl_id = v vl_symbol = self.id_label_hash[vl_id] geno.addAllele(vl_id, vl_symbol, geno.genoparts['variant_locus']) vl_set.add(vl_id) if len(variants) == 1 and len(genes) == 1: for gene in genes: geno.addAlleleOfGene(vl_id, gene) else: geno.addAllele(vl_id, vl_symbol) else: # len(vars) == 0 # it's just anonymous variants in some gene for gene in genes: vl_id = '_'+gene+'-VL' vl_id = re.sub(r':', '', vl_id) if self.nobnodes: vl_id = ':'+vl_id vl_symbol = self.id_label_hash[gene]+'<?>' self.id_label_hash[vl_id] = vl_symbol geno.addAllele(vl_id, vl_symbol, geno.genoparts['variant_locus']) geno.addGene(gene, self.id_label_hash[gene]) geno.addAlleleOfGene(vl_id, gene) vl_set.add(vl_id) # make the vslcs vl_list = sorted(vl_set) vslc_list = [] for vl in vl_list: # for unknown zygosity vslc_id = '_'+re.sub(r'^_', '', vl)+'U' vslc_id = re.sub(r':', '', vslc_id) if self.nobnodes: vslc_id = ':' + vslc_id vslc_label = self.id_label_hash[vl] + '/?' self.id_label_hash[vslc_id] = vslc_label vslc_list.append(vslc_id) geno.addPartsToVSLC( vslc_id, vl, None, geno.zygosity['indeterminate'], geno.object_properties['has_alternate_part'], None) gu.addIndividualToGraph( g, vslc_id, vslc_label, geno.genoparts['variant_single_locus_complement']) if len(vslc_list) > 0: if len(vslc_list) > 1: gvc_id = '-'.join(vslc_list) gvc_id = re.sub(r':', '', gvc_id) if self.nobnodes: gvc_id = ':'+gvc_id gvc_label = \ '; '.join(self.id_label_hash[v] for v in vslc_list) gu.addIndividualToGraph( g, gvc_id, gvc_label, geno.genoparts['genomic_variation_complement']) for vslc_id in vslc_list: geno.addVSLCtoParent(vslc_id, gvc_id) else: # the GVC == VSLC, so don't have to make an extra piece gvc_id = vslc_list.pop() gvc_label = self.id_label_hash[gvc_id] genotype_label = gvc_label + ' [n.s.]' bkgd_id = \ '_' + re.sub(r':', '', '-'.join( (geno.genoparts['unspecified_genomic_background'], s))) genotype_id = '-'.join((gvc_id, bkgd_id)) if self.nobnodes: bkgd_id = ':'+bkgd_id geno.addTaxon(mouse_taxon, bkgd_id) geno.addGenomicBackground( bkgd_id, 'unspecified ('+s+')', geno.genoparts['unspecified_genomic_background'], "A placeholder for the " + "unspecified genetic background for "+s) geno.addGenomicBackgroundToGenotype( bkgd_id, genotype_id, geno.genoparts['unspecified_genomic_background']) geno.addParts( gvc_id, genotype_id, geno.object_properties['has_alternate_part']) geno.addGenotype(genotype_id, genotype_label) gu.addTriple( g, s, geno.object_properties['has_genotype'], genotype_id) else: # logger.debug( # "Strain %s is not making a proper genotype.", s) pass gu.loadProperties( g, G2PAssoc.object_properties, G2PAssoc.OBJECTPROP) gu.loadProperties( g, G2PAssoc.datatype_properties, G2PAssoc.DATAPROP) gu.loadProperties( g, G2PAssoc.annotation_properties, G2PAssoc.ANNOTPROP) gu.loadAllProperties(g) logger.warning( "The following gene symbols did not list identifiers: %s", str(sorted(list(genes_with_no_ids)))) return
def _process_omim2gene(self, limit=None): """ This method maps the OMIM IDs and KEGG gene ID. Currently split based on the link_type field. Equivalent link types are mapped as gene XRefs. Reverse link types are mapped as disease to gene associations. Original link types are currently skipped. Triples created: <kegg_gene_id> is a Gene <omim_gene_id> is a Gene <kegg_gene_id>> hasXref <omim_gene_id> <assoc_id> has subject <omim_disease_id> <assoc_id> has object <kegg_gene_id> :param limit: :return: """ logger.info("Processing OMIM to KEGG gene") if self.testMode: g = self.testgraph else: g = self.graph line_counter = 0 geno = Genotype(g) gu = GraphUtils(curie_map.get()) raw = '/'.join((self.rawdir, self.files['omim2gene']['file'])) with open(raw, 'r', encoding="iso-8859-1") as csvfile: filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"') for row in filereader: line_counter += 1 (kegg_gene_id, omim_id, link_type) = row if self.testMode and kegg_gene_id not in self.test_ids['genes']: continue kegg_gene_id = 'KEGG-'+kegg_gene_id.strip() omim_id = re.sub('omim', 'OMIM', omim_id) if link_type == 'equivalent': # these are genes! so add them as a class then make equivalence gu.addClassToGraph(g, omim_id, None) geno.addGene(kegg_gene_id, None) gu.addEquivalentClass(g, kegg_gene_id, omim_id) elif link_type == 'reverse': # make an association between an OMIM ID and the KEGG gene ID # we do this with omim ids because they are more atomic than KEGG ids alt_locus_id = self._make_variant_locus_id(kegg_gene_id, omim_id) alt_label = self.label_hash[alt_locus_id] gu.addIndividualToGraph(g, alt_locus_id, alt_label, geno.genoparts['variant_locus']) geno.addAlleleOfGene(alt_locus_id, kegg_gene_id) # Add the disease to gene relationship. rel = gu.object_properties['is_marker_for'] assoc = G2PAssoc(self.name, alt_locus_id, omim_id, rel) assoc.add_association_to_graph(g) elif link_type == 'original': # these are sometimes a gene, and sometimes a disease logger.info('Unable to handle original link for %s-%s', kegg_gene_id, omim_id) else: # don't know what these are logger.warn('Unhandled link type for %s-%s: %s', kegg_gene_id, omim_id, link_type) if (not self.testMode) and (limit is not None and line_counter > limit): break logger.info("Done with OMIM to KEGG gene") gu.loadProperties(g, G2PAssoc.annotation_properties, G2PAssoc.ANNOTPROP) gu.loadProperties(g, G2PAssoc.datatype_properties, G2PAssoc.DATAPROP) gu.loadProperties(g, G2PAssoc.object_properties, G2PAssoc.OBJECTPROP) return
def _process_data(self, raw, limit=None): LOG.info("Processing Data from %s", raw) if self.test_mode: graph = self.testgraph else: graph = self.graph model = Model(graph) geno = Genotype(graph) # Add the taxon as a class taxon_id = self.globaltt['Mus musculus'] model.addClassToGraph(taxon_id, None) # with open(raw, 'r', encoding="utf8") as csvfile: col = self.files['all']['columns'] with gzip.open(raw, 'rt') as csvfile: reader = csv.reader(csvfile, delimiter=',', quotechar='\"') row = next(reader) # presumed header if not self.check_fileheader(col, row): pass for row in reader: # | head -1 | tr ',' '\n' | sed "s|\(.*\)|# \1 = row[col.index('\1')]|g" marker_accession_id = row[col.index('marker_accession_id')].strip() marker_symbol = row[col.index('marker_symbol')].strip() phenotyping_center = row[col.index('phenotyping_center')].strip() colony_raw = row[col.index('colony_id')].strip() sex = row[col.index('sex')].strip() zygosity = row[col.index('zygosity')].strip() allele_accession_id = row[col.index('allele_accession_id')].strip() allele_symbol = row[col.index('allele_symbol')].strip() # allele_name = row[col.index('allele_name')] strain_accession_id = row[col.index('strain_accession_id')].strip() strain_name = row[col.index('strain_name')].strip() # project_name = row[col.index('project_name')] project_fullname = row[col.index('project_fullname')].strip() pipeline_name = row[col.index('pipeline_name')].strip() pipeline_stable_id = row[col.index('pipeline_stable_id')].strip() procedure_stable_id = row[col.index('procedure_stable_id')].strip() procedure_name = row[col.index('procedure_name')].strip() parameter_stable_id = row[col.index('parameter_stable_id')].strip() parameter_name = row[col.index('parameter_name')].strip() # top_level_mp_term_id = row[col.index('top_level_mp_term_id')] # top_level_mp_term_name = row[col.index('top_level_mp_term_name')] mp_term_id = row[col.index('mp_term_id')].strip() mp_term_name = row[col.index('mp_term_name')].strip() p_value = row[col.index('p_value')].strip() percentage_change = row[col.index('percentage_change')].strip() effect_size = row[col.index('effect_size')].strip() statistical_method = row[col.index('statistical_method')].strip() resource_name = row[col.index('resource_name')].strip() if self.test_mode and marker_accession_id not in self.gene_ids: continue # ##### cleanup some of the identifiers ###### zygosity = zygosity.strip() zygosity_id = self.resolve(zygosity) if zygosity_id == zygosity: LOG.warning( "Zygosity '%s' unmapped. detting to indeterminate", zygosity) zygosity_id = self.globaltt['indeterminate'] # colony ids sometimes have <> in them, spaces, # or other non-alphanumerics and break our system; # replace these with underscores colony_id = '_:' + re.sub(r'\W+', '_', colony_raw) if not re.match(r'MGI', allele_accession_id): allele_accession_id = '_:IMPC-'+re.sub( r':', '', allele_accession_id) if re.search(r'EUROCURATE', strain_accession_id): # the eurocurate links don't resolve at IMPC # TODO blank nodes do not maintain identifiers strain_accession_id = '_:' + strain_accession_id elif not re.match(r'MGI', strain_accession_id): LOG.info( "Found a strange strain accession...%s", strain_accession_id) strain_accession_id = 'IMPC:'+strain_accession_id ###################### # first, add the marker and variant to the graph as with MGI, # the allele is the variant locus. IF the marker is not known, # we will call it a sequence alteration. otherwise, # we will create a BNode for the sequence alteration. sequence_alteration_id = variant_locus_id = None variant_locus_name = sequence_alteration_name = None # extract out what's within the <> to get the symbol if re.match(r'.*<.*>', allele_symbol): sequence_alteration_name = re.match( r'.*<(.*)>', allele_symbol) if sequence_alteration_name is not None: sequence_alteration_name = sequence_alteration_name.group(1) else: sequence_alteration_name = allele_symbol if marker_accession_id is not None and marker_accession_id == '': LOG.warning("Marker unspecified on row %d", reader.line_num) marker_accession_id = None if marker_accession_id is not None: variant_locus_id = allele_accession_id variant_locus_name = allele_symbol variant_locus_type = self.globaltt['variant_locus'] geno.addGene( marker_accession_id, marker_symbol, self.globaltt['gene']) geno.addAllele( variant_locus_id, variant_locus_name, variant_locus_type, None) geno.addAlleleOfGene(variant_locus_id, marker_accession_id) # TAG bnode sequence_alteration_id = '_:seqalt' + re.sub( r':', '', allele_accession_id) geno.addSequenceAlterationToVariantLocus( sequence_alteration_id, variant_locus_id) else: sequence_alteration_id = allele_accession_id # IMPC contains targeted mutations with either gene traps, # knockouts, insertion/intragenic deletions. # but I don't really know what the SeqAlt is here, # so I don't add it. geno.addSequenceAlteration( sequence_alteration_id, sequence_alteration_name) # ############# BUILD THE COLONY ############# # First, let's describe the colony that the animals come from # The Colony ID refers to the ES cell clone # used to generate a mouse strain. # Terry sez: we use this clone ID to track # ES cell -> mouse strain -> mouse phenotyping. # The same ES clone maybe used at multiple centers, # so we have to concatenate the two to have a unique ID. # some useful reading about generating mice from ES cells: # http://ki.mit.edu/sbc/escell/services/details # here, we'll make a genotype # that derives from an ES cell with a given allele. # the strain is not really attached to the colony. # the colony/clone is reflective of the allele, with unknown zygosity stem_cell_class = self.globaltt['embryonic stem cell line'] if colony_id is None: print(colony_raw, stem_cell_class, "\nline:\t", reader.line_num) model.addIndividualToGraph(colony_id, colony_raw, stem_cell_class) # vslc of the colony has unknown zygosity # note that we will define the allele # (and it's relationship to the marker, etc.) later # FIXME is it really necessary to create this vslc # when we always know it's unknown zygosity? vslc_colony = '_:'+re.sub( r':', '', allele_accession_id + self.globaltt['indeterminate']) vslc_colony_label = allele_symbol + '/<?>' # for ease of reading, we make the colony genotype variables. # in the future, it might be desired to keep the vslcs colony_genotype_id = vslc_colony colony_genotype_label = vslc_colony_label geno.addGenotype(colony_genotype_id, colony_genotype_label) geno.addParts( allele_accession_id, colony_genotype_id, self.globaltt['has_variant_part']) geno.addPartsToVSLC( vslc_colony, allele_accession_id, None, self.globaltt['indeterminate'], self.globaltt['has_variant_part']) graph.addTriple( colony_id, self.globaltt['has_genotype'], colony_genotype_id) # ########## BUILD THE ANNOTATED GENOTYPE ########## # now, we'll build the genotype of the individual that derives # from the colony/clone genotype that is attached to # phenotype = colony_id + strain + zygosity + sex # (and is derived from a colony) # this is a sex-agnostic genotype genotype_id = self.make_id( (colony_id + phenotyping_center + zygosity + strain_accession_id)) geno.addSequenceDerivesFrom(genotype_id, colony_id) # build the VSLC of the sex-agnostic genotype # based on the zygosity allele1_id = allele_accession_id allele2_id = allele2_rel = None allele1_label = allele_symbol allele2_label = '<?>' # Making VSLC labels from the various parts, # can change later if desired. if zygosity == 'heterozygote': allele2_label = re.sub(r'<.*', '<+>', allele1_label) allele2_id = None elif zygosity == 'homozygote': allele2_label = allele1_label allele2_id = allele1_id allele2_rel = self.globaltt['has_variant_part'] elif zygosity == 'hemizygote': allele2_label = re.sub(r'<.*', '<0>', allele1_label) allele2_id = None elif zygosity == 'not_applicable': allele2_label = re.sub(r'<.*', '<?>', allele1_label) allele2_id = None else: LOG.warning("found unknown zygosity %s", zygosity) break vslc_name = '/'.join((allele1_label, allele2_label)) # Add the VSLC vslc_id = '-'.join( (marker_accession_id, allele_accession_id, zygosity)) vslc_id = re.sub(r':', '', vslc_id) vslc_id = '_:'+vslc_id model.addIndividualToGraph( vslc_id, vslc_name, self.globaltt['variant single locus complement']) geno.addPartsToVSLC( vslc_id, allele1_id, allele2_id, zygosity_id, self.globaltt['has_variant_part'], allele2_rel) # add vslc to genotype geno.addVSLCtoParent(vslc_id, genotype_id) # note that the vslc is also the gvc model.addType(vslc_id, self.globaltt['genomic_variation_complement']) # Add the genomic background # create the genomic background id and name if strain_accession_id != '': genomic_background_id = strain_accession_id else: genomic_background_id = None genotype_name = vslc_name if genomic_background_id is not None: geno.addGenotype( genomic_background_id, strain_name, self.globaltt['genomic_background']) # make a phenotyping-center-specific strain # to use as the background pheno_center_strain_label = strain_name + '-' + phenotyping_center \ + '-' + colony_raw pheno_center_strain_id = '-'.join(( re.sub(r':', '', genomic_background_id), re.sub(r'\s', '_', phenotyping_center), re.sub(r'\W+', '', colony_raw))) if not re.match(r'^_', pheno_center_strain_id): # Tag bnode pheno_center_strain_id = '_:' + pheno_center_strain_id geno.addGenotype( pheno_center_strain_id, pheno_center_strain_label, self.globaltt['genomic_background']) geno.addSequenceDerivesFrom( pheno_center_strain_id, genomic_background_id) # Making genotype labels from the various parts, # can change later if desired. # since the genotype is reflective of the place # it got made, should put that in to disambiguate genotype_name = \ genotype_name + ' [' + pheno_center_strain_label + ']' geno.addGenomicBackgroundToGenotype( pheno_center_strain_id, genotype_id) geno.addTaxon(taxon_id, pheno_center_strain_id) # this is redundant, but i'll keep in in for now geno.addSequenceDerivesFrom(genotype_id, colony_id) geno.addGenotype(genotype_id, genotype_name) # Make the sex-qualified genotype, # which is what the phenotype is associated with sex_qualified_genotype_id = \ self.make_id(( colony_id + phenotyping_center + zygosity + strain_accession_id + sex)) sex_qualified_genotype_label = genotype_name + ' (' + sex + ')' sq_type_id = self.resolve(sex, False) if sq_type_id == sex: sq_type_id = self.globaltt['intrinsic_genotype'] LOG.warning( "Unknown sex qualifier %s, adding as intrinsic_genotype", sex) geno.addGenotype( sex_qualified_genotype_id, sex_qualified_genotype_label, sq_type_id) geno.addParts( genotype_id, sex_qualified_genotype_id, self.globaltt['has_variant_part']) if genomic_background_id is not None and genomic_background_id != '': # Add the taxon to the genomic_background_id geno.addTaxon(taxon_id, genomic_background_id) else: # add it as the genomic background geno.addTaxon(taxon_id, genotype_id) # ############# BUILD THE G2P ASSOC ############# # from an old email dated July 23 2014: # Phenotypes associations are made to # imits colony_id+center+zygosity+gender # sometimes phenotype ids are missing. (about 711 early 2020) if mp_term_id is None or mp_term_id == '': LOG.warning( "No phenotype id specified for row %d", reader.line_num) continue # hard coded ECO code eco_id = self.globaltt['mutant phenotype evidence'] # the association comes as a result of a g2p from # a procedure in a pipeline at a center and parameter tested assoc = G2PAssoc( graph, self.name, sex_qualified_genotype_id, mp_term_id) assoc.add_evidence(eco_id) # assoc.set_score(float(p_value)) # TODO add evidence instance using # pipeline_stable_id + # procedure_stable_id + # parameter_stable_id assoc.add_association_to_graph() assoc_id = assoc.get_association_id() model._addSexSpecificity(assoc_id, self.resolve(sex)) # add a free-text description try: description = ' '.join(( mp_term_name, 'phenotype determined by', phenotyping_center, 'in an', procedure_name, 'assay where', parameter_name.strip(), 'was measured with an effect_size of', str(round(float(effect_size), 5)), '(p =', "{:.4e}".format(float(p_value)), ').')) except ValueError: description = ' '.join(( mp_term_name, 'phenotype determined by', phenotyping_center, 'in an', procedure_name, 'assay where', parameter_name.strip(), 'was measured with an effect_size of', str(effect_size), '(p =', "{0}".format(p_value), ').')) study_bnode = self._add_study_provenance( phenotyping_center, colony_raw, project_fullname, pipeline_name, pipeline_stable_id, procedure_stable_id, procedure_name, parameter_stable_id, parameter_name, statistical_method, resource_name) evidence_line_bnode = self._add_evidence( assoc_id, eco_id, p_value, percentage_change, effect_size, study_bnode) self._add_assertion_provenance(assoc_id, evidence_line_bnode) model.addDescription(evidence_line_bnode, description) # resource_id = resource_name # assoc.addSource(graph, assoc_id, resource_id) if not self.test_mode and limit is not None and reader.line_num > limit: break
def _process_QTLs_genetic_location(self, raw, taxon_id, common_name, limit=None): """ This function processes Triples created: :param limit: :return: """ if self.testMode: g = self.testgraph else: g = self.graph line_counter = 0 geno = Genotype(g) gu = GraphUtils(curie_map.get()) eco_id = "ECO:0000061" # Quantitative Trait Analysis Evidence logger.info("Processing genetic location for %s", taxon_id) with open(raw, 'r', encoding="iso-8859-1") as csvfile: filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"') for row in filereader: line_counter += 1 (qtl_id, qtl_symbol, trait_name, assotype, empty, chromosome, position_cm, range_cm, flankmark_a2, flankmark_a1, peak_mark, flankmark_b1, flankmark_b2, exp_id, model, test_base, sig_level, lod_score, ls_mean, p_values, f_statistics, variance, bayes_value, likelihood_ratio, trait_id, dom_effect, add_effect, pubmed_id, gene_id, gene_id_src, gene_id_type, empty2) = row if self.testMode and int(qtl_id) not in self.test_ids: continue qtl_id = 'AQTL:'+qtl_id trait_id = 'AQTLTrait:'+trait_id # Add QTL to graph f = Feature(qtl_id, qtl_symbol, geno.genoparts['QTL']) f.addTaxonToFeature(g, taxon_id) # deal with the chromosome chrom_id = makeChromID(chromosome, taxon_id, 'CHR') # add a version of the chromosome which is defined as the genetic map build_id = 'MONARCH:'+common_name.strip()+'-linkage' build_label = common_name+' genetic map' geno.addReferenceGenome(build_id, build_label, taxon_id) chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH') geno.addChromosomeInstance(chromosome, build_id, build_label, chrom_id) start = stop = None if re.search('-', range_cm): range_parts = re.split('-', range_cm) # check for poorly formed ranges if len(range_parts) == 2 and range_parts[0] != '' and range_parts[1] != '': (start, stop) = [int(float(x.strip())) for x in re.split('-', range_cm)] else: logger.info("There's a cM range we can't handle for QTL %s: %s", qtl_id, range_cm) elif position_cm != '': start = stop = int(float(position_cm)) # FIXME remove converion to int for start/stop when schema can handle floats # add in the genetic location based on the range f.addFeatureStartLocation(start, chrom_in_build_id, None, [Feature.types['FuzzyPosition']]) f.addFeatureEndLocation(stop, chrom_in_build_id, None, [Feature.types['FuzzyPosition']]) f.addFeatureToGraph(g) # sometimes there's a peak marker, like a rsid. we want to add that as a variant of the gene, # and xref it to the qtl. dbsnp_id = None if peak_mark != '' and peak_mark != '.' and re.match('rs', peak_mark.strip()): dbsnp_id = 'dbSNP:'+peak_mark.strip() gu.addIndividualToGraph(g, dbsnp_id, None, geno.genoparts['sequence_alteration']) gu.addXref(g, qtl_id, dbsnp_id) if gene_id is not None and gene_id != '' and gene_id != '.': if gene_id_src == 'NCBIgene' or gene_id_src == '': # we assume if no src is provided, it's NCBI gene_id = 'NCBIGene:'+gene_id.strip() geno.addGene(gene_id, None) # we will expect that these labels provided elsewhere geno.addAlleleOfGene(qtl_id, gene_id, geno.object_properties['feature_to_gene_relation']) # FIXME what is the right relationship here? if dbsnp_id is not None: # add the rsid as a seq alt of the gene_id vl_id = '_' + re.sub(':', '', gene_id) + '-' + peak_mark if self.nobnodes: vl_id = ':' + vl_id geno.addSequenceAlterationToVariantLocus(dbsnp_id, vl_id) geno.addAlleleOfGene(vl_id, gene_id) # add the trait gu.addClassToGraph(g, trait_id, trait_name) # Add publication r = None if re.match('ISU.*', pubmed_id): pub_id = 'AQTLPub:'+pubmed_id.strip() r = Reference(pub_id) elif pubmed_id != '': pub_id = 'PMID:'+pubmed_id.strip() r = Reference(pub_id, Reference.ref_types['journal_article']) if r is not None: r.addRefToGraph(g) # make the association to the QTL assoc = G2PAssoc(self.name, qtl_id, trait_id, gu.object_properties['is_marker_for']) assoc.add_evidence(eco_id) assoc.add_source(pub_id) # create a description from the contents of the file # desc = '' # assoc.addDescription(g, assoc_id, desc) # TODO add exp_id as evidence # if exp_id != '': # exp_id = 'AQTLExp:'+exp_id # gu.addIndividualToGraph(g, exp_id, None, eco_id) if p_values != '': score = float(re.sub('<', '', p_values)) assoc.set_score(score) # todo add score type # TODO add LOD score? assoc.add_association_to_graph(g) # make the association to the dbsnp_id, if found if dbsnp_id is not None: # make the association to the dbsnp_id assoc = G2PAssoc(self.name, dbsnp_id, trait_id, gu.object_properties['is_marker_for']) assoc.add_evidence(eco_id) assoc.add_source(pub_id) # create a description from the contents of the file # desc = '' # assoc.addDescription(g, assoc_id, desc) # TODO add exp_id # if exp_id != '': # exp_id = 'AQTLExp:'+exp_id # gu.addIndividualToGraph(g, exp_id, None, eco_id) if p_values != '': score = float(re.sub('<', '', p_values)) assoc.set_score(score) # todo add score type # TODO add LOD score? assoc.add_association_to_graph(g) if not self.testMode and limit is not None and line_counter > limit: break logger.info("Done with QTL genetic info") return