def parse(self, limit=None):
    """
    Process the trait mappings and then, for each supported organism,
    whichever genomic (``_bp``) and genetic (``_cm``) QTL location files
    are configured in ``self.files``.

    :param limit: handed through to the per-file processors; when it is
        not None a notice is logged first
    :return: None
    """
    if limit is not None:
        logger.info("Only parsing first %s rows fo each file", str(limit))

    logger.info("Parsing files...")

    # test-only runs switch on test mode and write to the test graph
    if self.testOnly:
        self.testMode = True
    g = self.testgraph if self.testOnly else self.graph

    trait_map_path = '/'.join(
        (self.rawdir, self.files['trait_mappings']['file']))
    self._process_trait_mappings(trait_map_path, limit)

    geno = Genotype(g)

    for organism in (
            'chicken', 'pig', 'horse', 'rainbow_trout', 'sheep', 'cattle'):
        tax_id = self._get_tax_by_common_name(organism)
        geno.addGenome(tax_id, organism)

        # physical (base pair) locations; the build abbreviation is
        # embedded in the gff file name
        bp_key = organism + '_bp'
        if bp_key in self.files:
            bp_file = self.files[bp_key]['file']
            build_match = re.search(r'QTL_([\w\.]+)\.gff.txt.gz', bp_file)
            if build_match is None:
                logger.error("Can't match a gff build")
            else:
                build = build_match.group(1)
                build_id = self._map_build_by_abbrev(build)
                logger.info("Build = %s", build_id)
                geno.addReferenceGenome(build_id, build, tax_id)
                if build_id is not None:
                    self._process_QTLs_genomic_location(
                        '/'.join((self.rawdir, bp_file)),
                        tax_id, build_id, build, limit)

        # genetic (centimorgan) locations
        cm_key = organism + '_cm'
        if cm_key in self.files:
            cm_file = self.files[cm_key]['file']
            self._process_QTLs_genetic_location(
                '/'.join((self.rawdir, cm_file)), tax_id, organism, limit)

    logger.info("Finished parsing")

    self.load_bindings()

    logger.info("Found %d nodes", len(self.graph))
def parse(self, limit=None):
    """
    Run the trait-mapping step, then walk the list of supported organisms
    and process each one's configured genomic (``_bp``) and genetic
    (``_cm``) QTL files.

    :param limit: forwarded unchanged to the file processors; a notice is
        logged when it is not None
    :return: None
    """
    if limit is not None:
        logger.info("Only parsing first %s rows fo each file", str(limit))

    logger.info("Parsing files...")

    # choose the target graph; test-only runs also flip test mode on
    if not self.testOnly:
        target_graph = self.graph
    else:
        self.testMode = True
        target_graph = self.testgraph

    mapping_file = self.files['trait_mappings']['file']
    self._process_trait_mappings(
        '/'.join((self.rawdir, mapping_file)), limit)

    geno = Genotype(target_graph)
    common_names = [
        'chicken', 'pig', 'horse', 'rainbow_trout', 'sheep', 'cattle']

    for name in common_names:
        tax_id = self._get_tax_by_common_name(name)
        geno.addGenome(tax_id, name)

        genomic_key = name + '_bp'
        if genomic_key in self.files:
            gff_name = self.files[genomic_key]['file']
            found = re.search(r'QTL_([\w\.]+)\.gff.txt.gz', gff_name)
            if found is not None:
                # the captured group is the build abbreviation
                build = found.group(1)
                build_id = self._map_build_by_abbrev(build)
                logger.info("Build = %s", build_id)
                geno.addReferenceGenome(build_id, build, tax_id)
                if build_id is not None:
                    gff_path = '/'.join((self.rawdir, gff_name))
                    self._process_QTLs_genomic_location(
                        gff_path, tax_id, build_id, build, limit)
            else:
                logger.error("Can't match a gff build")

        genetic_key = name + '_cm'
        if genetic_key in self.files:
            cm_path = '/'.join(
                (self.rawdir, self.files[genetic_key]['file']))
            self._process_QTLs_genetic_location(
                cm_path, tax_id, name, limit)

    logger.info("Finished parsing")
def _create_genome_builds(self):
    """
    Various resources will map variations to either UCSC (hg*)
    or to NCBI assemblies. Here we create the equivalences between them.
    Data taken from:
    https://genome.ucsc.edu/FAQ/FAQreleases.html#release1

    For every species in ``self.species`` the taxon id is resolved and each
    key under ``self.files[<taxon number>]['assembly']`` is treated as a
    UCSC assembly curie.  The curie is looked up in ``self.localtt``; both
    identifiers are added as reference genomes of the taxon and declared
    the same individual.  Keys without a ':' or without a translation are
    logged and skipped.

    :return: None
    """
    # TODO add more species
    graph = self.graph
    geno = Genotype(graph)
    model = Model(graph)

    logger.info("Adding equivalent assembly identifiers")
    for sp in self.species:
        tax_id = self.resolve(sp)
        txid_num = tax_id.split(':')[1]
        for key in self.files[txid_num]['assembly']:
            ucsc_id = key
            try:
                ucsc_label = ucsc_id.split(':')[1]
            except IndexError:
                logger.error('%s Assembly id: "%s" is problematic', sp, key)
                continue

            if key not in self.localtt:
                logger.error(
                    '%s Assembly id: "%s" is not in local translation table',
                    sp, key)
                # without a translation there is nothing to equate; falling
                # through would use an unbound (or the previous key's)
                # mapped_id
                continue

            mapped_id = self.localtt[key]
            mapped_label = 'NCBI build ' + str(mapped_id.split(':')[1])

            geno.addReferenceGenome(ucsc_id, ucsc_label, tax_id)
            geno.addReferenceGenome(mapped_id, mapped_label, tax_id)
            model.addSameIndividual(ucsc_id, mapped_id)
def _create_genome_builds(self):
    """
    Various resources will map variations to either UCSC (hg*)
    or to NCBI assemblies. Here we create the equivalences between them.
    Data taken from:
    https://genome.ucsc.edu/FAQ/FAQreleases.html#release1

    For every species in ``self.species`` the taxon curie is read from
    ``self.globaltt`` and each key under
    ``self.files[<taxon number>]['assembly']`` is treated as a UCSC
    assembly curie.  The curie is looked up in ``self.localtt``; both
    identifiers are added as reference genomes of the taxon and declared
    the same individual.  Keys without a ':' or without a translation are
    logged and skipped.

    :return: None
    """
    # TODO add more species
    graph = self.graph
    geno = Genotype(graph)
    model = Model(graph)

    LOG.info("Adding equivalent assembly identifiers")
    for sp in self.species:
        tax_id = self.globaltt[sp]
        txid_num = tax_id.split(':')[1]
        for key in self.files[txid_num]['assembly']:
            ucsc_id = key
            try:
                ucsc_label = ucsc_id.split(':')[1]
            except IndexError:
                LOG.error('%s Assembly id: "%s" is problematic', sp, key)
                continue

            if key not in self.localtt:
                LOG.error(
                    '%s Assembly id: "%s" is not in local translation table',
                    sp, key)
                # without a translation there is nothing to equate; falling
                # through would use an unbound (or the previous key's)
                # mapped_id
                continue

            mapped_id = self.localtt[key]
            mapped_label = 'NCBI build ' + str(mapped_id.split(':')[1])

            geno.addReferenceGenome(ucsc_id, ucsc_label, tax_id)
            geno.addReferenceGenome(mapped_id, mapped_label, tax_id)
            model.addSameIndividual(ucsc_id, mapped_id)
def _get_chrbands(self, limit, taxon):
    """
    Read the gzipped chromosome-band file configured for ``taxon`` and add
    the genome, reference build, chromosomes, scaffolds and bands found in
    it to ``self.graph``.

    Works in two passes: the file is first folded into the ``mybands``
    dict (one entry per chromosome, scaffold, band and parent band, with
    the min/max extent of grouping bands widened as their children are
    seen), then every entry of that dict is written to the graph as a
    located feature.

    :param limit: currently unused (see the TODO below)
    :param taxon: key into ``self.files``; also appended to 'NCBITaxon:'
        to form the taxon id
    :return: None
    """
    model = Model(self.graph)
    # TODO PYLINT figure out what limit was for and why it is unused
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
    logger.info("Processing Chr bands from FILE: %s", myfile)
    geno = Genotype(self.graph)
    monochrom = Monochrom(self.graph_type, self.are_bnodes_skized)

    # used to hold band definitions for a chr
    # in order to compute extent of encompasing bands

    mybands = {}
    # build the organism's genome from the taxon
    genome_label = self.files[taxon]['genome_label']
    taxon_id = 'NCBITaxon:'+taxon

    # add the taxon as a class.  adding the class label elsewhere
    model.addClassToGraph(taxon_id, None)
    model.addSynonym(taxon_id, genome_label)

    geno.addGenome(taxon_id, genome_label)

    # add the build and the taxon it's in
    build_num = self.files[taxon]['build_num']
    build_id = 'UCSC:'+build_num
    geno.addReferenceGenome(build_id, build_num, taxon_id)

    # process the bands
    with gzip.open(myfile, 'rb') as f:
        for line in f:
            # skip comments
            line = line.decode().strip()
            if re.match('^#', line):
                continue

            # chr13	4500000	10000000	p12	stalk
            # exactly five tab-separated fields are expected; any other
            # count makes this unpacking raise ValueError
            (scaffold, start, stop, band_num, rtype) = line.split('\t')
            line_counter += 1

            # NOTE some less-finished genomes have
            # placed and unplaced scaffolds
            # * Placed scaffolds:
            #       the scaffolds have been placed within a chromosome.
            # * Unlocalized scaffolds:
            #       although the chromosome within which the scaffold occurs
            #       is known, the scaffold's position or orientation
            #       is not known.
            # * Unplaced scaffolds:
            #       it is not known which chromosome the scaffold belongs to
            #
            # find out if the thing is a full on chromosome, or a scaffold:
            # ex: unlocalized scaffold: chr10_KL568008v1_random
            # ex: unplaced scaffold: chrUn_AABR07022428v1
            placed_scaffold_pattern = r'(chr(?:\d+|X|Y|Z|W|M))'
            unlocalized_scaffold_pattern = \
                placed_scaffold_pattern+r'_(\w+)_random'
            unplaced_scaffold_pattern = r'chr(Un(?:_\w+)?)'

            m = re.match(placed_scaffold_pattern+r'$', scaffold)
            if m is not None and len(m.groups()) == 1:
                # the chromosome is the first match of the pattern
                chrom_num = m.group(1)
            else:
                # skip over anything that isn't a placed_scaffold
                # at the class level
                logger.info("Found non-placed chromosome %s", scaffold)
                chrom_num = None

            m_chr_unloc = re.match(unlocalized_scaffold_pattern, scaffold)
            m_chr_unplaced = re.match(unplaced_scaffold_pattern, scaffold)

            scaffold_num = None
            if m:
                # a placed chromosome: nothing scaffold-specific to record
                pass
            elif m_chr_unloc is not None and\
                    len(m_chr_unloc.groups()) == 2:
                # unlocalized: the parent chromosome is known
                chrom_num = m_chr_unloc.group(1)
                scaffold_num = chrom_num+'_'+m_chr_unloc.group(2)
            elif m_chr_unplaced is not None and\
                    len(m_chr_unplaced.groups()) == 1:
                # unplaced: chrom_num stays None
                scaffold_num = m_chr_unplaced.group(1)
            else:
                logger.error(
                    "There's a chr pattern that we aren't matching: %s",
                    scaffold)

            if chrom_num is not None:
                # the chrom class (generic) id
                chrom_class_id = makeChromID(chrom_num, taxon, 'CHR')

                # first, add the chromosome class (in the taxon)
                geno.addChromosomeClass(
                    chrom_num, taxon_id, self.files[taxon]['genome_label'])

                # then, add the chromosome instance (from the given build)
                geno.addChromosomeInstance(chrom_num, build_id, build_num,
                                           chrom_class_id)

                # add the chr to the hashmap of coordinates for this build
                # the chromosome coordinate space is itself
                if chrom_num not in mybands.keys():
                    mybands[chrom_num] = {
                        'min': 0,
                        'max': int(stop),
                        'chr': chrom_num,
                        'ref': build_id,
                        'parent': None,
                        'stain': None,
                        'type': Feature.types['chromosome']}

            if scaffold_num is not None:
                # this will put the coordinates of the scaffold
                # in the scaffold-space and make sure that the scaffold
                # is part of the correct parent.
                # if chrom_num is None,
                # then it will attach it to the genome,
                # just like a reg chrom
                # NOTE(review): 'min'/'max' are stored here as the raw
                # strings from the file, not ints as for chromosomes
                mybands[scaffold_num] = {
                    'min': start,
                    'max': stop,
                    'chr': scaffold_num,
                    'ref': build_id,
                    'parent': chrom_num,
                    'stain': None,
                    'type': Feature.types['assembly_component'],
                    'synonym': scaffold}

            if band_num is not None and band_num.strip() != '':
                # add the specific band
                # NOTE(review): chrom_num+band_num raises TypeError if a
                # band name is present while chrom_num is None
                mybands[chrom_num+band_num] = {'min': start,
                                               'max': stop,
                                               'chr': chrom_num,
                                               'ref': build_id,
                                               'parent': None,
                                               'stain': None,
                                               'type': None}

                # add the staining intensity of the band
                if re.match(r'g(neg|pos|var)', rtype):
                    mybands[chrom_num+band_num]['stain'] = \
                        Feature.types.get(rtype)

                # get the parent bands, and make them unique
                parents = list(
                    monochrom.make_parent_bands(band_num, set()))
                # alphabetical sort will put them in smallest to biggest,
                # so we reverse
                parents.sort(reverse=True)
                # print('parents of',chrom,band,':',parents)

                if len(parents) > 0:
                    mybands[chrom_num+band_num]['parent'] = \
                        chrom_num+parents[0]
            else:
                # TODO PYLINT why is 'parent'
                # a list() a couple of lines up and a set() here?
                parents = set()

            # loop through the parents and add them to the hash
            # add the parents to the graph, in hierarchical order
            # TODO PYLINT Consider using enumerate
            # instead of iterating with range and len
            for i in range(len(parents)):
                rti = getChrPartTypeByNotation(parents[i])

                pnum = chrom_num+parents[i]
                sta = int(start)
                sto = int(stop)
                if pnum not in mybands.keys():
                    # add the parental band to the hash
                    b = {'min': min(sta, sto),
                         'max': max(sta, sto),
                         'chr': chrom_num,
                         'ref': build_id,
                         'parent': None,
                         'stain': None,
                         'type': rti}
                    mybands[pnum] = b
                else:
                    # band already in the hash means it's a grouping band
                    # need to update the min/max coords
                    b = mybands.get(pnum)
                    b['min'] = min(sta, sto, b['min'])
                    b['max'] = max(sta, sto, b['max'])
                    mybands[pnum] = b

                    # also, set the max for the chrom
                    c = mybands.get(chrom_num)
                    c['max'] = max(sta, sto, c['max'])
                    mybands[chrom_num] = c

                # add the parent relationships to each
                if i < len(parents) - 1:
                    mybands[pnum]['parent'] = chrom_num+parents[i+1]
                else:
                    # add the last one (p or q usually)
                    # as attached to the chromosome
                    mybands[pnum]['parent'] = chrom_num

        # redundant: leaving the with-block closes the file as well
        f.close()  # end looping through file

    # loop through the hash and add the bands to the graph
    for b in mybands.keys():
        myband = mybands.get(b)
        band_class_id = makeChromID(b, taxon, 'CHR')
        band_class_label = makeChromLabel(b, genome_label)
        band_build_id = makeChromID(b, build_num, 'MONARCH')
        band_build_label = makeChromLabel(b, build_num)
        # the build-specific chrom
        chrom_in_build_id = makeChromID(
            myband['chr'], build_num, 'MONARCH')
        # if it's != part, then add the class
        if myband['type'] != Feature.types['assembly_component']:
            model.addClassToGraph(band_class_id, band_class_label,
                                  myband['type'])
            bfeature = Feature(self.graph, band_build_id, band_build_label,
                               band_class_id)
        else:
            # scaffolds are typed directly; no per-taxon class is added
            bfeature = Feature(self.graph, band_build_id, band_build_label,
                               myband['type'])
            if 'synonym' in myband:
                model.addSynonym(band_build_id, myband['synonym'])

        if myband['parent'] is None:
            if myband['type'] == Feature.types['assembly_component']:
                # since we likely don't know the chr,
                # add it as a part of the build
                geno.addParts(band_build_id, build_id)
        elif myband['type'] == Feature.types['assembly_component']:
            # geno.addParts(band_build_id, chrom_in_build_id)
            parent_chrom_in_build = makeChromID(myband['parent'],
                                                build_num, 'MONARCH')
            bfeature.addSubsequenceOfFeature(parent_chrom_in_build)

        # add the band as a feature
        # (which also instantiates the owl:Individual)
        bfeature.addFeatureStartLocation(myband['min'], chrom_in_build_id)
        bfeature.addFeatureEndLocation(myband['max'], chrom_in_build_id)
        if 'stain' in myband and myband['stain'] is not None:
            # TODO 'has_staining_intensity' being dropped by MB
            bfeature.addFeatureProperty(
                Feature.properties['has_staining_intensity'],
                myband['stain'])

        # type the band as a faldo:Region directly (add_region=False)
        # bfeature.setNoBNodes(self.nobnodes)
        # to come when we merge in ZFIN.py
        bfeature.addFeatureToGraph(False)

    return
def _process_qtls_genetic_location(
        self, raw, txid, common_name, limit=None):
    """
    Read a tab-separated QTL file with genetic (cM) locations and add each
    QTL to the graph.

    For every row the QTL is added as a feature on a per-animal
    "genetic map" build, optionally cross-referenced to a dbSNP peak
    marker and linked to an NCBI gene, and associated with its trait
    (with the publication as source and the p-value as score, when given).

    :param raw: path of the file to read (opened as iso-8859-1)
    :param txid: taxon number, appended to 'NCBITaxon:'
    :param common_name: animal name; ``self.files[common_name + '_cm']``
        supplies the trait curie prefix
    :param limit: outside test mode, stop once more than this many rows
        have been read
    :return: None
    :raises ValueError: when a row does not have exactly 32 columns
    """

    def score_from(p_value):
        """Return the p-value as a float, or None when it is not a number."""
        if p_value == '':
            return None
        # drop a leading '<' and accept a decimal comma
        # (international notation)
        text = p_value.replace('<', '').replace(',', '.')
        try:
            # str.isnumeric() is False for anything containing '.',
            # so it cannot be used to recognise decimal p-values
            return float(text)
        except ValueError:
            return None

    aql_curie = self.files[common_name + '_cm']['curie']

    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    line_counter = 0
    geno = Genotype(graph)
    model = Model(graph)
    eco_id = self.globaltt['quantitative trait analysis evidence']
    taxon_curie = 'NCBITaxon:' + txid

    LOG.info("Processing genetic location for %s from %s", taxon_curie, raw)
    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            (qtl_id,
             qtl_symbol,
             trait_name,
             assotype,
             empty,
             chromosome,
             position_cm,
             range_cm,
             flankmark_a2,
             flankmark_a1,
             peak_mark,
             flankmark_b1,
             flankmark_b2,
             exp_id,
             model_id,
             test_base,
             sig_level,
             lod_score,
             ls_mean,
             p_values,
             f_statistics,
             variance,
             bayes_value,
             likelihood_ratio,
             trait_id,
             dom_effect,
             add_effect,
             pubmed_id,
             gene_id,
             gene_id_src,
             gene_id_type,
             empty2) = row

            if self.test_mode and int(qtl_id) not in self.test_ids:
                continue

            qtl_id = common_name + 'QTL:' + qtl_id.strip()
            trait_id = ':'.join((aql_curie, trait_id.strip()))

            # Add QTL to graph
            feature = Feature(graph, qtl_id, qtl_symbol, self.globaltt['QTL'])
            feature.addTaxonToFeature(taxon_curie)

            # deal with the chromosome
            chrom_id = makeChromID(chromosome, taxon_curie, 'CHR')

            # add a version of the chromosome which is defined as
            # the genetic map
            build_id = 'MONARCH:'+common_name.strip()+'-linkage'
            build_label = common_name+' genetic map'
            geno.addReferenceGenome(build_id, build_label, taxon_curie)
            chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
            geno.addChromosomeInstance(
                chromosome, build_id, build_label, chrom_id)
            start = stop = None
            # range_cm sometimes ends in "(Mb)"  (i.e pig 2016 Nov)
            range_mb = re.split(r'\(', range_cm)
            if range_mb is not None:
                range_cm = range_mb[0]

            if re.search(r'[0-9].*-.*[0-9]', range_cm):
                range_parts = re.split(r'-', range_cm)

                # check for poorly formed ranges
                if len(range_parts) == 2 and\
                        range_parts[0] != '' and range_parts[1] != '':
                    (start, stop) = [
                        int(float(x.strip())) for x in range_parts]
                else:
                    LOG.info(
                        "A cM range we can't handle for QTL %s: %s",
                        qtl_id, range_cm)
            elif position_cm != '':
                match = re.match(r'([0-9]*\.[0-9]*)', position_cm)
                if match is not None:
                    position_cm = match.group()
                    try:
                        start = stop = int(float(position_cm))
                    except ValueError:
                        # the pattern also matches a bare '.', which is
                        # not a number; leave the location unset
                        LOG.info(
                            "A cM position we can't handle for QTL %s: %s",
                            qtl_id, position_cm)

            # FIXME remove converion to int for start/stop
            # when schema can handle floats add in the genetic location
            # based on the range
            feature.addFeatureStartLocation(
                start, chrom_in_build_id, None,
                [self.globaltt['FuzzyPosition']])
            feature.addFeatureEndLocation(
                stop, chrom_in_build_id, None,
                [self.globaltt['FuzzyPosition']])
            feature.addFeatureToGraph()

            # sometimes there's a peak marker, like a rsid.
            # we want to add that as a variant of the gene,
            # and xref it to the qtl.
            dbsnp_id = None
            if peak_mark != '' and peak_mark != '.' and \
                    re.match(r'rs', peak_mark.strip()):
                dbsnp_id = 'dbSNP:'+peak_mark.strip()

                model.addIndividualToGraph(
                    dbsnp_id, None, self.globaltt['sequence_alteration'])
                model.addXref(qtl_id, dbsnp_id)

            gene_id = gene_id.replace('uncharacterized ', '').strip()
            if gene_id is not None and gene_id != '' and gene_id != '.'\
                    and re.fullmatch(r'[^ ]*', gene_id) is not None:

                # we assume if no src is provided and gene_id is an integer,
                # then it is an NCBI gene ...
                # (okay, lets crank that back a notch)
                if gene_id_src == '' and gene_id.isdigit() and \
                        gene_id in self.gene_info:
                    # LOG.info(
                    #    'Warm & Fuzzy saying %s is a NCBI gene for %s',
                    #    gene_id, common_name)
                    gene_id_src = 'NCBIgene'
                elif gene_id_src == '' and gene_id.isdigit():
                    LOG.warning(
                        'Cold & Prickely saying %s is a NCBI gene for %s',
                        gene_id, common_name)
                    gene_id_src = 'NCBIgene'
                elif gene_id_src == '':
                    LOG.error(
                        ' "%s" is a NOT NCBI gene for %s',
                        gene_id, common_name)
                    gene_id_src = None

                if gene_id_src == 'NCBIgene':
                    gene_id = 'NCBIGene:' + gene_id
                    # we will expect that these will get labels elsewhere
                    geno.addGene(gene_id, None)
                    # FIXME what is the right relationship here?
                    geno.addAffectedLocus(qtl_id, gene_id)

                    if dbsnp_id is not None:
                        # add the rsid as a seq alt of the gene_id
                        vl_id = '_:' + re.sub(
                            r':', '', gene_id) + '-' + peak_mark.strip()
                        geno.addSequenceAlterationToVariantLocus(
                            dbsnp_id, vl_id)
                        geno.addAffectedLocus(vl_id, gene_id)

            # add the trait
            model.addClassToGraph(trait_id, trait_name)

            # Add publication
            # pub_id is reset for every row so that a row without a
            # publication neither fails on an unbound name nor inherits
            # the previous row's source
            pub_id = None
            reference = None
            if re.match(r'ISU.*', pubmed_id):
                pub_id = 'AQTLPub:'+pubmed_id.strip()
                reference = Reference(graph, pub_id)
            elif pubmed_id != '':
                pub_id = 'PMID:' + pubmed_id.strip()
                reference = Reference(
                    graph, pub_id, self.globaltt['journal article'])

            if reference is not None:
                reference.addRefToGraph()

            # the same score applies to the QTL and the dbSNP association
            score = score_from(p_values)

            # make the association to the QTL
            assoc = G2PAssoc(
                graph, self.name, qtl_id, trait_id,
                self.globaltt['is marker for'])
            assoc.add_evidence(eco_id)
            if pub_id is not None:
                assoc.add_source(pub_id)

            # create a description from the contents of the file
            # desc = ''

            # assoc.addDescription(g, assoc_id, desc)

            # TODO add exp_id as evidence
            # if exp_id != '':
            #     exp_id = 'AQTLExp:'+exp_id
            #     gu.addIndividualToGraph(g, exp_id, None, eco_id)

            if score is not None:
                assoc.set_score(score)  # todo add score type
            # TODO add LOD score?
            assoc.add_association_to_graph()

            # make the association to the dbsnp_id, if found
            if dbsnp_id is not None:
                # make the association to the dbsnp_id
                assoc = G2PAssoc(
                    graph, self.name, dbsnp_id, trait_id,
                    self.globaltt['is marker for'])
                assoc.add_evidence(eco_id)
                if pub_id is not None:
                    assoc.add_source(pub_id)

                # create a description from the contents of the file
                # desc = ''
                # assoc.addDescription(g, assoc_id, desc)

                # TODO add exp_id
                # if exp_id != '':
                #     exp_id = 'AQTLExp:'+exp_id
                #     gu.addIndividualToGraph(g, exp_id, None, eco_id)

                if score is not None:
                    assoc.set_score(score)  # todo add score type
                # TODO add LOD score?
                assoc.add_association_to_graph()

            if not self.test_mode and limit is not None \
                    and line_counter > limit:
                break

    LOG.info("Done with QTL genetic info")
    return
def parse(self, limit=None):
    """
    Process the trait mappings, then for each supported animal load the
    gene identifiers from its gzipped gene_info file into
    ``self.gene_info`` and process whichever genomic (``_bp``) and genetic
    (``_cm``) QTL location files are configured in ``self.files``.

    :param limit: forwarded to the per-file processors; a notice is
        logged when it is not None
    :return: None
    """
    if limit is not None:
        LOG.info("Only parsing first %s rows fo each file", str(limit))

    LOG.info("Parsing files...")

    if self.test_only:
        self.test_mode = True
        graph = self.testgraph
    else:
        graph = self.graph

    traitmap = '/'.join((self.rawdir, self.files['trait_mappings']['file']))
    self._process_trait_mappings(traitmap, limit)

    geno = Genotype(graph)
    animals = ['chicken', 'pig', 'horse', 'rainbow_trout', 'sheep', 'cattle']

    for common_name in animals:
        txid_num = self.resolve(common_name).split(':')[1]
        taxon_label = self.localtt[common_name]
        taxon_curie = self.globaltt[taxon_label]
        taxon_num = taxon_curie.split(':')[1]
        txid_num = taxon_num  # for now
        taxon_word = taxon_label.replace(' ', '_')
        gene_info_file = '/'.join((
            self.rawdir, self.files[taxon_word + '_info']['file']))
        self.gene_info = list()
        LOG.info('Ingesting %s', gene_info_file)
        with gzip.open(gene_info_file, 'rt') as gi_gz:
            filereader = csv.reader(gi_gz, delimiter='\t')
            for row in filereader:
                # csv yields [] for blank lines; skip those and comments
                # rather than failing on row[0][0]
                if not row or row[0].startswith('#'):
                    continue
                # only the second column is kept
                # (tossing lots of good stuff)
                self.gene_info.append(str(row[1]))
        LOG.info(
            'Gene Info for %s has %i enteries',
            common_name, len(self.gene_info))
        # LOG.info('Gene Info entery looks like %s', self.gene_info[5])

        # both are reset per animal: without this a failed build match
        # would hit an unbound build_id (first animal) or silently reuse
        # the previous animal's build
        build = None
        build_id = None

        fname_bp = common_name + '_bp'
        if fname_bp in self.files:
            bpfile = self.files[fname_bp]['file']
            mch = re.search(r'QTL_([\w\.]+)\.gff.txt.gz', bpfile)
            if mch is None:
                LOG.error("Can't match a gff build to %s", fname_bp)
            else:
                build = mch.group(1)
                build_id = self.localtt[build]
                LOG.info("Build UCSC label is: %s", build_id)

                # NCBI assembly curie is

                geno.addReferenceGenome(build_id, build, txid_num)

            if build_id is not None:
                self._process_qtls_genomic_location(
                    '/'.join((self.rawdir, bpfile)), txid_num, build_id,
                    build, common_name, limit)

        fname_cm = common_name + '_cm'
        if fname_cm in self.files:
            cmfile = self.files[fname_cm]['file']
            self._process_qtls_genetic_location(
                '/'.join((self.rawdir, cmfile)), txid_num, common_name,
                limit)

    LOG.info("Finished parsing")
    return
def _process_qtls_genetic_location(
        self, raw, src_key, txid, common_name, limit=None):
    """
    Read a tab-separated QTL file with genetic (cM) locations and add each
    QTL to the graph.

    For every well-formed row the QTL is added as a feature on a
    per-animal "genetic map" build, optionally cross-referenced to a dbSNP
    peak marker and linked to an NCBI gene, and associated with its trait
    (with the publication as source and the p-value as score, when given).

    :param raw: path of the file to read (opened as iso-8859-1)
    :param src_key: key into ``self.files`` supplying the 'curie' prefix
        and the expected 'columns' list
    :param txid: taxon number, appended to 'NCBITaxon:'
    :param common_name: animal name used in the QTL and build identifiers
    :param limit: outside test mode, stop once the reader has passed this
        many lines
    :return: None
    :raises ValueError: when an expected column name is not listed in
        ``self.files[src_key]['columns']``
    """

    def score_from(p_value):
        """Return the p-value as a float, or None when it is not a number."""
        if p_value == '':
            return None
        # drop a leading '<' and accept a decimal comma
        # (international notation)
        text = p_value.replace('<', '').replace(',', '.')
        try:
            # str.isnumeric() is False for anything containing '.',
            # so it cannot be used to recognise decimal p-values
            return float(text)
        except ValueError:
            return None

    aql_curie = self.files[src_key]['curie']
    common_name = common_name.strip()
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    geno = Genotype(graph)
    model = Model(graph)
    eco_id = self.globaltt['quantitative trait analysis evidence']
    taxon_curie = 'NCBITaxon:' + txid

    LOG.info("Processing genetic location for %s from %s", taxon_curie, raw)
    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        # no header in these files, so no header checking
        col = self.files[src_key]['columns']
        col_len = len(col)

        # column positions do not change between rows: look them up once
        qtl_id_idx = col.index('QTL_ID')
        qtl_symbol_idx = col.index('QTL_symbol')
        trait_name_idx = col.index('Trait_name')
        chromosome_idx = col.index('Chromosome')
        position_cm_idx = col.index('Position_cm')
        range_cm_idx = col.index('range_cm')
        peak_mark_idx = col.index('Peak_Mark')
        p_values_idx = col.index('P_values')
        trait_id_idx = col.index('TRAIT_ID')
        pubmed_id_idx = col.index('PUBMED_ID')
        gene_id_idx = col.index('geneID')
        gene_id_src_idx = col.index('geneIDsrc')
        # columns present in the file but not used here:
        #   assotype, FlankMark_A2, FlankMark_A1, FlankMark_B1,
        #   FlankMark_B2, Exp_ID, Model, testbase, siglevel, LOD_score,
        #   LS_mean, F_Statistics, VARIANCE, Bayes_value, LikelihoodR,
        #   Dom_effect, Add_effect, geneIDtype

        for row in reader:
            # only rows with non-empty surplus columns are rejected
            if len(row) != col_len and ''.join(row[col_len:]) != '':
                LOG.warning(
                    "Problem parsing %s line %i containing: \n%s\n"
                    "got %i cols but expected %i",
                    raw, reader.line_num, row, len(row), col_len)
                # LOG.info(row)
                continue

            qtl_id = row[qtl_id_idx].strip()
            qtl_symbol = row[qtl_symbol_idx].strip()
            trait_name = row[trait_name_idx].strip()
            chromosome = row[chromosome_idx].strip()
            position_cm = row[position_cm_idx].strip()
            range_cm = row[range_cm_idx].strip()
            peak_mark = row[peak_mark_idx].strip()
            p_values = row[p_values_idx].strip()
            trait_id = row[trait_id_idx].strip()
            pubmed_id = row[pubmed_id_idx].strip()
            gene_id = row[gene_id_idx].strip()
            gene_id_src = row[gene_id_src_idx].strip()

            if self.test_mode and int(qtl_id) not in self.test_ids:
                continue

            qtl_id = common_name + 'QTL:' + qtl_id.strip()
            trait_id = ':'.join((aql_curie, trait_id.strip()))

            # Add QTL to graph
            feature = Feature(graph, qtl_id, qtl_symbol, self.globaltt['QTL'])
            feature.addTaxonToFeature(taxon_curie)

            # deal with the chromosome
            chrom_id = makeChromID(chromosome, taxon_curie, 'CHR')

            # add a version of the chromosome which is defined as
            # the genetic map
            build_id = 'MONARCH:' + common_name + '-linkage'
            build_label = common_name + ' genetic map'
            geno.addReferenceGenome(build_id, build_label, taxon_curie)
            chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
            geno.addChromosomeInstance(
                chromosome, build_id, build_label, chrom_id)
            start = stop = None
            # range_cm sometimes ends in "(Mb)"  (i.e pig 2016 Nov)
            range_mb = re.split(r'\(', range_cm)
            if range_mb is not None:
                range_cm = range_mb[0]

            if re.search(r'[0-9].*-.*[0-9]', range_cm):
                range_parts = re.split(r'-', range_cm)

                # check for poorly formed ranges
                if len(range_parts) == 2 and\
                        range_parts[0] != '' and range_parts[1] != '':
                    (start, stop) = [
                        int(float(x.strip())) for x in range_parts]
                else:
                    LOG.info(
                        "A cM range we can't handle for QTL %s: %s",
                        qtl_id, range_cm)
            elif position_cm != '':
                match = re.match(r'([0-9]*\.[0-9]*)', position_cm)
                if match is not None:
                    position_cm = match.group()
                    try:
                        start = stop = int(float(position_cm))
                    except ValueError:
                        # the pattern also matches a bare '.', which is
                        # not a number; leave the location unset
                        LOG.info(
                            "A cM position we can't handle for QTL %s: %s",
                            qtl_id, position_cm)

            # FIXME remove converion to int for start/stop
            # when schema can handle floats add in the genetic location
            # based on the range
            feature.addFeatureStartLocation(
                start, chrom_in_build_id, None,
                [self.globaltt['FuzzyPosition']])
            feature.addFeatureEndLocation(
                stop, chrom_in_build_id, None,
                [self.globaltt['FuzzyPosition']])
            feature.addFeatureToGraph()

            # sometimes there's a peak marker, like a rsid.
            # we want to add that as a variant of the gene,
            # and xref it to the qtl.
            dbsnp_id = None
            if peak_mark != '' and peak_mark != '.' and \
                    re.match(r'rs', peak_mark.strip()):
                dbsnp_id = 'dbSNP:' + peak_mark.strip()

                model.addIndividualToGraph(
                    dbsnp_id, None, self.globaltt['sequence_alteration'])

                model.addXref(
                    qtl_id, dbsnp_id,
                    xref_category=blv.terms['SequenceVariant'])

            gene_id = gene_id.replace('uncharacterized ', '').strip()
            gene_id = gene_id.strip(',')  # for "100157483," in pig_QTLdata.txt
            if gene_id is not None and gene_id != '' and gene_id != '.'\
                    and re.fullmatch(r'[^ ]*', gene_id) is not None:

                # we assume if no src is provided and gene_id is an integer,
                # then it is an NCBI gene ...
                # (okay, lets crank that back a notch)
                if gene_id_src == '' and gene_id.isdigit() and \
                        gene_id in self.gene_info:
                    # LOG.info(
                    #    'Warm & Fuzzy saying %s is a NCBI gene for %s',
                    #    gene_id, common_name)
                    gene_id_src = 'NCBIgene'
                elif gene_id_src == '' and gene_id.isdigit():
                    LOG.warning(
                        'Cold & Prickely saying %s is a NCBI gene for %s',
                        gene_id, common_name)
                    gene_id_src = 'NCBIgene'
                elif gene_id_src == '':
                    LOG.error(
                        ' "%s" is a NOT NCBI gene for %s',
                        gene_id, common_name)
                    gene_id_src = None

                if gene_id_src == 'NCBIgene':
                    gene_id = 'NCBIGene:' + gene_id
                    # we will expect that these will get labels elsewhere
                    geno.addGene(gene_id, None)
                    # FIXME what is the right relationship here?
                    geno.addAffectedLocus(qtl_id, gene_id)

                    if dbsnp_id is not None:
                        # add the rsid as a seq alt of the gene_id
                        # as a bnode
                        vl_id = self.make_id(re.sub(
                            r':', '', gene_id) + '-' + peak_mark.strip(),
                            '_')
                        geno.addSequenceAlterationToVariantLocus(
                            dbsnp_id, vl_id)
                        geno.addAffectedLocus(vl_id, gene_id)

            # add the trait
            model.addClassToGraph(
                trait_id, trait_name,
                class_category=blv.terms['PhenotypicFeature'])

            # Add publication
            # pub_id is reset for every row so that a row without a
            # publication neither fails on an unbound name nor inherits
            # the previous row's source
            pub_id = None
            reference = None
            if re.match(r'ISU.*', pubmed_id):
                pub_id = 'AQTLPub:' + pubmed_id.strip()
                reference = Reference(graph, pub_id)
            elif pubmed_id != '':
                pub_id = 'PMID:' + pubmed_id.strip()
                reference = Reference(
                    graph, pub_id, self.globaltt['journal article'])

            if reference is not None:
                reference.addRefToGraph()

            # the same score applies to the QTL and the dbSNP association
            score = score_from(p_values)

            # make the association to the QTL
            assoc = G2PAssoc(
                graph, self.name, qtl_id, trait_id,
                self.globaltt['is marker for'])
            assoc.add_evidence(eco_id)
            if pub_id is not None:
                assoc.add_source(pub_id)

            # create a description from the contents of the file
            # desc = ''

            # assoc.addDescription(g, assoc_id, desc)

            # TODO add exp_id as evidence
            # if exp_id != '':
            #     exp_id = 'AQTLExp:'+exp_id
            #     gu.addIndividualToGraph(g, exp_id, None, eco_id)

            if score is not None:
                assoc.set_score(score)  # todo add score type
            # TODO add LOD score?
            assoc.add_association_to_graph()

            # make the association to the dbsnp_id, if found
            if dbsnp_id is not None:
                # make the association to the dbsnp_id
                assoc = G2PAssoc(
                    graph, self.name, dbsnp_id, trait_id,
                    self.globaltt['is marker for'])
                assoc.add_evidence(eco_id)
                if pub_id is not None:
                    assoc.add_source(pub_id)

                # create a description from the contents of the file
                # desc = ''
                # assoc.addDescription(g, assoc_id, desc)

                # TODO add exp_id
                # if exp_id != '':
                #     exp_id = 'AQTLExp:'+exp_id
                #     gu.addIndividualToGraph(g, exp_id, None, eco_id)

                if score is not None:
                    assoc.set_score(score)  # todo add score type
                # TODO add LOD score?
                assoc.add_association_to_graph()

            # off by one - the following actually gives us (limit + 1) records
            if not self.test_mode and limit is not None \
                    and reader.line_num > limit:
                break

    LOG.info("Done with QTL genetic info")
def parse(self, limit=None):
    """
    Process the trait mappings, then for each supported animal load the
    'GeneID' column of its gzipped gene_info file into ``self.gene_info``
    and process whichever genomic (``_bp``) and genetic (``_cm``) QTL
    location files are configured in ``self.files``.

    :param limit: forwarded to the per-file processors; a notice is
        logged when it is not None
    :return: None
    :raises ValueError: when 'GeneID' is not listed in the configured
        columns of a gene_info file
    """
    if limit is not None:
        LOG.info("Only parsing first %s rows fo each file", str(limit))

    if self.test_only:
        self.test_mode = True
        graph = self.testgraph
    else:
        graph = self.graph

    trait_src_key = 'trait_mappings'
    traitmap = '/'.join((self.rawdir, self.files[trait_src_key]['file']))
    LOG.info("Parsing trait mapping file %s", traitmap)
    self._process_trait_mappings(traitmap, trait_src_key, limit)

    geno = Genotype(graph)
    animals = ['chicken', 'pig', 'horse', 'rainbow_trout', 'sheep', 'cattle']

    for common_name in animals:
        txid_num = self.resolve(common_name).split(':')[1]
        taxon_label = self.localtt[common_name]
        taxon_curie = self.globaltt[taxon_label]
        taxon_num = taxon_curie.split(':')[1]
        txid_num = taxon_num  # for now
        taxon_word = taxon_label.strip().replace(' ', '_')
        src_key = taxon_word + '_info'
        gene_info_file = '/'.join((
            self.rawdir, self.files[src_key]['file']))
        self.gene_info = list()
        LOG.info('Ingesting %s', gene_info_file)
        with gzip.open(gene_info_file, 'rt') as gi_gz:
            reader = csv.reader(gi_gz, delimiter='\t')
            # skipping header checking, b/c not all of these gene_info
            # files have headers
            col = self.files[src_key]['columns']
            col_len = len(col)
            # the column position is the same for every row
            gene_id_idx = col.index('GeneID')
            for row in reader:
                # csv yields [] for blank lines; skip those and comments
                # rather than failing on row[0][0]
                if not row or row[0].startswith('#'):
                    # LOG.info(row)
                    continue
                # rows with non-empty surplus columns are reported
                # but still ingested
                if len(row) != col_len and ''.join(row[col_len:]) != '':
                    LOG.warning(
                        "Problem parsing in %s row %i\n"
                        "got %s cols but expected %s",
                        gene_info_file, reader.line_num, len(row), col_len)
                    LOG.info(row)
                self.gene_info.append(row[gene_id_idx])

        LOG.info(
            'Gene Info for %s has %i entries',
            common_name, len(self.gene_info))

        # both are reset per animal: without this a failed build match
        # would hit an unbound build_id (first animal) or silently reuse
        # the previous animal's build
        build = None
        build_id = None

        fname_bp = common_name + '_bp'
        if fname_bp in self.files:
            bpfile = self.files[fname_bp]['file']
            mch = re.search(r'QTL_([\w\.]+)\.gff.txt.gz', bpfile)
            if mch is None:
                LOG.error("Can't match a gff build to %s", fname_bp)
            else:
                build = mch.group(1)
                build_id = self.localtt[build]
                LOG.info("Build UCSC label is: %s", build_id)
                geno.addReferenceGenome(build_id, build, txid_num)

            if build_id is not None:
                self._process_qtls_genomic_location(
                    '/'.join((self.rawdir, bpfile)), fname_bp, txid_num,
                    build_id, build, common_name, limit
                )

        fname_cm = common_name + '_cm'
        if fname_cm in self.files:
            cmfile = self.files[fname_cm]['file']
            self._process_qtls_genetic_location(
                '/'.join((self.rawdir, cmfile)), fname_cm, txid_num,
                common_name, limit)

    LOG.info("Finished parsing")
def _create_genome_builds(self):
    """
    Declare UCSC assembly identifiers equivalent to their NCBI
    counterparts.

    Different resources map variations either to UCSC builds (hg*) or to
    NCBI assemblies.  For every pair in the table below both identifiers
    are added as reference genomes of the taxon and marked as the same
    individual.
    Data taken from:
    https://genome.ucsc.edu/FAQ/FAQreleases.html#release1

    :return: None
    """
    # TODO add more species
    # taxon number -> {UCSC build curie: NCBI assembly curie}
    ucsc_assembly_id_map = {
        "9606": {
            "UCSC:hg38": "NCBIGenome:GRCh38",
            "UCSC:hg19": "NCBIGenome:GRCh37",
            "UCSC:hg18": "NCBIGenome:36.1",
            "UCSC:hg17": "NCBIGenome:35",
            "UCSC:hg16": "NCBIGenome:34",
            "UCSC:hg15": "NCBIGenome:33",
        },
        "7955": {
            "UCSC:danRer10": "NCBIGenome:GRCz10",
            "UCSC:danRer7": "NCBIGenome:Zv9",
            "UCSC:danRer6": "NCBIGenome:Zv8",
        },
        "10090": {
            "UCSC:mm10": "NCBIGenome:GRCm38",
            "UCSC:mm9": "NCBIGenome:37",
        },
        "9031": {"UCSC:galGal4": "NCBIAssembly:317958"},
        "9913": {"UCSC:bosTau7": "NCBIAssembly:GCF_000003205.5"},
        "9823": {"UCSC:susScr3": "NCBIAssembly:304498"},
        "9940": {"UCSC:oviAri3": "NCBIAssembly:GCF_000298735.1"},
        "9796": {"UCSC:equCab2": "NCBIAssembly:GCF_000002305.2"},
    }

    geno = Genotype(self.graph)
    model = Model(self.graph)

    logger.info("Adding equivalent assembly identifiers")
    for tax_num, build_pairs in ucsc_assembly_id_map.items():
        tax_id = 'NCBITaxon:' + tax_num
        for ucsc_id, mapped_id in build_pairs.items():
            # labels are the local part of each curie
            ucsc_label = ucsc_id.split(':')[1]
            mapped_label = 'NCBI build ' + str(mapped_id.split(':')[1])

            geno.addReferenceGenome(ucsc_id, ucsc_label, tax_id)
            geno.addReferenceGenome(mapped_id, mapped_label, tax_id)
            model.addSameIndividual(ucsc_id, mapped_id)
def _get_chrbands(self, limit, src_key, genome_id):
    """
    Read the gzipped chromosome-band file registered under ``src_key`` and
    add chromosomes, scaffolds, bands and their grouping (parent) bands to
    the graph for the UCSC build named in ``self.files[src_key]``.

    :param limit: maximum number of input lines to consider;
        None means no limit
    :param src_key: key into ``self.files``; also used as the taxon number
    :param genome_id: passed through to ``Genotype.addGenome``
    :return: None
    """
    tax_num = src_key
    if limit is None:
        limit = sys.maxsize  # practical limit anyway
    model = Model(self.graph)
    line_num = 0
    myfile = '/'.join((self.rawdir, self.files[src_key]['file']))
    LOG.info("Processing Chr bands from FILE: %s", myfile)
    geno = Genotype(self.graph)
    monochrom = Monochrom(self.graph_type, self.are_bnodes_skized)

    # used to hold band definitions for a chr
    # in order to compute extent of encompasing bands
    mybands = {}

    # build the organism's genome from the taxon
    genome_label = self.files[src_key]['genome_label']
    taxon_curie = 'NCBITaxon:' + tax_num
    species_name = self.globaltcid[taxon_curie]  # for logging

    # add the taxon as a class.  adding the class label elsewhere
    model.addClassToGraph(taxon_curie, None)
    model.addSynonym(taxon_curie, genome_label)
    geno.addGenome(taxon_curie, genome_label, genome_id)

    # add the build and the taxon it's in
    build_num = self.files[src_key]['build_num']
    build_id = 'UCSC:' + build_num
    geno.addReferenceGenome(build_id, build_num, taxon_curie)

    # cat (at least) also has chr[BDAECF]... hex? must be a back cat.
    if tax_num == self.localtt['Felis catus']:
        placed_scaffold_regex = re.compile(
            r'(chr(?:[BDAECF]\d+|X|Y|Z|W|M|))$')
    else:
        placed_scaffold_regex = re.compile(r'(chr(?:\d+|X|Y|Z|W|M))$')

    # NOTE(review): this pattern starts with '_' and is applied with
    # .match(), and it has one group while the code below requires two,
    # so the unlocalized branch looks unreachable -- confirm intent.
    unlocalized_scaffold_regex = re.compile(r'_(\w+)_random')
    unplaced_scaffold_regex = re.compile(r'chr(Un(?:_\w+)?)')

    known_stains = (
        'gneg', 'gpos25', 'gpos33', 'gpos50', 'gpos66', 'gpos75',
        'gpos100', 'acen', 'gvar', 'stalk')

    # process the bands
    col = self.files[src_key]['columns']
    # column positions do not change between rows
    chrom_idx = col.index('chrom')
    start_idx = col.index('chromStart')
    end_idx = col.index('chromEnd')
    name_idx = col.index('name')
    stain_idx = col.index('gieStain')

    with gzip.open(myfile, 'rb') as binreader:
        for line in binreader:
            line_num += 1
            if line_num > limit:
                # nothing past the limit was ever used; stop reading
                break
            line = line.decode().strip()
            # skip comments; blank lines used to raise IndexError on line[0]
            if not line or line[0] == '#':
                continue

            # chr13 4500000 10000000 p12 stalk
            row = line.split('\t')
            scaffold = row[chrom_idx].strip()
            start = row[start_idx]
            stop = row[end_idx]
            band_num = row[name_idx].strip()
            rtype = row[stain_idx]

            # NOTE some less-finished genomes have
            # placed and unplaced scaffolds
            # * Placed scaffolds:
            #       the scaffolds have been placed within a chromosome.
            # * Unlocalized scaffolds:
            #       although the chromosome within which the scaffold occurs
            #       is known, the scaffold's position or orientation
            #       is not known.
            # * Unplaced scaffolds:
            #       it is not known which chromosome the scaffold belongs to
            #
            # find out if the thing is a full on chromosome, or a scaffold:
            # ex: unlocalized scaffold: chr10_KL568008v1_random
            # ex: unplaced scaffold: chrUn_AABR07022428v1
            mch = placed_scaffold_regex.match(scaffold)
            if mch is not None and len(mch.groups()) == 1:
                # the chromosome is the first match of the pattern
                chrom_num = mch.group(1)
            else:
                # skip over anything that isn't a placed_scaffold
                # at the class level
                chrom_num = None

            m_chr_unloc = unlocalized_scaffold_regex.match(scaffold)
            m_chr_unplaced = unplaced_scaffold_regex.match(scaffold)

            scaffold_num = None
            if mch:
                pass
            elif m_chr_unloc is not None and len(m_chr_unloc.groups()) == 2:
                chrom_num = m_chr_unloc.group(1)
                scaffold_num = chrom_num + '_' + m_chr_unloc.group(2)
            elif m_chr_unplaced is not None and \
                    len(m_chr_unplaced.groups()) == 1:
                scaffold_num = m_chr_unplaced.group(1)

            if chrom_num is not None:
                # the chrom class (generic) id
                chrom_class_id = makeChromID(chrom_num, tax_num, 'CHR')

                # first, add the chromosome class (in the taxon)
                geno.addChromosomeClass(
                    chrom_num, taxon_curie,
                    self.files[src_key]['genome_label'])

                # then, add the chromosome instance (from the given build)
                geno.addChromosomeInstance(
                    chrom_num, build_id, build_num, chrom_class_id)

                # add the chr to the hashmap of coordinates for this build
                # the chromosome coordinate space is itself
                if chrom_num not in mybands:
                    mybands[chrom_num] = {
                        'min': 0,
                        'max': int(stop),
                        'chr': chrom_num,
                        'ref': build_id,
                        'parent': None,
                        'stain': None,
                        'type': self.globaltt['chromosome']
                    }
            elif scaffold_num is not None:
                # this will put the coordinates of the scaffold
                # in the scaffold-space and make sure that the scaffold
                # is part of the correct parent.
                # if chrom_num is None,
                # then it will attach it to the genome,
                # just like a reg chrom
                mybands[scaffold_num] = {
                    'min': start,
                    'max': stop,
                    'chr': scaffold_num,
                    'ref': build_id,
                    'parent': chrom_num,
                    'stain': None,
                    'type': self.globaltt['assembly_component'],
                    'synonym': scaffold
                }
            else:
                LOG.info(
                    '%s line %i DROPPED chromosome/scaffold %s',
                    species_name, line_num, scaffold)

            parents = []

            # see it new types have showed up
            if rtype is not None and rtype not in known_stains:
                LOG.info(
                    'Unknown gieStain type "%s" in %s at %i',
                    rtype, src_key, line_num)
                self.globaltt[rtype]  # blow up

            if rtype == 'acen':  # hacky, revisit if ontology improves
                rtype = self.localtt[rtype]

            if band_num is not None and band_num != '' and \
                    rtype is not None and rtype != '':
                # add the specific band
                # NOTE(review): min/max are kept as the raw strings here
                # while grouping bands below use ints -- confirm downstream
                mybands[chrom_num + band_num] = {
                    'min': start,
                    'max': stop,
                    'chr': chrom_num,
                    'ref': build_id,
                    'parent': None,
                    'stain': None,
                    'type': self.globaltt[rtype],
                }

                # add the staining intensity of the band
                # get the parent bands, and make them unique
                parents = list(
                    monochrom.make_parent_bands(band_num, set()))
                # alphabetical sort will put them in smallest to biggest,
                # so we reverse
                parents.sort(reverse=True)

                if len(parents) > 0:
                    mybands[chrom_num + band_num]['parent'] = \
                        chrom_num + parents[0]
                # else:   # band has no parents

            # loop through the parents and add them to the dict
            # add the parents to the graph, in hierarchical order
            for i, parent in enumerate(parents):
                rti = getChrPartTypeByNotation(parent, self.graph)
                # string concatenation: pnum can never be None here
                pnum = chrom_num + parent
                sta = int(start)
                sto = int(stop)
                if pnum not in mybands:
                    # add the parental band to the hash
                    mybands[pnum] = {
                        'min': min(sta, sto),
                        'max': max(sta, sto),
                        'chr': chrom_num,
                        'ref': build_id,
                        'parent': None,
                        'stain': None,
                        'type': rti
                    }
                else:
                    # band already in the hash means it's a grouping band
                    # need to update the min/max coords
                    bnd = mybands[pnum]
                    bnd['min'] = min(sta, sto, bnd['min'])
                    bnd['max'] = max(sta, sto, bnd['max'])

                    # also, set the max for the chrom
                    chrom = mybands[chrom_num]
                    chrom['max'] = max(sta, sto, chrom['max'])

                # add the parent relationships to each
                if i < len(parents) - 1:
                    mybands[pnum]['parent'] = chrom_num + parents[i + 1]
                else:
                    # add the last one (p or q usually)
                    # as attached to the chromosome
                    mybands[pnum]['parent'] = chrom_num
    # end looping through file (the with-block closes the reader)

    # loop through the hash and add the bands to the graph
    for bnd, myband in mybands.items():
        band_class_id = makeChromID(bnd, tax_num, 'CHR')
        band_class_label = makeChromLabel(bnd, genome_label)
        band_build_id = makeChromID(bnd, build_num, 'MONARCH')
        band_build_label = makeChromLabel(bnd, build_num)
        # the build-specific chrom
        chrom_in_build_id = makeChromID(myband['chr'], build_num, 'MONARCH')

        # if it's != part, then add the class
        if myband['type'] != self.globaltt['assembly_component']:
            model.addClassToGraph(
                band_class_id, band_class_label, myband['type'])
            bfeature = Feature(
                self.graph, band_build_id, band_build_label, band_class_id)
        else:
            bfeature = Feature(
                self.graph, band_build_id, band_build_label, myband['type'])
            if 'synonym' in myband:
                model.addSynonym(band_build_id, myband['synonym'])

        if myband['parent'] is None:
            if myband['type'] == self.globaltt['assembly_component']:
                # since we likely don't know the chr,
                # add it as a part of the build
                geno.addParts(band_build_id, build_id)
        elif myband['type'] == self.globaltt['assembly_component']:
            # geno.addParts(band_build_id, chrom_in_build_id)
            parent_chrom_in_build = makeChromID(
                myband['parent'], build_num, 'MONARCH')
            bfeature.addSubsequenceOfFeature(parent_chrom_in_build)

        # add the band as a feature
        # (which also instantiates the owl:Individual)
        bfeature.addFeatureStartLocation(myband['min'], chrom_in_build_id)
        bfeature.addFeatureEndLocation(myband['max'], chrom_in_build_id)
        if 'stain' in myband and myband['stain'] is not None:
            bfeature.addFeatureProperty(
                self.globaltt['has_sequence_attribute'], myband['stain'])

        # type the band as a faldo:Region directly (add_region=False)
        # bfeature.setNoBNodes(self.nobnodes)
        # to come when we merge in ZFIN.py
        bfeature.addFeatureToGraph(False)
def parse(self, limit=None):
    """
    Parse the trait mapping file and then, for each supported animal, its
    gene-info file and its genomic (bp) and genetic (cM) QTL files.

    :param limit: optional row limit, passed through to the per-file
        processing methods
    :return: None
    """
    if limit is not None:
        LOG.info("Only parsing first %s rows fo each file", str(limit))
    LOG.info("Parsing files...")

    if self.testOnly:
        self.testMode = True
        graph = self.testgraph
    else:
        graph = self.graph

    traitmap = '/'.join((self.rawdir, self.files['trait_mappings']['file']))
    self._process_trait_mappings(traitmap, limit)

    geno = Genotype(graph)
    animals = ['chicken', 'pig', 'horse', 'rainbow_trout', 'sheep', 'cattle']

    for common_name in animals:
        # kept from the original: resolve() is still called even though its
        # result is overwritten just below ("for now")
        txid_num = self.resolve(common_name).split(':')[1]
        taxon_label = self.localtt[common_name]
        taxon_curie = self.globaltt[taxon_label]
        taxon_num = taxon_curie.split(':')[1]
        txid_num = taxon_num  # for now
        taxon_word = taxon_label.replace(' ', '_')
        gene_info_file = '/'.join(
            (self.rawdir, self.files[taxon_word + '_info']['file']))
        self.gene_info = []
        with gzip.open(gene_info_file, 'rt') as gi_gz:
            filereader = csv.reader(gi_gz, delimiter='\t')
            for row in filereader:
                # blank rows used to raise IndexError on row[0][0]
                if not row or row[0].startswith('#'):
                    continue
                # only the second column is kept; tossing lots of good stuff
                self.gene_info.append(str(row[1]))
        LOG.info(
            'Gene Info for %s has %i enteries',
            common_name, len(self.gene_info))

        # reset per animal: without this an unmatched file name either raised
        # UnboundLocalError or silently reused the previous animal's build
        build = None
        build_id = None

        fname_bp = common_name + '_bp'
        if fname_bp in self.files:
            bpfile = self.files[fname_bp]['file']
            mch = re.search(r'QTL_([\w\.]+)\.gff.txt.gz', bpfile)
            if mch is None:
                LOG.error("Can't match a gff build to %s", fname_bp)
            else:
                build = mch.group(1)
                build_id = self._map_build_by_abbrev(build)
                LOG.info("Build = %s", build_id)
                geno.addReferenceGenome(build_id, build, txid_num)

            if build_id is not None:
                self._process_qtls_genomic_location(
                    '/'.join((self.rawdir, bpfile)), txid_num, build_id,
                    build, common_name, limit)

        fname_cm = common_name + '_cm'
        if fname_cm in self.files:
            cmfile = self.files[fname_cm]['file']
            self._process_qtls_genetic_location(
                '/'.join((self.rawdir, cmfile)), txid_num, common_name, limit)

    LOG.info("Finished parsing")
    return
def _get_chrbands(self, limit, taxon):
    """
    Read the gzipped chromosome-band file registered under ``taxon`` and
    add chromosomes, scaffolds, bands and their grouping (parent) bands to
    the graph for the UCSC build named in ``self.files[taxon]``.

    :param limit: currently unused
        (TODO PYLINT figure out what limit was for and why it is unused)
    :param taxon: taxon number; also the key into ``self.files``
    :return: None
    """
    model = Model(self.graph)
    myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
    logger.info("Processing Chr bands from FILE: %s", myfile)
    geno = Genotype(self.graph)
    monochrom = Monochrom(self.graph_type, self.are_bnodes_skized)

    # used to hold band definitions for a chr
    # in order to compute extent of encompasing bands
    mybands = {}

    # build the organism's genome from the taxon
    genome_label = self.files[taxon]['genome_label']
    taxon_id = 'NCBITaxon:' + taxon

    # add the taxon as a class.  adding the class label elsewhere
    model.addClassToGraph(taxon_id, None)
    model.addSynonym(taxon_id, genome_label)
    geno.addGenome(taxon_id, genome_label)

    # add the build and the taxon it's in
    build_num = self.files[taxon]['build_num']
    build_id = 'UCSC:' + build_num
    geno.addReferenceGenome(build_id, build_num, taxon_id)

    # the patterns do not depend on the row, so compile them once
    placed_scaffold_pattern = r'(chr(?:\d+|X|Y|Z|W|M))'
    placed_scaffold_regex = re.compile(placed_scaffold_pattern + r'$')
    unlocalized_scaffold_regex = re.compile(
        placed_scaffold_pattern + r'_(\w+)_random')
    unplaced_scaffold_regex = re.compile(r'chr(Un(?:_\w+)?)')
    stain_regex = re.compile(r'g(neg|pos|var)')

    # process the bands
    with gzip.open(myfile, 'rb') as bandfile:
        for line in bandfile:
            line = line.decode().strip()
            # skip comments; blank lines used to break the unpacking below
            if not line or line.startswith('#'):
                continue

            # chr13 4500000 10000000 p12 stalk
            (scaffold, start, stop, band_num, rtype) = line.split('\t')

            # NOTE some less-finished genomes have
            # placed and unplaced scaffolds
            # * Placed scaffolds:
            #       the scaffolds have been placed within a chromosome.
            # * Unlocalized scaffolds:
            #       although the chromosome within which the scaffold occurs
            #       is known, the scaffold's position or orientation
            #       is not known.
            # * Unplaced scaffolds:
            #       it is not known which chromosome the scaffold belongs to
            #
            # find out if the thing is a full on chromosome, or a scaffold:
            # ex: unlocalized scaffold: chr10_KL568008v1_random
            # ex: unplaced scaffold: chrUn_AABR07022428v1
            mch = placed_scaffold_regex.match(scaffold)
            if mch is not None and len(mch.groups()) == 1:
                # the chromosome is the first match of the pattern
                chrom_num = mch.group(1)
            else:
                # skip over anything that isn't a placed_scaffold
                # at the class level
                logger.info("Found non-placed chromosome %s", scaffold)
                chrom_num = None

            m_chr_unloc = unlocalized_scaffold_regex.match(scaffold)
            m_chr_unplaced = unplaced_scaffold_regex.match(scaffold)

            scaffold_num = None
            if mch:
                pass
            elif m_chr_unloc is not None and len(m_chr_unloc.groups()) == 2:
                chrom_num = m_chr_unloc.group(1)
                scaffold_num = chrom_num + '_' + m_chr_unloc.group(2)
            elif m_chr_unplaced is not None and \
                    len(m_chr_unplaced.groups()) == 1:
                scaffold_num = m_chr_unplaced.group(1)
            else:
                logger.error(
                    "There's a chr pattern that we aren't matching: %s",
                    scaffold)

            if chrom_num is not None:
                # the chrom class (generic) id
                chrom_class_id = makeChromID(chrom_num, taxon, 'CHR')

                # first, add the chromosome class (in the taxon)
                geno.addChromosomeClass(
                    chrom_num, taxon_id, self.files[taxon]['genome_label'])

                # then, add the chromosome instance (from the given build)
                geno.addChromosomeInstance(
                    chrom_num, build_id, build_num, chrom_class_id)

                # add the chr to the hashmap of coordinates for this build
                # the chromosome coordinate space is itself
                if chrom_num not in mybands:
                    mybands[chrom_num] = {
                        'min': 0,
                        'max': int(stop),
                        'chr': chrom_num,
                        'ref': build_id,
                        'parent': None,
                        'stain': None,
                        'type': self.globaltt['chromosome']
                    }

            if scaffold_num is not None:
                # this will put the coordinates of the scaffold
                # in the scaffold-space and make sure that the scaffold
                # is part of the correct parent.
                # if chrom_num is None,
                # then it will attach it to the genome,
                # just like a reg chrom
                mybands[scaffold_num] = {
                    'min': start,
                    'max': stop,
                    'chr': scaffold_num,
                    'ref': build_id,
                    'parent': chrom_num,
                    'stain': None,
                    'type': self.globaltt['assembly_component'],
                    'synonym': scaffold
                }

            # always a list; rows without a band simply have no parents
            parents = []
            if band_num is not None and band_num.strip() != '':
                # add the specific band
                mybands[chrom_num + band_num] = {
                    'min': start,
                    'max': stop,
                    'chr': chrom_num,
                    'ref': build_id,
                    'parent': None,
                    'stain': None,
                    'type': None
                }

                # add the staining intensity of the band
                if stain_regex.match(rtype):
                    mybands[chrom_num + band_num]['stain'] = \
                        self.resolve(rtype)

                # get the parent bands, and make them unique
                parents = list(
                    monochrom.make_parent_bands(band_num, set()))
                # alphabetical sort will put them in smallest to biggest,
                # so we reverse
                parents.sort(reverse=True)

                if len(parents) > 0:
                    mybands[chrom_num + band_num]['parent'] = \
                        chrom_num + parents[0]

            # loop through the parents and add them to the hash
            # add the parents to the graph, in hierarchical order
            for i, parent in enumerate(parents):
                rti = getChrPartTypeByNotation(parent)
                pnum = chrom_num + parent
                sta = int(start)
                sto = int(stop)
                if pnum not in mybands:
                    # add the parental band to the hash
                    mybands[pnum] = {
                        'min': min(sta, sto),
                        'max': max(sta, sto),
                        'chr': chrom_num,
                        'ref': build_id,
                        'parent': None,
                        'stain': None,
                        'type': rti
                    }
                else:
                    # band already in the hash means it's a grouping band
                    # need to update the min/max coords
                    bnd = mybands[pnum]
                    bnd['min'] = min(sta, sto, bnd['min'])
                    bnd['max'] = max(sta, sto, bnd['max'])

                    # also, set the max for the chrom
                    chrom = mybands[chrom_num]
                    chrom['max'] = max(sta, sto, chrom['max'])

                # add the parent relationships to each
                if i < len(parents) - 1:
                    mybands[pnum]['parent'] = chrom_num + parents[i + 1]
                else:
                    # add the last one (p or q usually)
                    # as attached to the chromosome
                    mybands[pnum]['parent'] = chrom_num
    # end looping through file (the with-block closes the reader)

    # loop through the hash and add the bands to the graph
    for bnd, myband in mybands.items():
        band_class_id = makeChromID(bnd, taxon, 'CHR')
        band_class_label = makeChromLabel(bnd, genome_label)
        band_build_id = makeChromID(bnd, build_num, 'MONARCH')
        band_build_label = makeChromLabel(bnd, build_num)
        # the build-specific chrom
        chrom_in_build_id = makeChromID(myband['chr'], build_num, 'MONARCH')

        # if it's != part, then add the class
        if myband['type'] != self.globaltt['assembly_component']:
            model.addClassToGraph(
                band_class_id, band_class_label, myband['type'])
            bfeature = Feature(
                self.graph, band_build_id, band_build_label, band_class_id)
        else:
            bfeature = Feature(
                self.graph, band_build_id, band_build_label, myband['type'])
            if 'synonym' in myband:
                model.addSynonym(band_build_id, myband['synonym'])

        if myband['parent'] is None:
            if myband['type'] == self.globaltt['assembly_component']:
                # since we likely don't know the chr,
                # add it as a part of the build
                geno.addParts(band_build_id, build_id)
        elif myband['type'] == self.globaltt['assembly_component']:
            # geno.addParts(band_build_id, chrom_in_build_id)
            parent_chrom_in_build = makeChromID(
                myband['parent'], build_num, 'MONARCH')
            bfeature.addSubsequenceOfFeature(parent_chrom_in_build)

        # add the band as a feature
        # (which also instantiates the owl:Individual)
        bfeature.addFeatureStartLocation(myband['min'], chrom_in_build_id)
        bfeature.addFeatureEndLocation(myband['max'], chrom_in_build_id)
        if 'stain' in myband and myband['stain'] is not None:
            bfeature.addFeatureProperty(
                self.globaltt['has_sequence_attribute'], myband['stain'])

        # type the band as a faldo:Region directly (add_region=False)
        # bfeature.setNoBNodes(self.nobnodes)
        # to come when we merge in ZFIN.py
        bfeature.addFeatureToGraph(False)
    return
def _process_QTLs_genetic_location(self, raw, taxon_id, common_name,
                                   limit=None):
    """
    Process a tab-delimited QTL file with genetic (cM) locations.

    For each row this adds the QTL as a feature placed on a
    "<common_name> genetic map" build, optional dbSNP peak-marker and gene
    links, the trait class, the publication, and QTL->trait (and
    dbSNP->trait) associations.

    :param raw: path of the file to read (opened as iso-8859-1)
    :param taxon_id: taxon identifier attached to each QTL feature
    :param common_name: animal name used to build the genetic-map build id
    :param limit: optional row limit (ignored in test mode)
    :return: None
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    line_counter = 0
    geno = Genotype(g)
    gu = GraphUtils(curie_map.get())
    eco_id = "ECO:0000061"  # Quantitative Trait Analysis Evidence

    logger.info("Processing genetic location for %s", taxon_id)
    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            (qtl_id, qtl_symbol, trait_name, assotype, empty, chromosome,
             position_cm, range_cm, flankmark_a2, flankmark_a1, peak_mark,
             flankmark_b1, flankmark_b2, exp_id, model, test_base,
             sig_level, lod_score, ls_mean, p_values, f_statistics,
             variance, bayes_value, likelihood_ratio, trait_id, dom_effect,
             add_effect, pubmed_id, gene_id, gene_id_src, gene_id_type,
             empty2) = row

            if self.testMode and int(qtl_id) not in self.test_ids:
                continue

            qtl_id = 'AQTL:' + qtl_id
            trait_id = 'AQTLTrait:' + trait_id

            # Add QTL to graph
            f = Feature(qtl_id, qtl_symbol, geno.genoparts['QTL'])
            f.addTaxonToFeature(g, taxon_id)

            # deal with the chromosome
            chrom_id = makeChromID(chromosome, taxon_id, 'CHR')

            # add a version of the chromosome
            # which is defined as the genetic map
            build_id = 'MONARCH:' + common_name.strip() + '-linkage'
            build_label = common_name + ' genetic map'
            geno.addReferenceGenome(build_id, build_label, taxon_id)
            chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
            geno.addChromosomeInstance(
                chromosome, build_id, build_label, chrom_id)

            start = stop = None
            if re.search('-', range_cm):
                range_parts = re.split('-', range_cm)
                # check for poorly formed ranges
                if len(range_parts) == 2 and \
                        range_parts[0] != '' and range_parts[1] != '':
                    (start, stop) = [
                        int(float(x.strip())) for x in range_parts]
                else:
                    logger.info(
                        "There's a cM range we can't handle for QTL %s: %s",
                        qtl_id, range_cm)
            elif position_cm != '':
                # FIXME remove converion to int for start/stop
                # when schema can handle floats
                start = stop = int(float(position_cm))

            # add in the genetic location based on the range
            f.addFeatureStartLocation(
                start, chrom_in_build_id, None,
                [Feature.types['FuzzyPosition']])
            f.addFeatureEndLocation(
                stop, chrom_in_build_id, None,
                [Feature.types['FuzzyPosition']])
            f.addFeatureToGraph(g)

            # sometimes there's a peak marker, like a rsid.
            # we want to add that as a variant of the gene,
            # and xref it to the qtl.
            dbsnp_id = None
            if peak_mark != '' and peak_mark != '.' and \
                    re.match('rs', peak_mark.strip()):
                dbsnp_id = 'dbSNP:' + peak_mark.strip()
                gu.addIndividualToGraph(
                    g, dbsnp_id, None, geno.genoparts['sequence_alteration'])
                gu.addXref(g, qtl_id, dbsnp_id)

            if gene_id is not None and gene_id != '' and gene_id != '.':
                # we assume if no src is provided, it's NCBI
                if gene_id_src == 'NCBIgene' or gene_id_src == '':
                    gene_id = 'NCBIGene:' + gene_id.strip()
                    # we will expect that these labels provided elsewhere
                    geno.addGene(gene_id, None)
                    # FIXME what is the right relationship here?
                    geno.addAlleleOfGene(
                        qtl_id, gene_id,
                        geno.object_properties['feature_to_gene_relation'])

                    if dbsnp_id is not None:
                        # add the rsid as a seq alt of the gene_id
                        vl_id = \
                            '_' + re.sub(':', '', gene_id) + '-' + peak_mark
                        if self.nobnodes:
                            vl_id = ':' + vl_id
                        geno.addSequenceAlterationToVariantLocus(
                            dbsnp_id, vl_id)
                        geno.addAlleleOfGene(vl_id, gene_id)

            # add the trait
            gu.addClassToGraph(g, trait_id, trait_name)

            # Add publication
            # pub_id is reset per row: it used to be unbound (first row) or
            # left over from the previous row when pubmed_id was empty
            r = None
            pub_id = None
            if re.match('ISU.*', pubmed_id):
                pub_id = 'AQTLPub:' + pubmed_id.strip()
                r = Reference(pub_id)
            elif pubmed_id != '':
                pub_id = 'PMID:' + pubmed_id.strip()
                r = Reference(pub_id, Reference.ref_types['journal_article'])

            if r is not None:
                r.addRefToGraph(g)

            # the same score is attached to both associations below
            score = None
            if p_values != '':
                score = float(re.sub('<', '', p_values))

            # make the association to the QTL
            assoc = G2PAssoc(
                self.name, qtl_id, trait_id,
                gu.object_properties['is_marker_for'])
            assoc.add_evidence(eco_id)
            if pub_id is not None:
                assoc.add_source(pub_id)

            # create a description from the contents of the file
            # desc = ''
            # assoc.addDescription(g, assoc_id, desc)

            # TODO add exp_id as evidence
            # if exp_id != '':
            #     exp_id = 'AQTLExp:'+exp_id
            #     gu.addIndividualToGraph(g, exp_id, None, eco_id)

            if score is not None:
                assoc.set_score(score)  # todo add score type
            # TODO add LOD score?
            assoc.add_association_to_graph(g)

            # make the association to the dbsnp_id, if found
            if dbsnp_id is not None:
                assoc = G2PAssoc(
                    self.name, dbsnp_id, trait_id,
                    gu.object_properties['is_marker_for'])
                assoc.add_evidence(eco_id)
                if pub_id is not None:
                    assoc.add_source(pub_id)

                # TODO add exp_id
                # if exp_id != '':
                #     exp_id = 'AQTLExp:'+exp_id
                #     gu.addIndividualToGraph(g, exp_id, None, eco_id)

                if score is not None:
                    assoc.set_score(score)  # todo add score type
                # TODO add LOD score?
                assoc.add_association_to_graph(g)

            if not self.testMode and limit is not None and \
                    line_counter > limit:
                break

    logger.info("Done with QTL genetic info")
    return
def _create_genome_builds(self):
    """
    Various resources will map variations to either UCSC (hg*)
    or to NCBI assemblies. Here we create the equivalences between them.
    Data taken from:
    https://genome.ucsc.edu/FAQ/FAQreleases.html#release1

    Each UCSC build and its mapped assembly are both registered as
    reference genomes of the taxon, then marked as the same individual.

    :return: None
    """
    # TODO add more species
    # taxon number -> {UCSC build curie: NCBI genome/assembly curie}
    ucsc_assembly_id_map = {
        "9606": {
            "UCSC:hg38": "NCBIGenome:GRCh38",
            "UCSC:hg19": "NCBIGenome:GRCh37",
            "UCSC:hg18": "NCBIGenome:36.1",
            "UCSC:hg17": "NCBIGenome:35",
            "UCSC:hg16": "NCBIGenome:34",
            "UCSC:hg15": "NCBIGenome:33",
        },
        "7955": {
            "UCSC:danRer10": "NCBIGenome:GRCz10",
            "UCSC:danRer7": "NCBIGenome:Zv9",
            "UCSC:danRer6": "NCBIGenome:Zv8",
        },
        "10090": {
            "UCSC:mm10": "NCBIGenome:GRCm38",
            "UCSC:mm9": "NCBIGenome:37"
        },
        "9031": {
            "UCSC:galGal4": "NCBIAssembly:317958",
        },
        "9913": {
            "UCSC:bosTau7": "NCBIAssembly:GCF_000003205.5",
        },
        "9823": {
            "UCSC:susScr3": "NCBIAssembly:304498",
        },
        "9940": {
            "UCSC:oviAri3": "NCBIAssembly:GCF_000298735.1",
        },
        "9796": {
            "UCSC:equCab2": "NCBIAssembly:GCF_000002305.2",
        }
    }
    graph = self.graph
    geno = Genotype(graph)
    model = Model(graph)
    logger.info("Adding equivalent assembly identifiers")
    for species, build_pairs in ucsc_assembly_id_map.items():
        taxon_curie = 'NCBITaxon:' + species
        for ucsc_curie, ncbi_curie in build_pairs.items():
            # labels come from the local part of each curie
            ucsc_local = ucsc_curie.split(':')[1]
            ncbi_local = ncbi_curie.split(':')[1]
            ncbi_label = 'NCBI build ' + str(ncbi_local)
            geno.addReferenceGenome(ucsc_curie, ucsc_local, taxon_curie)
            geno.addReferenceGenome(ncbi_curie, ncbi_label, taxon_curie)
            model.addSameIndividual(ucsc_curie, ncbi_curie)
    return
def _get_variants(self, limit):
    """
    Currently loops through the variant_summary file.

    For each row this adds the variant as a sequence-alteration feature on
    its build-specific chromosome (or cytogenetic band), its HGVS synonyms,
    dbSNP/dbVar/RCV/other identifiers, the gene and variant locus, and
    variant->phenotype associations.

    :param limit: optional row limit (ignored in test mode)
    :return: None
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    model = Model(g)
    geno = Genotype(g)
    # NOTE(review): kept from the original, where it was immediately
    # shadowed by the file handle; it is rebound for every row below
    feature = Feature(g, None, None, None)

    # add the taxon and the genome
    tax_num = '9606'  # HARDCODE
    tax_id = 'NCBITaxon:' + tax_num
    tax_label = 'Human'
    model.addClassToGraph(tax_id, None)
    geno.addGenome(tax_id, tax_label)  # label gets added elsewhere

    # the test-disease id set does not change between rows
    test_disease_ids = None
    if self.testMode:
        test_disease_ids = {str(i) for i in self.disease_ids}

    # not unzipping the file
    logger.info("Processing Variant records")
    line_counter = 0
    expected_numcols = 29
    myfile = '/'.join((self.rawdir, self.files['variant_summary']['file']))
    with gzip.open(myfile, 'rb') as infile:
        for line in infile:
            # skip comments
            line = line.decode().strip()
            if line.startswith('#'):
                continue

            # AlleleID               integer value as stored in the AlleleID field in ClinVar (//Measure/@ID in the XML)
            # Type                   character, the type of variation
            # Name                   character, the preferred name for the variation
            # GeneID                 integer, GeneID in NCBI's Gene database
            # GeneSymbol             character, comma-separated list of GeneIDs overlapping the variation
            # ClinicalSignificance   character, comma-separated list of values of clinical significance reported for this variation
            #                          for the mapping between the terms listed here and the integers in the .VCF files, see
            #                          http://www.ncbi.nlm.nih.gov/clinvar/docs/clinsig/
            # RS# (dbSNP)            integer, rs# in dbSNP
            # nsv (dbVar)            character, the NSV identifier for the region in dbVar
            # RCVaccession           character, list of RCV accessions that report this variant
            # TestedInGTR            character, Y/N for Yes/No if there is a test registered as specific to this variation in the NIH Genetic Testing Registry (GTR)
            # PhenotypeIDs           character, list of db names and identifiers for phenotype(s) reported for this variant
            # Origin                 character, list of all allelic origins for this variation
            # Assembly               character, name of the assembly on which locations are based
            # Chromosome             character, chromosomal location
            # Start                  integer, starting location, in pter->qter orientation
            # Stop                   integer, end location, in pter->qter orientation
            # Cytogenetic            character, ISCN band
            # ReviewStatus           character, highest review status for reporting this measure. For the key to the terms,
            #                          and their relationship to the star graphics ClinVar displays on its web pages,
            #                          see http://www.ncbi.nlm.nih.gov/clinvar/docs/variation_report/#interpretation
            # HGVS(c.)               character, RefSeq cDNA-based HGVS expression
            # HGVS(p.)               character, RefSeq protein-based HGVS expression
            # NumberSubmitters       integer, number of submissions with this variant
            # LastEvaluated          datetime, the latest time any submitter reported clinical significance
            # Guidelines             character, ACMG only right now, for the reporting of incidental variation in a Gene
            #                          (NOTE: if ACMG, not a specific to the allele but to the Gene)
            # OtherIDs               character, list of other identifiers or sources of information about this variant
            # VariantID              integer, the value used to build the URL for the current default report,
            #                          e.g. http://www.ncbi.nlm.nih.gov/clinvar/variation/1756/
            #
            # a crude check that there's an expected number of cols.
            # if not, error out because something changed.
            fields = line.split('\t')
            num_cols = len(fields)
            if num_cols != expected_numcols:
                logger.error(
                    "Unexpected number of columns in raw file " +
                    "(%d actual vs %d expected)", num_cols, expected_numcols)

            # a wrong column count raises ValueError here, after logging
            (allele_num, allele_type, allele_name, gene_num, gene_symbol,
             clinical_significance, dbsnp_num, dbvar_num, rcv_nums,
             tested_in_gtr, phenotype_ids, origin, assembly, chrom, start,
             stop, cytogenetic_loc, review_status, hgvs_c, hgvs_p,
             number_of_submitters, last_eval, guidelines, other_ids,
             variant_num, reference_allele, alternate_allele, categories,
             ChromosomeAccession) = fields

            # ###set filter=None in init if you don't want to have a filter
            # if self.filter is not None:
            #    if ((self.filter == 'taxids' and\
            #            (int(tax_num) not in self.tax_ids)) or\
            #            (self.filter == 'geneids' and\
            #             (int(gene_num) not in self.gene_ids))):
            #        continue
            # #### end filter

            line_counter += 1

            pheno_list = []
            if phenotype_ids != '-':
                # trim any leading/trailing semicolons/commas
                phenotype_ids = re.sub(r'^[;,]', '', phenotype_ids)
                phenotype_ids = re.sub(r'[;,]$', '', phenotype_ids)
                pheno_list = re.split(r'[,;]', phenotype_ids)

            if self.testMode:
                # get intersection of test disease ids
                # and these phenotype_ids
                intersect = list(test_disease_ids & set(pheno_list))
                if int(gene_num) not in self.gene_ids and \
                        int(variant_num) not in self.variant_ids and \
                        len(intersect) < 1:
                    continue

            # TODO may need to switch on assembly to create correct
            # assembly/build identifiers
            build_id = ':'.join(('NCBIGenome', assembly))

            # make the reference genome build
            geno.addReferenceGenome(build_id, assembly, tax_id)

            allele_type_id = self._map_type_of_allele(allele_type)
            bandinbuild_id = None
            # reset per row: it used to be unbound (first row) or left over
            # from the previous row when the chromosome column was empty
            chrinbuild_id = None
            if str(chrom) == '':
                # check cytogenic location
                if str(cytogenetic_loc).strip() != '':
                    # use cytogenic location to get the apx location
                    # oddly, they still put an assembly number even when
                    # there's no numeric location
                    if not re.search(r'-', str(cytogenetic_loc)):
                        # NOTE(review): re.split returns a list here, which
                        # is what the original passed on -- confirm that
                        # makeChromID accepts it
                        band_id = makeChromID(
                            re.split(r'-', str(cytogenetic_loc)),
                            tax_num, 'CHR')
                        geno.addChromosomeInstance(
                            cytogenetic_loc, build_id, assembly, band_id)
                        bandinbuild_id = makeChromID(
                            re.split(r'-', str(cytogenetic_loc)),
                            assembly, 'MONARCH')
                    else:
                        # can't deal with ranges yet
                        pass
            else:
                # add the human chromosome class to the graph,
                # and add the build-specific version of it
                chr_id = makeChromID(str(chrom), tax_num, 'CHR')
                geno.addChromosomeClass(str(chrom), tax_id, tax_label)
                geno.addChromosomeInstance(
                    str(chrom), build_id, assembly, chr_id)
                chrinbuild_id = makeChromID(str(chrom), assembly, 'MONARCH')

            seqalt_id = ':'.join(('ClinVarVariant', variant_num))
            gene_id = None

            # they use -1 to indicate unknown gene
            if str(gene_num) != '-1' and str(gene_num) != 'more than 10':
                if re.match(r'^Gene:', gene_num):
                    gene_num = "NCBI" + gene_num
                else:
                    gene_id = ':'.join(('NCBIGene', str(gene_num)))

            # FIXME there are some "variants" that are actually haplotypes
            # probably will get taken care of when we switch to processing
            # the xml for example, variant_num = 38562
            # but there's no way to tell if it's a haplotype
            # in the csv data so the dbsnp or dbvar
            # should probably be primary,
            # and the variant num be the vslc,
            # with each of the dbsnps being added to it

            # TODO clinical significance needs to be mapped to
            # a list of terms
            # first, make the variant:
            feature = Feature(seqalt_id, allele_name, allele_type_id)

            # coordinates need a chromosome reference to attach to
            if chrinbuild_id is not None:
                if start != '-' and start.strip() != '':
                    feature.addFeatureStartLocation(start, chrinbuild_id)
                if stop != '-' and stop.strip() != '':
                    feature.addFeatureEndLocation(stop, chrinbuild_id)

            feature.addFeatureToGraph()
            feature.addTaxonToFeature(tax_id)
            # make the ClinVarVariant the clique leader
            model.makeLeader(seqalt_id)

            if bandinbuild_id is not None:
                feature.addSubsequenceOfFeature(bandinbuild_id)

            # CHECK - this makes the assumption that there is
            # only one affected chromosome per variant what happens with
            # chromosomal rearrangement variants?
            # shouldn't both chromosomes be here?

            # add the hgvs as synonyms
            if hgvs_c != '-' and hgvs_c.strip() != '':
                model.addSynonym(seqalt_id, hgvs_c)
            if hgvs_p != '-' and hgvs_p.strip() != '':
                model.addSynonym(seqalt_id, hgvs_p)

            # add the dbsnp and dbvar ids as equivalent
            if dbsnp_num != '-' and int(dbsnp_num) != -1:
                dbsnp_id = 'dbSNP:rs' + str(dbsnp_num)
                model.addIndividualToGraph(dbsnp_id, None)
                model.addSameIndividual(seqalt_id, dbsnp_id)
            if dbvar_num != '-':
                dbvar_id = 'dbVar:' + dbvar_num
                model.addIndividualToGraph(dbvar_id, None)
                model.addSameIndividual(seqalt_id, dbvar_id)

            # TODO - not sure if this is right... add as xref?
            # the rcv is like the combo of the phenotype with the variant
            if rcv_nums != '-':
                for rcv_num in re.split(r';', rcv_nums):
                    rcv_id = 'ClinVar:' + rcv_num
                    model.addIndividualToGraph(rcv_id, None)
                    model.addXref(seqalt_id, rcv_id)

            if gene_id is not None:
                # add the gene
                model.addClassToGraph(gene_id, gene_symbol)
                # make a variant locus
                vl_id = '_' + gene_num + '-' + variant_num
                if self.nobnodes:
                    vl_id = ':' + vl_id
                vl_label = allele_name
                model.addIndividualToGraph(
                    vl_id, vl_label, geno.genoparts['variant_locus'])
                geno.addSequenceAlterationToVariantLocus(seqalt_id, vl_id)
                geno.addAlleleOfGene(vl_id, gene_id)
            else:
                # some basic reporting
                # the symbol is captured so it can actually be logged; the
                # original pattern had no group, so this branch never ran
                gmatch = re.search(r'\((\w+)\)', allele_name)
                if gmatch is not None:
                    logger.info(
                        "Gene found in allele label, but no id provided: %s",
                        gmatch.group(1))
                elif re.match(r'more than 10', gene_symbol):
                    logger.info(
                        "More than 10 genes found; "
                        "need to process XML to fetch (variant=%d)",
                        int(variant_num))
                else:
                    logger.info(
                        "No gene listed for variant %d", int(variant_num))

            # parse the list of "phenotypes" which are diseases.
            # add them as an association
            # ;GeneReviews:NBK1440,MedGen:C0392514,OMIM:235200,SNOMED CT:35400008;MedGen:C3280096,OMIM:614193;MedGen:CN034317,OMIM:612635;MedGen:CN169374
            # the list is both semicolon delimited and comma delimited,
            # but i don't know why! some are bad, like:
            # Orphanet:ORPHA ORPHA319705,SNOMED CT:49049000
            if phenotype_ids != '-':
                for phenotype in pheno_list:
                    m = re.match(
                        r"(Orphanet:ORPHA(?:\s*ORPHA)?)", phenotype)
                    if m is not None and len(m.groups()) > 0:
                        phenotype = re.sub(
                            m.group(1), 'Orphanet:', phenotype.strip())
                    elif re.match(r'ORPHA:\d+', phenotype):
                        phenotype = re.sub(
                            r'^ORPHA', 'Orphanet', phenotype.strip())
                    elif re.match(r'Human Phenotype Ontology', phenotype):
                        phenotype = re.sub(
                            r'^Human Phenotype Ontology', '',
                            phenotype.strip())
                    elif re.match(r'SNOMED CT:\s?', phenotype):
                        phenotype = re.sub(
                            r'SNOMED CT:\s?', 'SNOMED:', phenotype.strip())
                    elif re.match(r'^Gene:', phenotype):
                        continue

                    assoc = G2PAssoc(
                        g, self.name, seqalt_id, phenotype.strip())
                    assoc.add_association_to_graph()

            if other_ids != '-':
                id_list = other_ids.split(',')
                # process the "other ids" ex:
                # CFTR2:F508del,HGMD:CD890142,OMIM Allelic Variant:602421.0001
                # TODO make more xrefs
                for xrefid in id_list:
                    prefix = xrefid.split(':')[0].strip()
                    if prefix == 'OMIM Allelic Variant':
                        xrefid = 'OMIM:' + xrefid.split(':')[1]
                        model.addIndividualToGraph(xrefid, None)
                        model.addSameIndividual(seqalt_id, xrefid)
                    elif prefix == 'HGMD':
                        model.addIndividualToGraph(xrefid, None)
                        model.addSameIndividual(seqalt_id, xrefid)
                    elif prefix == 'dbVar' \
                            and dbvar_num == xrefid.split(':')[1].strip():
                        pass  # skip over this one
                    elif re.search(r'\s', prefix):
                        pass
                        # logger.debug(
                        #   'xref prefix has a space: %s', xrefid)
                    else:
                        # should be a good clean prefix
                        # note that HGMD variants are in here as Xrefs
                        # because we can't resolve URIs for them
                        # logger.info("Adding xref: %s", xrefid)
                        # gu.addXref(g, seqalt_id, xrefid)
                        # logger.info("xref prefix to add: %s", xrefid)
                        pass

            if not self.testMode and limit is not None \
                    and line_counter > limit:
                break

    logger.info("Finished parsing variants")
    return
def _process_qtls_genetic_location(
        self, raw, txid, common_name, limit=None):
    """
    Process the QTL genetic-location (centimorgan) file for one species.

    For every row this:
      * adds the QTL as a feature with its taxon,
      * places it on a per-species "genetic map" build/chromosome using
        the cM range (or single cM position) when one can be parsed,
      * xrefs a peak marker that looks like an rsid,
      * links a gene when the row names one that resolves to an NCBI gene,
      * adds the trait class and the publication reference,
      * adds a QTL-to-trait association (and an rsid-to-trait association
        when a peak rsid exists) carrying evidence, source and score.

    :param raw: path of the tab-delimited file (read as iso-8859-1)
    :param txid: taxon number, appended to 'NCBITaxon:' to form the curie
    :param common_name: species common name; used to build the QTL and
        genetic-map identifiers and labels
    :param limit: when given and not in test mode, stop once more than
        this many rows have been read
    :return: None
    """
    # function-scope import: the file's import block is not visible here
    import math

    graph = self.testgraph if self.testMode else self.graph
    line_counter = 0
    geno = Genotype(graph)
    model = Model(graph)
    eco_id = self.globaltt['quantitative trait analysis evidence']

    taxon_curie = 'NCBITaxon:' + txid

    LOG.info("Processing genetic location for %s from %s", taxon_curie, raw)
    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            # a row with a different column count raises ValueError here
            (qtl_id,
             qtl_symbol,
             trait_name,
             assotype,
             empty,
             chromosome,
             position_cm,
             range_cm,
             flankmark_a2,
             flankmark_a1,
             peak_mark,
             flankmark_b1,
             flankmark_b2,
             exp_id,
             model_id,
             test_base,
             sig_level,
             lod_score,
             ls_mean,
             p_values,
             f_statistics,
             variance,
             bayes_value,
             likelihood_ratio,
             trait_id,
             dom_effect,
             add_effect,
             pubmed_id,
             gene_id,
             gene_id_src,
             gene_id_type,
             empty2) = row

            if self.testMode and int(qtl_id) not in self.test_ids:
                continue

            qtl_id = common_name + 'QTL:' + qtl_id.strip()
            trait_id = 'AQTLTrait:' + trait_id.strip()

            # Add QTL to graph
            feature = Feature(
                graph, qtl_id, qtl_symbol, self.globaltt['QTL'])
            feature.addTaxonToFeature(taxon_curie)

            # deal with the chromosome
            chrom_id = makeChromID(chromosome, taxon_curie, 'CHR')

            # add a version of the chromosome which is defined as
            # the genetic map
            build_id = 'MONARCH:' + common_name.strip() + '-linkage'
            build_label = common_name + ' genetic map'
            geno.addReferenceGenome(build_id, build_label, taxon_curie)
            chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
            geno.addChromosomeInstance(
                chromosome, build_id, build_label, chrom_id)

            start = stop = None
            # range_cm sometimes ends in "(Mb)" (i.e pig 2016 Nov);
            # keep only the part before the first parenthesis
            range_cm = re.split(r'\(', range_cm)[0]

            if re.search(r'[0-9].*-.*[0-9]', range_cm):
                range_parts = re.split(r'-', range_cm)
                # check for poorly formed ranges
                if len(range_parts) == 2 and \
                        range_parts[0] != '' and range_parts[1] != '':
                    (start, stop) = [
                        int(float(x.strip())) for x in range_parts]
                else:
                    LOG.info(
                        "A cM range we can't handle for QTL %s: %s",
                        qtl_id, range_cm)
            elif position_cm != '':
                match = re.match(r'([0-9]*\.[0-9]*)', position_cm)
                if match is not None:
                    position_cm = match.group()
                    start = stop = int(float(position_cm))

            # FIXME remove conversion to int for start/stop
            # when schema can handle floats add in the genetic location
            # based on the range
            feature.addFeatureStartLocation(
                start, chrom_in_build_id, None,
                [self.globaltt['FuzzyPosition']])
            feature.addFeatureEndLocation(
                stop, chrom_in_build_id, None,
                [self.globaltt['FuzzyPosition']])
            feature.addFeatureToGraph()

            # sometimes there's a peak marker, like a rsid.
            # we want to add that as a variant of the gene,
            # and xref it to the qtl.
            dbsnp_id = None
            if peak_mark != '' and peak_mark != '.' and \
                    re.match(r'rs', peak_mark.strip()):
                dbsnp_id = 'dbSNP:' + peak_mark.strip()

                model.addIndividualToGraph(
                    dbsnp_id, None, self.globaltt['sequence_alteration'])
                model.addXref(qtl_id, dbsnp_id)

            gene_id = gene_id.replace('uncharacterized ', '').strip()
            # only single-token gene ids (no embedded spaces) are usable
            if gene_id != '' and gene_id != '.' and ' ' not in gene_id:
                # we assume if no src is provided and gene_id is an integer,
                # then it is an NCBI gene ...
                # (okay, lets crank that back a notch)
                if gene_id_src == '' and gene_id.isdigit() and \
                        gene_id in self.gene_info:
                    gene_id_src = 'NCBIgene'
                elif gene_id_src == '' and gene_id.isdigit():
                    LOG.warning(
                        'Cold & Prickely saying %s is a NCBI gene for %s',
                        gene_id, common_name)
                    gene_id_src = 'NCBIgene'
                elif gene_id_src == '':
                    LOG.error(
                        ' "%s" is a NOT NCBI gene for %s',
                        gene_id, common_name)
                    gene_id_src = None

                if gene_id_src == 'NCBIgene':
                    gene_id = 'NCBIGene:' + gene_id
                    # we will expect that these will get labels elsewhere
                    geno.addGene(gene_id, None)
                    # FIXME what is the right relationship here?
                    geno.addAffectedLocus(qtl_id, gene_id)

                    if dbsnp_id is not None:
                        # add the rsid as a seq alt of the gene_id
                        vl_id = '_:' + re.sub(
                            r':', '', gene_id) + '-' + peak_mark.strip()
                        geno.addSequenceAlterationToVariantLocus(
                            dbsnp_id, vl_id)
                        geno.addAffectedLocus(vl_id, gene_id)

            # add the trait
            model.addClassToGraph(trait_id, trait_name)

            # Add publication.
            # pub_id is reset for every row so that a row without a
            # publication neither fails nor inherits the previous row's.
            pub_id = None
            reference = None
            if re.match(r'ISU.*', pubmed_id):
                pub_id = 'AQTLPub:' + pubmed_id.strip()
                reference = Reference(graph, pub_id)
            elif pubmed_id != '':
                pub_id = 'PMID:' + pubmed_id.strip()
                reference = Reference(
                    graph, pub_id, self.globaltt['journal article'])

            if reference is not None:
                reference.addRefToGraph()

            # Parse the p-value once; it is shared by both associations.
            # str.isnumeric() rejects any string containing '.', so the
            # value is converted with float() and bad input is skipped.
            score = None
            if p_values != '':
                scr = re.sub(r'<', '', p_values)
                scr = re.sub(r',', '.', scr)  # international notation
                try:
                    score = float(scr)
                except ValueError:
                    score = None
                else:
                    if not math.isfinite(score):
                        score = None

            # make the association to the QTL and, if a peak rsid was
            # found, the same association to the dbsnp_id
            for subject_id in (qtl_id, dbsnp_id):
                if subject_id is None:
                    continue
                assoc = G2PAssoc(
                    graph, self.name, subject_id, trait_id,
                    self.globaltt['is marker for'])
                assoc.add_evidence(eco_id)
                if pub_id is not None:
                    assoc.add_source(pub_id)

                # TODO add exp_id as evidence
                # TODO add a description from the contents of the file
                if score is not None:
                    assoc.set_score(score)
                # TODO add score type
                # TODO add LOD score?
                assoc.add_association_to_graph()

            if not self.testMode and limit is not None \
                    and line_counter > limit:
                break

    LOG.info("Done with QTL genetic info")
    return
def _get_variants(self, limit):
    """
    Loop through the gzipped variant_summary file and add each variant.

    For every data row this adds the reference genome build, the
    chromosome (or cytogenetic band) instance, the variant as a located
    feature, its HGVS synonyms, dbSNP/dbVar equivalents, RCV xrefs, the
    gene and variant locus when a gene id is present, one association
    per listed phenotype id, and same-individual links for selected
    "other ids" (OMIM Allelic Variant, HGMD).

    :param limit: when given and not in test mode, stop once more than
        this many data rows have been read
    :return: None
    """
    g = self.testgraph if self.testMode else self.graph

    model = Model(g)
    geno = Genotype(g)

    # add the taxon and the genome
    tax_num = '9606'  # HARDCODE
    tax_id = 'NCBITaxon:' + tax_num
    tax_label = 'Human'
    model.addClassToGraph(tax_id, None)
    geno.addGenome(tax_id, tax_label)  # label gets added elsewhere

    # test-mode disease ids as strings, built once for the row filter
    test_disease_ids = set()
    if self.testMode:
        test_disease_ids = {str(i) for i in self.disease_ids}

    # not unzipping the file
    logger.info("Processing Variant records")
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files['variant_summary']['file']))
    with gzip.open(myfile, 'rb') as infile:
        for line in infile:
            # skip comments
            line = line.decode().strip()
            if line.startswith('#'):
                continue

            # AlleleID      integer value as stored in the AlleleID field
            #               in ClinVar (//Measure/@ID in the XML)
            # Type          character, the type of variation
            # Name          character, the preferred name for the variation
            # GeneID        integer, GeneID in NCBI's Gene database
            # GeneSymbol    character, comma-separated list of GeneIDs
            #               overlapping the variation
            # ClinicalSignificance  character, comma-separated list of
            #               values of clinical significance reported for
            #               this variation; for the mapping between the
            #               terms listed here and the integers in the
            #               .VCF files, see
            #               http://www.ncbi.nlm.nih.gov/clinvar/docs/clinsig/
            # RS# (dbSNP)   integer, rs# in dbSNP
            # nsv (dbVar)   character, the NSV identifier for the region
            #               in dbVar
            # RCVaccession  character, list of RCV accessions that report
            #               this variant
            # TestedInGTR   character, Y/N for Yes/No if there is a test
            #               registered as specific to this variation in
            #               the NIH Genetic Testing Registry (GTR)
            # PhenotypeIDs  character, list of db names and identifiers
            #               for phenotype(s) reported for this variant
            # Origin        character, list of all allelic origins for
            #               this variation
            # Assembly      character, name of the assembly on which
            #               locations are based
            # Chromosome    character, chromosomal location
            # Start         integer, starting location,
            #               in pter->qter orientation
            # Stop          integer, end location,
            #               in pter->qter orientation
            # Cytogenetic   character, ISCN band
            # ReviewStatus  character, highest review status for
            #               reporting this measure. For the key to the
            #               terms, and their relationship to the star
            #               graphics ClinVar displays on its web pages, see
            #               http://www.ncbi.nlm.nih.gov/clinvar/docs/variation_report/#interpretation
            # HGVS(c.)      character, RefSeq cDNA-based HGVS expression
            # HGVS(p.)      character, RefSeq protein-based HGVS expression
            # NumberSubmitters  integer, number of submissions with
            #               this variant
            # LastEvaluated datetime, the latest time any submitter
            #               reported clinical significance
            # Guidelines    character, ACMG only right now, for the
            #               reporting of incidental variation in a Gene
            #               (NOTE: if ACMG, not a specific to the allele
            #               but to the Gene)
            # OtherIDs      character, list of other identifiers or
            #               sources of information about this variant
            # VariantID     integer, the value used to build the URL for
            #               the current default report, e.g.
            #               http://www.ncbi.nlm.nih.gov/clinvar/variation/1756/
            #
            # a crude check that there's an expected number of cols.
            # if not, error out because something changed:
            # the mismatch is logged, then the unpacking below raises
            # ValueError.
            cols = line.split('\t')
            num_cols = len(cols)
            expected_numcols = 29
            if num_cols != expected_numcols:
                logger.error(
                    "Unexpected number of columns in raw file "
                    "(%d actual vs %d expected)",
                    num_cols, expected_numcols)

            # 'chrom' rather than 'chr' so the builtin is not shadowed
            (allele_num, allele_type, allele_name, gene_num, gene_symbol,
             clinical_significance, dbsnp_num, dbvar_num, rcv_nums,
             tested_in_gtr, phenotype_ids, origin, assembly, chrom, start,
             stop, cytogenetic_loc, review_status, hgvs_c, hgvs_p,
             number_of_submitters, last_eval, guidelines, other_ids,
             variant_num, reference_allele, alternate_allele, categories,
             chromosome_accession) = cols

            line_counter += 1

            pheno_list = []
            if phenotype_ids != '-':
                # trim any leading/trailing semicolons/commas
                phenotype_ids = re.sub(r'^[;,]', '', phenotype_ids)
                phenotype_ids = re.sub(r'[;,]$', '', phenotype_ids)
                pheno_list = re.split(r'[,;]', phenotype_ids)

            if self.testMode:
                # keep the row only if its gene, its variant, or one of
                # its phenotype ids is in the test sets
                intersect = test_disease_ids & set(pheno_list)
                if int(gene_num) not in self.gene_ids and \
                        int(variant_num) not in self.variant_ids and \
                        len(intersect) < 1:
                    continue

            # TODO may need to switch on assembly to create correct
            # assembly/build identifiers
            build_id = ':'.join(('NCBIGenome', assembly))

            # make the reference genome build
            geno.addReferenceGenome(build_id, assembly, tax_id)

            allele_type_id = self._map_type_of_allele(allele_type)

            # both are reset per row so a row without a chromosome never
            # reuses the previous row's ids
            bandinbuild_id = None
            chrinbuild_id = None
            if str(chrom) == '':
                # check cytogenic location
                band = str(cytogenetic_loc)
                if band.strip() != '':
                    # use cytogenic location to get the apx location
                    # oddly, they still put an assembly number even when
                    # there's no numeric location
                    if not re.search(r'-', band):
                        # no '-' present, so the band string is passed
                        # as-is, like the other makeChromID calls here,
                        # instead of the one-element list re.split
                        # would return
                        band_id = makeChromID(band, tax_num, 'CHR')
                        geno.addChromosomeInstance(
                            cytogenetic_loc, build_id, assembly, band_id)
                        bandinbuild_id = makeChromID(
                            band, assembly, 'MONARCH')
                    # else: can't deal with ranges yet
            else:
                # add the human chromosome class to the graph,
                # and add the build-specific version of it
                chr_id = makeChromID(str(chrom), tax_num, 'CHR')
                geno.addChromosomeClass(str(chrom), tax_id, tax_label)
                geno.addChromosomeInstance(
                    str(chrom), build_id, assembly, chr_id)
                chrinbuild_id = makeChromID(str(chrom), assembly, 'MONARCH')

            seqalt_id = ':'.join(('ClinVarVariant', variant_num))
            gene_id = None
            # they use -1 to indicate unknown gene
            if str(gene_num) != '-1' and str(gene_num) != 'more than 10':
                if re.match(r'^Gene:', gene_num):
                    # NOTE(review): this prefixes gene_num but leaves
                    # gene_id as None, so such rows get no gene added;
                    # confirm whether gene_id should be set here.
                    gene_num = "NCBI" + gene_num
                else:
                    gene_id = ':'.join(('NCBIGene', str(gene_num)))

            # FIXME there are some "variants" that are actually haplotypes
            # probably will get taken care of when we switch to processing
            # the xml for example, variant_num = 38562
            # but there's no way to tell if it's a haplotype
            # in the csv data so the dbsnp or dbvar
            # should probably be primary,
            # and the variant num be the vslc,
            # with each of the dbsnps being added to it

            # TODO clinical significance needs to be mapped to
            # a list of terms
            # first, make the variant (the graph is the first argument)
            feature = Feature(g, seqalt_id, allele_name, allele_type_id)

            # a location can only be anchored to a known chromosome
            if chrinbuild_id is not None:
                if start != '-' and start.strip() != '':
                    feature.addFeatureStartLocation(start, chrinbuild_id)
                if stop != '-' and stop.strip() != '':
                    feature.addFeatureEndLocation(stop, chrinbuild_id)

            feature.addFeatureToGraph()
            feature.addTaxonToFeature(tax_id)
            # make the ClinVarVariant the clique leader
            model.makeLeader(seqalt_id)

            if bandinbuild_id is not None:
                feature.addSubsequenceOfFeature(bandinbuild_id)

            # CHECK - this makes the assumption that there is
            # only one affected chromosome per variant what happens with
            # chromosomal rearrangement variants?
            # shouldn't both chromosomes be here?

            # add the hgvs as synonyms
            if hgvs_c != '-' and hgvs_c.strip() != '':
                model.addSynonym(seqalt_id, hgvs_c)
            if hgvs_p != '-' and hgvs_p.strip() != '':
                model.addSynonym(seqalt_id, hgvs_p)

            # add the dbsnp and dbvar ids as equivalent
            if dbsnp_num != '-' and int(dbsnp_num) != -1:
                dbsnp_id = 'dbSNP:rs' + str(dbsnp_num)
                model.addIndividualToGraph(dbsnp_id, None)
                model.addSameIndividual(seqalt_id, dbsnp_id)
            if dbvar_num != '-':
                dbvar_id = 'dbVar:' + dbvar_num
                model.addIndividualToGraph(dbvar_id, None)
                model.addSameIndividual(seqalt_id, dbvar_id)

            # TODO - not sure if this is right... add as xref?
            # the rcv is like the combo of the phenotype with the variant
            if rcv_nums != '-':
                for rcv_num in re.split(r';', rcv_nums):
                    rcv_id = 'ClinVar:' + rcv_num
                    model.addIndividualToGraph(rcv_id, None)
                    model.addXref(seqalt_id, rcv_id)

            if gene_id is not None:
                # add the gene
                model.addClassToGraph(gene_id, gene_symbol)
                # make a variant locus
                vl_id = '_' + gene_num + '-' + variant_num
                if self.nobnodes:
                    vl_id = ':' + vl_id
                vl_label = allele_name
                model.addIndividualToGraph(
                    vl_id, vl_label, geno.genoparts['variant_locus'])
                geno.addSequenceAlterationToVariantLocus(seqalt_id, vl_id)
                geno.addAlleleOfGene(vl_id, gene_id)
            else:
                # some basic reporting
                # the symbol inside the parentheses is captured so that
                # group(1) exists
                gmatch = re.search(r'\((\w+)\)', allele_name)
                if gmatch is not None:
                    logger.info(
                        "Gene found in allele label, but no id provided: %s",
                        gmatch.group(1))
                elif re.match(r'more than 10', gene_symbol):
                    logger.info(
                        "More than 10 genes found; "
                        "need to process XML to fetch (variant=%d)",
                        int(variant_num))
                else:
                    logger.info(
                        "No gene listed for variant %d", int(variant_num))

            # parse the list of "phenotypes" which are diseases.
            # add them as an association
            # ;GeneReviews:NBK1440,MedGen:C0392514,OMIM:235200,SNOMED CT:35400008;MedGen:C3280096,OMIM:614193;MedGen:CN034317,OMIM:612635;MedGen:CN169374
            # the list is both semicolon delimited and comma delimited,
            # but i don't know why! some are bad, like:
            # Orphanet:ORPHA ORPHA319705,SNOMED CT:49049000
            if phenotype_ids != '-':
                for phenotype in pheno_list:
                    m = re.match(
                        r"(Orphanet:ORPHA(?:\s*ORPHA)?)", phenotype)
                    if m is not None and len(m.groups()) > 0:
                        phenotype = re.sub(
                            m.group(1), 'Orphanet:', phenotype.strip())
                    elif re.match(r'ORPHA:\d+', phenotype):
                        phenotype = re.sub(
                            r'^ORPHA', 'Orphanet', phenotype.strip())
                    elif re.match(r'Human Phenotype Ontology', phenotype):
                        phenotype = re.sub(
                            r'^Human Phenotype Ontology', '',
                            phenotype.strip())
                    elif re.match(r'SNOMED CT:\s?', phenotype):
                        phenotype = re.sub(
                            r'SNOMED CT:\s?', 'SNOMED:', phenotype.strip())
                    elif re.match(r'^Gene:', phenotype):
                        # gene ids are not phenotypes; skip them
                        continue

                    assoc = G2PAssoc(
                        g, self.name, seqalt_id, phenotype.strip())
                    assoc.add_association_to_graph()

            if other_ids != '-':
                id_list = other_ids.split(',')
                # process the "other ids" ex:
                # CFTR2:F508del,HGMD:CD890142,OMIM Allelic Variant:602421.0001
                # TODO make more xrefs
                for xrefid in id_list:
                    prefix = xrefid.split(':')[0].strip()
                    if prefix == 'OMIM Allelic Variant':
                        xrefid = 'OMIM:' + xrefid.split(':')[1]
                        model.addIndividualToGraph(xrefid, None)
                        model.addSameIndividual(seqalt_id, xrefid)
                    elif prefix == 'HGMD':
                        model.addIndividualToGraph(xrefid, None)
                        model.addSameIndividual(seqalt_id, xrefid)
                    # Every other id is currently ignored: a dbVar id
                    # equal to dbvar_num (already added above), prefixes
                    # containing whitespace, and remaining clean prefixes
                    # (xrefs for those are not generated yet).

            if not self.testMode and limit is not None \
                    and line_counter > limit:
                break

    logger.info("Finished parsing variants")
    return
def _add_variant_cdna_variant_assoc_to_graph(self, row):
    """
    Generates relationships between variants and cDNA variants
    given a row of data.

    Adds the variant's gene relationship, the genome build and
    chromosome it sits on, its genomic and transcript coordinates, its
    reference/altered nucleotides as literals, and COSMIC / dbSNP
    individuals for the listed ids. The internal variant id is replaced
    by the first COSMIC curie when one is present, otherwise by the
    dbSNP curie.

    :param row: iterable of exactly 27 fields (unpacked below); see
        add_variant_info_to_graph() docstring for expected structure.
        Only applicable for structure 2.
    :return: None
    """
    graph_utils = GraphUtils(curie_map.get())
    genotype = Genotype(self.graph)

    (variant_key, variant_label, amino_acid_variant, amino_acid_position,
     transcript_id, transcript_priority, protein_variant_type,
     functional_impact, stop_gain_loss, transcript_gene,
     protein_variant_source, variant_gene, bp_pos, variant_cdna,
     cosmic_id, db_snp_id, genome_pos_start, genome_pos_end, ref_base,
     variant_base, primary_transcript_exons,
     primary_transcript_variant_sub_types, variant_type, chromosome,
     genome_build, build_version, build_date) = row

    variant_id = self.make_cgd_id('variant{0}'.format(variant_key))

    # Gene the variant belongs to
    self._add_variant_gene_relationship(variant_id, variant_gene)

    # Transcript used as the reference for the nucleotide position
    transcript_curie = self._make_transcript_curie(transcript_id)

    # Region identifiers: one on the transcript, one on the chromosome
    cdna_region_id = ":_{0}Region".format(transcript_curie)
    chrom_region_id = ":_{0}{1}Region-{2}-{3}".format(
        genome_build, chromosome, genome_pos_start, genome_pos_end)

    # Genome and reference build
    taxon_id = 'NCBITaxon:9606'
    build_id = "UCSC:{0}".format(genome_build)
    genotype.addGenome(taxon_id, "Human")
    genotype.addReferenceGenome(build_id, genome_build, taxon_id)

    # Chromosome: the generic (taxon-level) class id first,
    # then the build-specific instance id
    chrom_class_id = makeChromID(chromosome, '9606', 'CHR')
    chrom_instance_id = makeChromID(chromosome, build_id, 'MONARCH')
    genotype.addChromosomeClass(chromosome, taxon_id, 'Human')
    genotype.addChromosomeInstance(
        chromosome, build_id, genome_build, chrom_class_id)

    # Variant coordinates relative to the chromosome
    self._add_feature_with_coords(
        variant_id, genome_pos_start, genome_pos_end,
        chrom_instance_id, chrom_region_id)

    # Mutation coordinates relative to the transcript
    self._add_feature_with_coords(
        variant_id, bp_pos, bp_pos, transcript_curie, cdna_region_id)

    # Nucleotide change, stored as literal values
    graph_utils.addTriple(
        self.graph, variant_id,
        genotype.properties['reference_nucleotide'], ref_base, True)
    graph_utils.addTriple(
        self.graph, variant_id,
        genotype.properties['altered_nucleotide'], variant_base, True)

    # Here we update any internal cgd variant IDS with a cosmic ID or
    # dbSNP ID. Alternatively we could do this using sql rather than a
    # sparql update which may be safer

    # SNP xrefs: COSMIC ids come as a ', '-separated list
    cosmic_curies = None
    if cosmic_id is not None:
        cosmic_ids = cosmic_id.split(', ')
        cosmic_curies = [
            re.sub(r'COSM(\d+)', r'COSMIC:\1', raw_id)
            for raw_id in cosmic_ids]
        for curie, raw_id in zip(cosmic_curies, cosmic_ids):
            graph_utils.addIndividualToGraph(
                self.graph, curie, raw_id,
                genotype.genoparts['missense_variant'])

        # If there are multiple ids set them equivalent to the first
        first_curie = cosmic_curies[0]
        for other_curie in cosmic_curies[1:]:
            graph_utils.addSameIndividual(
                self.graph, first_curie, other_curie)

        self._replace_entity(
            self.graph, variant_id, first_curie, self.bindings)

    if db_snp_id is None:
        return

    db_snp_curie = re.sub(r'rs(\d+)', r'dbSNP:\1', db_snp_id)
    graph_utils.addIndividualToGraph(
        self.graph, db_snp_curie, db_snp_id,
        genotype.genoparts['missense_variant'])

    if cosmic_curies is None:
        # no COSMIC id: the dbSNP curie replaces the internal id
        self._replace_entity(
            self.graph, variant_id, db_snp_curie, self.bindings)
    else:
        # otherwise every COSMIC curie is declared the same individual
        # as the dbSNP curie
        for curie in cosmic_curies:
            graph_utils.addSameIndividual(
                self.graph, curie, db_snp_curie)

    return