def _get_chrbands(self, limit, taxon):
    """
    Read the gzipped UCSC chromosome-band file configured for ``taxon``
    and add its genome, reference build, chromosomes, scaffolds and
    bands to the graph.

    The file is processed in two passes.  The first pass reads every
    tab-separated row (scaffold, start, stop, band name, stain) and
    collects one entry per chromosome, scaffold, band and parent band in
    a local dict, widening the coordinates of parent bands as their
    children are seen.  The second pass writes every collected entry to
    the graph as a feature.

    :param limit: accepted for interface compatibility; not used here
    :param taxon: key into ``self.files``; it is also appended to
        'NCBITaxon:' to build the taxon identifier
    :return: None

    """
    model = Model(self.graph)
    # counts data rows only (comments and blank lines are not counted);
    # used for log messages
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
    logger.info("Processing Chr bands from FILE: %s", myfile)
    geno = Genotype(self.graph)
    monochrom = Monochrom(self.graph_type, self.are_bnodes_skized)

    # used to hold band definitions for a chr
    # in order to compute extent of encompassing bands
    mybands = {}

    # build the organism's genome from the taxon
    genome_label = self.files[taxon]['genome_label']
    taxon_id = 'NCBITaxon:' + taxon

    # add the taxon as a class.  adding the class label elsewhere
    model.addClassToGraph(taxon_id, None)
    model.addSynonym(taxon_id, genome_label)

    geno.addGenome(taxon_id, genome_label)

    # add the build and the taxon it's in
    build_num = self.files[taxon]['build_num']
    build_id = 'UCSC:' + build_num
    geno.addReferenceGenome(build_id, build_num, taxon_id)

    # NOTE some less-finished genomes have placed and unplaced scaffolds
    # * Placed scaffolds:
    #     the scaffolds have been placed within a chromosome.
    # * Unlocalized scaffolds:
    #     although the chromosome within which the scaffold occurs
    #     is known, the scaffold's position or orientation is not known.
    # * Unplaced scaffolds:
    #     it is not known which chromosome the scaffold belongs to
    #
    # ex: unlocalized scaffold: chr10_KL568008v1_random
    # ex: unplaced scaffold: chrUn_AABR07022428v1
    #
    # The patterns do not depend on the row, so compile them once here
    # instead of rebuilding them for every line of the file.
    placed_scaffold_pattern = r'(chr(?:\d+|X|Y|Z|W|M))'
    placed_scaffold_regex = re.compile(placed_scaffold_pattern + r'$')
    unlocalized_scaffold_regex = re.compile(
        placed_scaffold_pattern + r'_(\w+)_random')
    unplaced_scaffold_regex = re.compile(r'chr(Un(?:_\w+)?)')
    stain_regex = re.compile(r'g(neg|pos|var)')

    # process the bands
    # (the with-statement closes the file; no explicit close() is needed)
    with gzip.open(myfile, 'rb') as reader:
        for raw_line in reader:
            line = raw_line.decode().strip()
            # skip comments; also skip blank lines, which would
            # otherwise fail the five-way unpack below
            if not line or line.startswith('#'):
                continue

            # chr13	4500000	10000000	p12	stalk
            (scaffold, start, stop, band_num, rtype) = line.split('\t')
            line_counter += 1

            # find out if the thing is a full on chromosome, or a scaffold
            chrom_num = None
            scaffold_num = None
            mch = placed_scaffold_regex.match(scaffold)
            if mch is not None:
                # the chromosome is the first match of the pattern
                chrom_num = mch.group(1)
            else:
                # anything that isn't a placed scaffold is not treated
                # as a plain chromosome
                logger.info("Found non-placed chromosome %s", scaffold)
                m_chr_unloc = unlocalized_scaffold_regex.match(scaffold)
                m_chr_unplaced = unplaced_scaffold_regex.match(scaffold)
                if m_chr_unloc is not None:
                    # chromosome known, position within it unknown
                    chrom_num = m_chr_unloc.group(1)
                    scaffold_num = chrom_num + '_' + m_chr_unloc.group(2)
                elif m_chr_unplaced is not None:
                    # chromosome unknown
                    scaffold_num = m_chr_unplaced.group(1)
                else:
                    logger.error(
                        "There's a chr pattern that we aren't matching: %s",
                        scaffold)

            if chrom_num is not None:
                # the chrom class (generic) id
                chrom_class_id = makeChromID(chrom_num, taxon, 'CHR')

                # first, add the chromosome class (in the taxon)
                geno.addChromosomeClass(chrom_num, taxon_id, genome_label)

                # then, add the chromosome instance (from the given build)
                geno.addChromosomeInstance(
                    chrom_num, build_id, build_num, chrom_class_id)

                # add the chr to the hashmap of coordinates for this build
                # the chromosome coordinate space is itself
                if chrom_num not in mybands:
                    mybands[chrom_num] = {
                        'min': 0,
                        'max': int(stop),
                        'chr': chrom_num,
                        'ref': build_id,
                        'parent': None,
                        'stain': None,
                        'type': Feature.types['chromosome'],
                    }

            if scaffold_num is not None:
                # this will put the coordinates of the scaffold
                # in the scaffold-space and make sure that the scaffold
                # is part of the correct parent.
                # if chrom_num is None, then it will attach it
                # to the genome, just like a reg chrom
                # NOTE(review): min/max are kept as the raw strings from
                # the file here, while parent bands below store ints
                mybands[scaffold_num] = {
                    'min': start,
                    'max': stop,
                    'chr': scaffold_num,
                    'ref': build_id,
                    'parent': chrom_num,
                    'stain': None,
                    'type': Feature.types['assembly_component'],
                    'synonym': scaffold,
                }

            # parent bands of the current band, most specific first;
            # stays empty when the row defines no usable band
            parents = []
            if band_num.strip() != '':
                if chrom_num is None:
                    # band ids are built as chrom_num + band_num, which
                    # cannot be done without a chromosome
                    logger.warning(
                        "Skipping band %s on %s (data row %d): "
                        "no chromosome could be determined",
                        band_num, scaffold, line_counter)
                else:
                    band_key = chrom_num + band_num
                    # add the specific band
                    mybands[band_key] = {
                        'min': start,
                        'max': stop,
                        'chr': chrom_num,
                        'ref': build_id,
                        'parent': None,
                        'stain': None,
                        'type': None,
                    }

                    # add the staining intensity of the band
                    if stain_regex.match(rtype):
                        mybands[band_key]['stain'] = \
                            Feature.types.get(rtype)

                    # get the parent bands, and make them unique.
                    # alphabetical sort will put them in smallest to
                    # biggest, so we reverse
                    parents = sorted(
                        monochrom.make_parent_bands(band_num, set()),
                        reverse=True)

                    if parents:
                        mybands[band_key]['parent'] = \
                            chrom_num + parents[0]

            # loop through the parents and add them to the hash,
            # in hierarchical order
            for i, parent in enumerate(parents):
                rti = getChrPartTypeByNotation(parent)

                pnum = chrom_num + parent
                sta = int(start)
                sto = int(stop)
                if pnum not in mybands:
                    # add the parental band to the hash
                    mybands[pnum] = {
                        'min': min(sta, sto),
                        'max': max(sta, sto),
                        'chr': chrom_num,
                        'ref': build_id,
                        'parent': None,
                        'stain': None,
                        'type': rti,
                    }
                else:
                    # band already in the hash means it's a grouping band
                    # need to update the min/max coords
                    grouping = mybands[pnum]
                    grouping['min'] = min(sta, sto, grouping['min'])
                    grouping['max'] = max(sta, sto, grouping['max'])

                    # also, set the max for the chrom
                    chrom = mybands[chrom_num]
                    chrom['max'] = max(sta, sto, chrom['max'])

                # add the parent relationships to each
                if i < len(parents) - 1:
                    mybands[pnum]['parent'] = chrom_num + parents[i + 1]
                else:
                    # add the last one (p or q usually)
                    # as attached to the chromosome
                    mybands[pnum]['parent'] = chrom_num

    # loop through the hash and add the bands to the graph
    for band_key, myband in mybands.items():
        band_class_id = makeChromID(band_key, taxon, 'CHR')
        band_class_label = makeChromLabel(band_key, genome_label)
        band_build_id = makeChromID(band_key, build_num, 'MONARCH')
        band_build_label = makeChromLabel(band_key, build_num)
        # the build-specific chrom
        chrom_in_build_id = makeChromID(
            myband['chr'], build_num, 'MONARCH')

        # if it's != part, then add the class
        if myband['type'] != Feature.types['assembly_component']:
            model.addClassToGraph(
                band_class_id, band_class_label, myband['type'])
            bfeature = Feature(
                self.graph, band_build_id, band_build_label, band_class_id)
        else:
            bfeature = Feature(
                self.graph, band_build_id, band_build_label, myband['type'])
            if 'synonym' in myband:
                model.addSynonym(band_build_id, myband['synonym'])

        if myband['parent'] is None:
            if myband['type'] == Feature.types['assembly_component']:
                # since we likely don't know the chr,
                # add it as a part of the build
                geno.addParts(band_build_id, build_id)
        elif myband['type'] == Feature.types['assembly_component']:
            # scaffold with a known chromosome: link it to that
            # chromosome in this build
            parent_chrom_in_build = makeChromID(
                myband['parent'], build_num, 'MONARCH')
            bfeature.addSubsequenceOfFeature(parent_chrom_in_build)

        # add the band as a feature
        # (which also instantiates the owl:Individual)
        bfeature.addFeatureStartLocation(myband['min'], chrom_in_build_id)
        bfeature.addFeatureEndLocation(myband['max'], chrom_in_build_id)
        if 'stain' in myband and myband['stain'] is not None:
            # TODO 'has_staining_intensity' being dropped by MB
            bfeature.addFeatureProperty(
                Feature.properties['has_staining_intensity'],
                myband['stain'])

        # type the band as a faldo:Region directly (add_region=False)
        bfeature.addFeatureToGraph(False)
def _get_chrbands(self, limit, src_key, genome_id):
    """
    Read the gzipped UCSC chromosome-band file configured for
    ``src_key`` and add its genome, reference build, chromosomes,
    scaffolds and bands to the graph.

    The file is processed in two passes.  The first pass reads the
    tab-separated rows (columns are located by name through
    ``self.files[src_key]['columns']``) and collects one entry per
    chromosome, scaffold, band and parent band in a local dict, widening
    the coordinates of parent bands as their children are seen.  The
    second pass writes every collected entry to the graph as a feature.

    :param limit: highest (1-based) line number of the file to process,
        comment lines included; ``None`` means no limit
    :param src_key: key into ``self.files``; it is also used as the
        taxon number appended to 'NCBITaxon:'
    :param genome_id: passed through to ``Genotype.addGenome``
    :return: None
    :raises KeyError: when a row carries a gieStain value that is neither
        in the known list nor a key of ``self.globaltt``

    """
    tax_num = src_key
    if limit is None:
        limit = sys.maxsize   # practical limit anyway
    model = Model(self.graph)
    line_num = 0
    myfile = '/'.join((self.rawdir, self.files[src_key]['file']))
    LOG.info("Processing Chr bands from FILE: %s", myfile)
    geno = Genotype(self.graph)
    monochrom = Monochrom(self.graph_type, self.are_bnodes_skized)

    # used to hold band definitions for a chr
    # in order to compute extent of encompassing bands
    mybands = {}

    # build the organism's genome from the taxon
    genome_label = self.files[src_key]['genome_label']
    taxon_curie = 'NCBITaxon:' + tax_num
    species_name = self.globaltcid[taxon_curie]  # for logging

    # add the taxon as a class.  adding the class label elsewhere
    model.addClassToGraph(taxon_curie, None)
    model.addSynonym(taxon_curie, genome_label)

    geno.addGenome(taxon_curie, genome_label, genome_id)

    # add the build and the taxon it's in
    build_num = self.files[src_key]['build_num']
    build_id = 'UCSC:' + build_num
    geno.addReferenceGenome(build_id, build_num, taxon_curie)

    # cat (at least) also has chr[BDAECF]<digits> chromosome names
    if tax_num == self.localtt['Felis catus']:
        # NOTE(review): the trailing empty alternative lets a bare 'chr'
        # match; kept as found, confirm whether that is intended
        placed_scaffold_pattern = r'(chr(?:[BDAECF]\d+|X|Y|Z|W|M|))'
    else:
        placed_scaffold_pattern = r'(chr(?:\d+|X|Y|Z|W|M))'
    placed_scaffold_regex = re.compile(placed_scaffold_pattern + r'$')
    # An unlocalized scaffold is "<chromosome>_<name>_random".  The regex
    # is applied with match(), which anchors at the start of the string,
    # so it has to begin with the chromosome pattern; that also supplies
    # the first of the two groups read below.
    unlocalized_scaffold_regex = re.compile(
        placed_scaffold_pattern + r'_(\w+)_random')
    unplaced_scaffold_regex = re.compile(r'chr(Un(?:_\w+)?)')

    known_stains = (
        'gneg', 'gpos25', 'gpos33', 'gpos50', 'gpos66', 'gpos75',
        'gpos100', 'acen', 'gvar', 'stalk',
    )

    # locate the columns once instead of searching the list for every row
    col = self.files[src_key]['columns']
    chrom_idx = col.index('chrom')
    start_idx = col.index('chromStart')
    stop_idx = col.index('chromEnd')
    name_idx = col.index('name')
    stain_idx = col.index('gieStain')

    # process the bands
    # (the with-statement closes the file; no explicit close() is needed)
    with gzip.open(myfile, 'rb') as binreader:
        for raw_line in binreader:
            line_num += 1
            if line_num > limit:
                # nothing past the limit is processed, so stop reading
                break
            line = raw_line.decode().strip()
            # skip comments, and blank lines (which have no columns)
            if not line or line.startswith('#'):
                continue

            # chr13	4500000	10000000	p12	stalk
            row = line.split('\t')
            scaffold = row[chrom_idx].strip()
            start = row[start_idx]
            stop = row[stop_idx]
            band_num = row[name_idx].strip()
            rtype = row[stain_idx]

            # NOTE some less-finished genomes have
            # placed and unplaced scaffolds
            # * Placed scaffolds:
            #     the scaffolds have been placed within a chromosome.
            # * Unlocalized scaffolds:
            #     although the chromosome within which the scaffold
            #     occurs is known, the scaffold's position or
            #     orientation is not known.
            # * Unplaced scaffolds:
            #     it is not known which chromosome the scaffold
            #     belongs to
            #
            # ex: unlocalized scaffold: chr10_KL568008v1_random
            # ex: unplaced scaffold: chrUn_AABR07022428v1
            chrom_num = None
            scaffold_num = None
            mch = placed_scaffold_regex.match(scaffold)
            if mch is not None:
                # the chromosome is the first match of the pattern
                chrom_num = mch.group(1)
            else:
                m_chr_unloc = unlocalized_scaffold_regex.match(scaffold)
                m_chr_unplaced = unplaced_scaffold_regex.match(scaffold)
                if m_chr_unloc is not None:
                    # chromosome known, position within it unknown
                    chrom_num = m_chr_unloc.group(1)
                    scaffold_num = chrom_num + '_' + m_chr_unloc.group(2)
                elif m_chr_unplaced is not None:
                    # chromosome unknown
                    scaffold_num = m_chr_unplaced.group(1)

            if chrom_num is not None:
                # the chrom class (generic) id
                chrom_class_id = makeChromID(chrom_num, tax_num, 'CHR')

                # first, add the chromosome class (in the taxon)
                geno.addChromosomeClass(
                    chrom_num, taxon_curie, genome_label)

                # then, add the chromosome instance (from the given build)
                geno.addChromosomeInstance(
                    chrom_num, build_id, build_num, chrom_class_id)

                # add the chr to the hashmap of coordinates for this build
                # the chromosome coordinate space is itself
                if chrom_num not in mybands:
                    mybands[chrom_num] = {
                        'min': 0,
                        'max': int(stop),
                        'chr': chrom_num,
                        'ref': build_id,
                        'parent': None,
                        'stain': None,
                        'type': self.globaltt['chromosome'],
                    }

            # Deliberately a separate "if": an unlocalized scaffold has
            # both a chromosome and a scaffold id and must be recorded
            # with that chromosome as its parent.
            if scaffold_num is not None:
                # this will put the coordinates of the scaffold
                # in the scaffold-space and make sure that the scaffold
                # is part of the correct parent.
                # if chrom_num is None, then it will attach it
                # to the genome, just like a reg chrom
                # NOTE(review): min/max are kept as the raw strings from
                # the file here, while parent bands below store ints
                mybands[scaffold_num] = {
                    'min': start,
                    'max': stop,
                    'chr': scaffold_num,
                    'ref': build_id,
                    'parent': chrom_num,
                    'stain': None,
                    'type': self.globaltt['assembly_component'],
                    'synonym': scaffold,
                }

            if chrom_num is None and scaffold_num is None:
                LOG.info(
                    '%s line %i DROPPED chromosome/scaffold  %s',
                    species_name, line_num, scaffold)

            # see if new types have shown up
            if rtype is not None and rtype not in known_stains:
                LOG.info(
                    'Unknown gieStain type "%s" in %s at %i',
                    rtype, src_key, line_num)
                # intentional bare lookup: raises KeyError for a stain
                # that the global translation table does not know
                self.globaltt[rtype]  # blow up

            if rtype == 'acen':  # hacky, revisit if ontology improves
                rtype = self.localtt[rtype]

            # parent bands of the current band, most specific first;
            # stays empty when the row defines no usable band
            parents = []
            if band_num and rtype:
                if chrom_num is None:
                    # band ids are built as chrom_num + band_num, which
                    # cannot be done without a chromosome
                    LOG.warning(
                        '%s line %i skipping band %s on %s: '
                        'no chromosome could be determined',
                        species_name, line_num, band_num, scaffold)
                else:
                    band_key = chrom_num + band_num
                    # add the specific band
                    mybands[band_key] = {
                        'min': start,
                        'max': stop,
                        'chr': chrom_num,
                        'ref': build_id,
                        'parent': None,
                        'stain': None,
                        'type': self.globaltt[rtype],
                    }

                    # get the parent bands, and make them unique.
                    # alphabetical sort will put them in smallest to
                    # biggest, so we reverse
                    parents = sorted(
                        monochrom.make_parent_bands(band_num, set()),
                        reverse=True)

                    if parents:
                        mybands[band_key]['parent'] = \
                            chrom_num + parents[0]

            # loop through the parents and add them to the dict,
            # in hierarchical order
            for i, parent in enumerate(parents):
                rti = getChrPartTypeByNotation(parent, self.graph)

                pnum = chrom_num + parent
                sta = int(start)
                sto = int(stop)
                if pnum not in mybands:
                    # add the parental band to the hash
                    mybands[pnum] = {
                        'min': min(sta, sto),
                        'max': max(sta, sto),
                        'chr': chrom_num,
                        'ref': build_id,
                        'parent': None,
                        'stain': None,
                        'type': rti,
                    }
                else:
                    # band already in the hash means it's a grouping band
                    # need to update the min/max coords
                    grouping = mybands[pnum]
                    grouping['min'] = min(sta, sto, grouping['min'])
                    grouping['max'] = max(sta, sto, grouping['max'])

                    # also, set the max for the chrom
                    chrom = mybands[chrom_num]
                    chrom['max'] = max(sta, sto, chrom['max'])

                # add the parent relationships to each
                if i < len(parents) - 1:
                    mybands[pnum]['parent'] = chrom_num + parents[i + 1]
                else:
                    # add the last one (p or q usually)
                    # as attached to the chromosome
                    mybands[pnum]['parent'] = chrom_num

    # loop through the hash and add the bands to the graph
    for band_key, myband in mybands.items():
        band_class_id = makeChromID(band_key, tax_num, 'CHR')
        band_class_label = makeChromLabel(band_key, genome_label)
        band_build_id = makeChromID(band_key, build_num, 'MONARCH')
        band_build_label = makeChromLabel(band_key, build_num)
        # the build-specific chrom
        chrom_in_build_id = makeChromID(
            myband['chr'], build_num, 'MONARCH')

        # if it's != part, then add the class
        if myband['type'] != self.globaltt['assembly_component']:
            model.addClassToGraph(
                band_class_id, band_class_label, myband['type'])
            bfeature = Feature(
                self.graph, band_build_id, band_build_label, band_class_id)
        else:
            bfeature = Feature(
                self.graph, band_build_id, band_build_label, myband['type'])
            if 'synonym' in myband:
                model.addSynonym(band_build_id, myband['synonym'])

        if myband['parent'] is None:
            if myband['type'] == self.globaltt['assembly_component']:
                # since we likely don't know the chr,
                # add it as a part of the build
                geno.addParts(band_build_id, build_id)
        elif myband['type'] == self.globaltt['assembly_component']:
            # scaffold with a known chromosome: link it to that
            # chromosome in this build
            parent_chrom_in_build = makeChromID(
                myband['parent'], build_num, 'MONARCH')
            bfeature.addSubsequenceOfFeature(parent_chrom_in_build)

        # add the band as a feature
        # (which also instantiates the owl:Individual)
        bfeature.addFeatureStartLocation(myband['min'], chrom_in_build_id)
        bfeature.addFeatureEndLocation(myband['max'], chrom_in_build_id)
        if 'stain' in myband and myband['stain'] is not None:
            bfeature.addFeatureProperty(
                self.globaltt['has_sequence_attribute'], myband['stain'])

        # type the band as a faldo:Region directly (add_region=False)
        bfeature.addFeatureToGraph(False)
def _get_chrbands(self, limit, taxon):
    """
    Read the gzipped UCSC chromosome-band file configured for ``taxon``
    and add its genome, reference build, chromosomes, scaffolds and
    bands to the graph.

    The file is processed in two passes.  The first pass reads every
    tab-separated row (scaffold, start, stop, band name, stain) and
    collects one entry per chromosome, scaffold, band and parent band in
    a local dict, widening the coordinates of parent bands as their
    children are seen.  The second pass writes every collected entry to
    the graph as a feature.

    :param limit: accepted for interface compatibility; not used here
    :param taxon: key into ``self.files``; it is also appended to
        'NCBITaxon:' to build the taxon identifier
    :return: None

    """
    model = Model(self.graph)
    # counts data rows only (comments and blank lines are not counted);
    # used for log messages
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
    logger.info("Processing Chr bands from FILE: %s", myfile)
    geno = Genotype(self.graph)
    monochrom = Monochrom(self.graph_type, self.are_bnodes_skized)

    # used to hold band definitions for a chr
    # in order to compute extent of encompassing bands
    mybands = {}

    # build the organism's genome from the taxon
    genome_label = self.files[taxon]['genome_label']
    taxon_id = 'NCBITaxon:' + taxon

    # add the taxon as a class.  adding the class label elsewhere
    model.addClassToGraph(taxon_id, None)
    model.addSynonym(taxon_id, genome_label)

    geno.addGenome(taxon_id, genome_label)

    # add the build and the taxon it's in
    build_num = self.files[taxon]['build_num']
    build_id = 'UCSC:' + build_num
    geno.addReferenceGenome(build_id, build_num, taxon_id)

    # NOTE some less-finished genomes have placed and unplaced scaffolds
    # * Placed scaffolds:
    #     the scaffolds have been placed within a chromosome.
    # * Unlocalized scaffolds:
    #     although the chromosome within which the scaffold occurs
    #     is known, the scaffold's position or orientation is not known.
    # * Unplaced scaffolds:
    #     it is not known which chromosome the scaffold belongs to
    #
    # ex: unlocalized scaffold: chr10_KL568008v1_random
    # ex: unplaced scaffold: chrUn_AABR07022428v1
    #
    # The patterns do not depend on the row, so compile them once here
    # instead of rebuilding them for every line of the file.
    placed_scaffold_pattern = r'(chr(?:\d+|X|Y|Z|W|M))'
    placed_scaffold_regex = re.compile(placed_scaffold_pattern + r'$')
    unlocalized_scaffold_regex = re.compile(
        placed_scaffold_pattern + r'_(\w+)_random')
    unplaced_scaffold_regex = re.compile(r'chr(Un(?:_\w+)?)')
    stain_regex = re.compile(r'g(neg|pos|var)')

    # process the bands
    # (the with-statement closes the file; no explicit close() is needed)
    with gzip.open(myfile, 'rb') as reader:
        for raw_line in reader:
            line = raw_line.decode().strip()
            # skip comments; also skip blank lines, which would
            # otherwise fail the five-way unpack below
            if not line or line.startswith('#'):
                continue

            # chr13	4500000	10000000	p12	stalk
            (scaffold, start, stop, band_num, rtype) = line.split('\t')
            line_counter += 1

            # find out if the thing is a full on chromosome, or a scaffold
            chrom_num = None
            scaffold_num = None
            mch = placed_scaffold_regex.match(scaffold)
            if mch is not None:
                # the chromosome is the first match of the pattern
                chrom_num = mch.group(1)
            else:
                # anything that isn't a placed scaffold is not treated
                # as a plain chromosome
                logger.info("Found non-placed chromosome %s", scaffold)
                m_chr_unloc = unlocalized_scaffold_regex.match(scaffold)
                m_chr_unplaced = unplaced_scaffold_regex.match(scaffold)
                if m_chr_unloc is not None:
                    # chromosome known, position within it unknown
                    chrom_num = m_chr_unloc.group(1)
                    scaffold_num = chrom_num + '_' + m_chr_unloc.group(2)
                elif m_chr_unplaced is not None:
                    # chromosome unknown
                    scaffold_num = m_chr_unplaced.group(1)
                else:
                    logger.error(
                        "There's a chr pattern that we aren't matching: %s",
                        scaffold)

            if chrom_num is not None:
                # the chrom class (generic) id
                chrom_class_id = makeChromID(chrom_num, taxon, 'CHR')

                # first, add the chromosome class (in the taxon)
                geno.addChromosomeClass(chrom_num, taxon_id, genome_label)

                # then, add the chromosome instance (from the given build)
                geno.addChromosomeInstance(
                    chrom_num, build_id, build_num, chrom_class_id)

                # add the chr to the hashmap of coordinates for this build
                # the chromosome coordinate space is itself
                if chrom_num not in mybands:
                    mybands[chrom_num] = {
                        'min': 0,
                        'max': int(stop),
                        'chr': chrom_num,
                        'ref': build_id,
                        'parent': None,
                        'stain': None,
                        'type': self.globaltt['chromosome'],
                    }

            if scaffold_num is not None:
                # this will put the coordinates of the scaffold
                # in the scaffold-space and make sure that the scaffold
                # is part of the correct parent.
                # if chrom_num is None, then it will attach it
                # to the genome, just like a reg chrom
                # NOTE(review): min/max are kept as the raw strings from
                # the file here, while parent bands below store ints
                mybands[scaffold_num] = {
                    'min': start,
                    'max': stop,
                    'chr': scaffold_num,
                    'ref': build_id,
                    'parent': chrom_num,
                    'stain': None,
                    'type': self.globaltt['assembly_component'],
                    'synonym': scaffold,
                }

            # parent bands of the current band, most specific first;
            # stays empty when the row defines no usable band
            parents = []
            if band_num.strip() != '':
                if chrom_num is None:
                    # band ids are built as chrom_num + band_num, which
                    # cannot be done without a chromosome
                    logger.warning(
                        "Skipping band %s on %s (data row %d): "
                        "no chromosome could be determined",
                        band_num, scaffold, line_counter)
                else:
                    band_key = chrom_num + band_num
                    # add the specific band
                    mybands[band_key] = {
                        'min': start,
                        'max': stop,
                        'chr': chrom_num,
                        'ref': build_id,
                        'parent': None,
                        'stain': None,
                        'type': None,
                    }

                    # add the staining intensity of the band
                    if stain_regex.match(rtype):
                        mybands[band_key]['stain'] = self.resolve(rtype)

                    # get the parent bands, and make them unique.
                    # alphabetical sort will put them in smallest to
                    # biggest, so we reverse
                    parents = sorted(
                        monochrom.make_parent_bands(band_num, set()),
                        reverse=True)

                    if parents:
                        mybands[band_key]['parent'] = \
                            chrom_num + parents[0]

            # loop through the parents and add them to the hash,
            # in hierarchical order
            for i, parent in enumerate(parents):
                rti = getChrPartTypeByNotation(parent)

                pnum = chrom_num + parent
                sta = int(start)
                sto = int(stop)
                if pnum not in mybands:
                    # add the parental band to the hash
                    mybands[pnum] = {
                        'min': min(sta, sto),
                        'max': max(sta, sto),
                        'chr': chrom_num,
                        'ref': build_id,
                        'parent': None,
                        'stain': None,
                        'type': rti,
                    }
                else:
                    # band already in the hash means it's a grouping band
                    # need to update the min/max coords
                    grouping = mybands[pnum]
                    grouping['min'] = min(sta, sto, grouping['min'])
                    grouping['max'] = max(sta, sto, grouping['max'])

                    # also, set the max for the chrom
                    chrom = mybands[chrom_num]
                    chrom['max'] = max(sta, sto, chrom['max'])

                # add the parent relationships to each
                if i < len(parents) - 1:
                    mybands[pnum]['parent'] = chrom_num + parents[i + 1]
                else:
                    # add the last one (p or q usually)
                    # as attached to the chromosome
                    mybands[pnum]['parent'] = chrom_num

    # loop through the hash and add the bands to the graph
    for band_key, myband in mybands.items():
        band_class_id = makeChromID(band_key, taxon, 'CHR')
        band_class_label = makeChromLabel(band_key, genome_label)
        band_build_id = makeChromID(band_key, build_num, 'MONARCH')
        band_build_label = makeChromLabel(band_key, build_num)
        # the build-specific chrom
        chrom_in_build_id = makeChromID(
            myband['chr'], build_num, 'MONARCH')

        # if it's != part, then add the class
        if myband['type'] != self.globaltt['assembly_component']:
            model.addClassToGraph(
                band_class_id, band_class_label, myband['type'])
            bfeature = Feature(
                self.graph, band_build_id, band_build_label, band_class_id)
        else:
            bfeature = Feature(
                self.graph, band_build_id, band_build_label, myband['type'])
            if 'synonym' in myband:
                model.addSynonym(band_build_id, myband['synonym'])

        if myband['parent'] is None:
            if myband['type'] == self.globaltt['assembly_component']:
                # since we likely don't know the chr,
                # add it as a part of the build
                geno.addParts(band_build_id, build_id)
        elif myband['type'] == self.globaltt['assembly_component']:
            # scaffold with a known chromosome: link it to that
            # chromosome in this build
            parent_chrom_in_build = makeChromID(
                myband['parent'], build_num, 'MONARCH')
            bfeature.addSubsequenceOfFeature(parent_chrom_in_build)

        # add the band as a feature
        # (which also instantiates the owl:Individual)
        bfeature.addFeatureStartLocation(myband['min'], chrom_in_build_id)
        bfeature.addFeatureEndLocation(myband['max'], chrom_in_build_id)
        if 'stain' in myband and myband['stain'] is not None:
            bfeature.addFeatureProperty(
                self.globaltt['has_sequence_attribute'], myband['stain'])

        # type the band as a faldo:Region directly (add_region=False)
        bfeature.addFeatureToGraph(False)