def set_format_scatterplot(axScatter, **kwargs):
    min_x, max_x = None, None
    min_y, max_y = None, None
    if kwargs['plot'] == 'blobplot':
        min_x, max_x = 0, 1
        major_xticks = MultipleLocator(0.2)
        minor_xticks = AutoMinorLocator(20)
        min_y, max_y = kwargs['min_cov']*0.1, kwargs['max_cov']+1000
        axScatter.set_yscale('log')
        axScatter.set_xscale('linear')
        axScatter.xaxis.set_major_locator(major_xticks)
        axScatter.xaxis.set_minor_locator(minor_xticks)
    elif kwargs['plot'] == 'covplot':
        min_x, max_x = kwargs['min_cov']*0.1, kwargs['max_cov']+1000
        min_y, max_y = kwargs['min_cov']*0.1, kwargs['max_cov']+1000
        axScatter.set_yscale('log')
        axScatter.set_xscale('log')
    else:
        BtLog.error('34', kwargs['plot']) # was BtLog.error('34' % kwargs['plot']); '34' has no format specifier
    axScatter.set_xlim( (min_x, max_x) )
    axScatter.set_ylim( (min_y, max_y) ) # This sets the max-Coverage so that all libraries + sum are at the same scale
    axScatter.grid(True, which="major", lw=2., color=WHITE, linestyle='-')
    axScatter.set_axisbelow(True)
    axScatter.xaxis.labelpad = 20
    axScatter.yaxis.labelpad = 20
    axScatter.yaxis.get_major_ticks()[0].label1.set_visible(False)
    axScatter.tick_params(axis='both', which='both', direction='out')
    return axScatter
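# Hedged usage sketch (not part of the original module): shows how an Axes is
# expected to pass through set_format_scatterplot(). It assumes the module-level
# matplotlib imports (pyplot, MultipleLocator, AutoMinorLocator) and the WHITE
# colour constant that the surrounding plotting code defines; the kwargs values
# below are illustrative, not library defaults.
def _demo_set_format_scatterplot():
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots()
    # blobplot: linear GC on x (0..1, major ticks every 0.2), log coverage on y
    ax = set_format_scatterplot(ax, plot='blobplot', min_cov=0.02, max_cov=1000.0)
    assert ax.get_xscale() == 'linear' and ax.get_yscale() == 'log'
    plt.close(fig)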
def parseCas(infile, order_of_blobs):
    if not isfile(infile):
        BtLog.error('0', infile)
    seqs_total, reads_total, reads_mapped = checkCas(infile)
    progress_unit = int(len(order_of_blobs)/100)
    cas_line_re = re.compile(r"\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+\.\d{2})\s+(\d+)\s+(\d+\.\d{2})") # dots escaped; unescaped '.' matches any character
    command = "clc_mapping_info -n " + infile
    cov_dict = {}
    read_cov_dict = {}
    seqs_parsed = 0
    lines = runCmd(command=command) # call runCmd once; the original called it twice, spawning clc_mapping_info twice
    if (lines):
        for line in lines:
            cas_line_match = cas_line_re.search(line)
            if cas_line_match:
                idx = int(cas_line_match.group(1)) - 1 # -1 because index of contig list starts with zero
                try:
                    name = order_of_blobs[idx]
                    reads = int(cas_line_match.group(3))
                    cov = float(cas_line_match.group(6))
                    cov_dict[name] = cov
                    read_cov_dict[name] = reads
                    seqs_parsed += 1
                except IndexError: # contig index outside the assembly
                    pass
                BtLog.progress(seqs_parsed, progress_unit, seqs_total)
    return cov_dict, reads_total, reads_mapped, read_cov_dict
def parseCovFromHeader(fasta_type, header):
    '''
    Returns the coverage from the header of a FASTA
    sequence depending on the assembly type
    '''
    ASSEMBLY_TYPES = [None, 'spades', 'velvet', 'platanus']
    if not fasta_type in ASSEMBLY_TYPES:
        BtLog.error('2', ",".join(ASSEMBLY_TYPES[1:]))
    if fasta_type == 'spades':
        spades_match_re = re.compile(r"_cov_(\d+\.*\d*)")
        return float(spades_match_re.findall(header)[0]) # a redundant re.findall() call was removed here
    elif fasta_type == 'velvet':
        return float(header.split("_")[-1])
    #elif fasta_type == 'abyss' or fasta_type == 'soap':
    #    temp = header.split(" ")
    #    return float(temp[2]/(temp[1]+1-75))
    elif fasta_type == 'platanus':
        temp = header.rstrip("\n").split("_")
        if len(temp) >= 3:
            return float(temp[2].replace("cov", "")) # scaffold/scaffoldBubble/contig
        else:
            return float(temp[1].replace("cov", "")) # gapClosed
    else:
        pass
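# Hedged examples (added for illustration): the header formats that
# parseCovFromHeader() understands. The header strings below are typical of
# each assembler's output but are invented for this sketch.
def _demo_parse_cov_from_header():
    # SPAdes encodes coverage after '_cov_'
    assert parseCovFromHeader('spades', 'NODE_1_length_600_cov_12.5') == 12.5
    # Velvet appends coverage as the last '_'-separated field
    assert parseCovFromHeader('velvet', 'NODE_1_length_600_cov_12.5') == 12.5
    # Platanus prefixes the coverage field with 'cov' (scaffold/contig form)
    assert parseCovFromHeader('platanus', 'scaffold1_len600_cov30') == 30.0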
def parseJson(infile):
    '''http://artem.krylysov.com/blog/2015/09/29/benchmark-python-json-libraries/'''
    if not isfile(infile):
        BtLog.error('0', infile)
    import time
    start = time.time()
    json_parser = ''
    with open(infile, 'r') as fh:
        print BtLog.status_d['15']
        json_string = fh.read()
    try:
        import ujson as json # fastest
        json_parser = 'ujson'
        print BtLog.status_d['16'] % json_parser
    except ImportError:
        try:
            import simplejson as json # fast
            json_parser = 'simplejson'
        except ImportError:
            import json # default
            json_parser = 'json'
        print BtLog.status_d['17'] % json_parser
    try:
        obj = json.loads(json_string.decode("ascii"))
    except ValueError:
        BtLog.error('37', infile, "BlobDB")
    data = byteify(obj)
    print BtLog.status_d['20'] % (time.time() - start)
    return data
def main():
    #main_dir = dirname(__file__)
    args = docopt(__doc__)
    fasta_f = args['--infile']
    fasta_type = args['--type']
    sam_fs = args['--sam']
    bam_fs = args['--bam']
    cov_fs = args['--cov']
    cas_fs = args['--cas']
    hit_fs = args['--hitsfile']
    prefix = args['--out']
    nodesDB_f = args['--db']
    names_f = args['--names']
    nodes_f = args['--nodes']
    taxrules = args['--taxrule']
    min_bitscore_diff = float(args['--min_diff'])
    tax_collision_random = args['--tax_collision_random']
    title = args['--title']

    # outfile
    out_f = BtIO.getOutFile("blobDB", prefix, "json")
    if not (title):
        title = out_f

    # coverage
    if not (fasta_type) and not bam_fs and not sam_fs and not cov_fs and not cas_fs:
        BtLog.error('1')
    cov_libs = [BtCore.CovLibObj('bam' + str(idx), 'bam', lib_f) for idx, lib_f in enumerate(bam_fs)] + \
               [BtCore.CovLibObj('sam' + str(idx), 'sam', lib_f) for idx, lib_f in enumerate(sam_fs)] + \
               [BtCore.CovLibObj('cas' + str(idx), 'cas', lib_f) for idx, lib_f in enumerate(cas_fs)] + \
               [BtCore.CovLibObj('cov' + str(idx), 'cov', lib_f) for idx, lib_f in enumerate(cov_fs)]

    # taxonomy
    hit_libs = [BtCore.HitLibObj('tax' + str(idx), 'tax', lib_f) for idx, lib_f in enumerate(hit_fs)]

    # Create BlobDB object
    blobDb = BtCore.BlobDb(title)
    blobDb.version = blobtools.__version__

    # Parse FASTA
    blobDb.parseFasta(fasta_f, fasta_type)

    # Parse nodesDB OR names.dmp, nodes.dmp
    nodesDB_default = join(blobtools.DATADIR, "nodesDB.txt")
    nodesDB, nodesDB_f = BtIO.parseNodesDB(nodes=nodes_f, names=names_f, nodesDB=nodesDB_f, nodesDBdefault=nodesDB_default)
    blobDb.nodesDB_f = nodesDB_f

    # Parse similarity hits
    if (hit_libs):
        blobDb.parseHits(hit_libs)
        blobDb.computeTaxonomy(taxrules, nodesDB, min_bitscore_diff, tax_collision_random)
    else:
        print BtLog.warn_d['0']

    # Parse coverage
    blobDb.parseCoverage(covLibObjs=cov_libs, no_base_cov=None)

    # Generating BlobDB and writing to file
    print BtLog.status_d['7'] % out_f
    BtIO.writeJson(blobDb.dump(), out_f)
def main():
    args = docopt(__doc__)
    bam_f = args['--bam']
    include_f = args['--include']
    exclude_f = args['--exclude']
    out_prefix = args['--out']
    gzip = args['--gzip']
    do_sort = args['--sort']
    keep_sorted = args['--keep']
    sort_threads = int(args['--threads'])

    print BtLog.status_d['22'] % bam_f
    out_f = BtIO.getOutFile(bam_f, out_prefix, None)
    if include_f and exclude_f:
        BtLog.error('43') # mutually exclusive; the original's 'print' before this call was dead code, since BtLog.error() exits
    elif include_f:
        sequence_list = BtIO.parseList(include_f)
        BtIO.parseBamForFilter(bam_f, out_f, sequence_list, None, gzip, do_sort, keep_sorted, sort_threads)
    elif exclude_f:
        sequence_list = BtIO.parseList(exclude_f)
        BtIO.parseBamForFilter(bam_f, out_f, None, sequence_list, gzip, do_sort, keep_sorted, sort_threads)
    else:
        BtIO.parseBamForFilter(bam_f, out_f, None, None, gzip, do_sort, keep_sorted, sort_threads)
def subselect_cov_libs(self, cov_lib_dict, cov_lib_selection):
    selected_cov_libs = []
    cov_lib_selection_error = 0
    if (cov_lib_selection):
        if cov_lib_selection == 'covsum':
            selected_cov_libs.append('covsum')
        elif "," in cov_lib_selection:
            selected_cov_libs = cov_lib_selection.split(",")
            if not set(selected_cov_libs).issubset(set(cov_lib_dict.keys())):
                cov_lib_selection_error = 1
        else:
            selected_cov_libs.append(cov_lib_selection)
            if not cov_lib_selection in cov_lib_dict:
                cov_lib_selection_error = 1
    else:
        selected_cov_libs = cov_lib_dict.keys()
    if cov_lib_selection_error:
        covlib_string = []
        for covlib in cov_lib_dict:
            cov_lib_f = cov_lib_dict[covlib]['f']
            if not cov_lib_f:
                cov_lib_f = "sum of coverages from all covlibs"
            covlib_string.append("\t\t%s : %s" % (covlib, cov_lib_f))
        BtLog.error('33', "\n".join(covlib_string))
    return selected_cov_libs
def parseFasta(self, fasta_f, fasta_type):
    print BtLog.status_d['1'] % ('FASTA', fasta_f)
    self.assembly_f = abspath(fasta_f)
    if (fasta_type):
        # Set up CovLibObj for coverage in assembly header
        self.covLibs[fasta_type] = CovLibObj(fasta_type, fasta_type, fasta_f)
    for name, seq in BtIO.readFasta(fasta_f):
        blObj = BlObj(name, seq)
        if not blObj.name in self.dict_of_blobs:
            self.seqs += 1
            self.length += blObj.length
            self.n_count += blObj.n_count
            if (fasta_type):
                cov = BtIO.parseCovFromHeader(fasta_type, blObj.name)
                self.covLibs[fasta_type].cov_sum += cov
                blObj.addCov(fasta_type, cov)
            self.order_of_blobs.append(blObj.name)
            self.dict_of_blobs[blObj.name] = blObj
        else:
            BtLog.error('5', blObj.name)
    if self.seqs == 0 or self.length == 0:
        BtLog.error('1')
def parseSet(infile):
    if not isfile(infile):
        BtLog.error('0', infile)
    items = set()
    with open(infile) as fh:
        for l in fh:
            items.add(l.rstrip("\n").lstrip(">"))
    return items
def parseList(infile):
    if not isfile(infile):
        BtLog.error('0', infile)
    items = []
    with open(infile) as fh:
        for l in fh:
            items.append(l.rstrip("\n"))
    return items
def parseColours(infile):
    items = {}
    if infile:
        if not isfile(infile):
            BtLog.error('0', infile)
        with open(infile) as fh:
            for l in fh:
                temp = l.rstrip("\n").split(",")
                items[temp[0]] = temp[1]
    return items
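# Hedged example of the colours CSV that parseColours() reads, one
# '<group>,<colour>' pair per line; the entries and the throwaway file name
# are invented for this sketch (it writes into the current directory).
def _demo_parse_colours():
    with open('demo.colours', 'w') as fh:
        fh.write("Proteobacteria,#4575b4\nno-hit,#d3d3d3\n")
    assert parseColours('demo.colours') == {'Proteobacteria': '#4575b4', 'no-hit': '#d3d3d3'}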
def parseBam(infile, set_of_blobs, no_base_cov_flag):
    '''
    checkBam returns reads_total and reads_mapped
    base_cov_dict holds a list of aligned lengths for each contig,
    since list appending is faster than repeated addition
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    reads_total, reads_mapped = checkBam(infile)
    progress_unit = int(reads_mapped/1000)
    base_cov_dict = {blob: [] for blob in set_of_blobs}
    read_cov_dict = {blob: 0 for blob in set_of_blobs}
    cigar_match_re = re.compile(r"(\d+)[MX=]") # digits before M, X or =; was r"(\d+)M|X|=", where bare X/= matches yielded empty groups
    # execute samtools to exclude duplicates (1024), unmapped (4) and secondary alignments (256)
    command = "samtools view -F 1024 -F 4 -F 256 " + infile
    seen_reads = 0
    if not (no_base_cov_flag):
        for line in runCmd(command=command):
            seen_reads += 1
            match = line.split()
            try:
                base_cov_dict[match[2]].append(sum([int(matching) for matching in cigar_match_re.findall(match[5])]))
                read_cov_dict[match[2]] += 1
            except KeyError: # reference name not in set_of_blobs
                print BtLog.warn_d['2'] % (match[2])
            BtLog.progress(seen_reads, progress_unit, reads_mapped)
    else:
        for line in runCmd(command=command):
            seen_reads += 1
            match = line.split()
            try:
                read_cov_dict[match[2]] += 1
            except KeyError: # reference name not in set_of_blobs
                print BtLog.warn_d['2'] % (match[2])
            BtLog.progress(seen_reads, progress_unit, reads_mapped)
    if not int(reads_mapped) == int(seen_reads):
        print BtLog.warn_d['3'] % (reads_mapped, seen_reads)
    base_cov_dict = {seq_name: sum(base_covs) for seq_name, base_covs in base_cov_dict.items()}
    return base_cov_dict, reads_total, reads_mapped, read_cov_dict
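# Hedged sketch of what the CIGAR regex extracts: only the digit runs in
# front of M/X/= operations count towards base coverage, while insertions,
# deletions and clips are ignored. The CIGAR string is illustrative.
def _demo_cigar_match():
    import re
    cigar_match_re = re.compile(r"(\d+)[MX=]")
    # 5 soft-clipped, 80 matched, 2 inserted, 15 matched -> 95 aligned bases
    assert sum(int(m) for m in cigar_match_re.findall("5S80M2I15M")) == 95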
def parseCov(infile, set_of_blobs):
    if not isfile(infile):
        BtLog.error('0', infile)
    old_cov_line_re = re.compile(r"^(\S+)\t(\d+\.*\d*)")
    cov_line_re = re.compile(r"^(\S+)\t(\d+\.*\d*)\t(\d+\.*\d*)")
    base_cov_dict = {}
    reads_total = 0
    reads_mapped = 0
    reads_unmapped = 0
    read_cov_dict = {}
    seqs_parsed = 0
    progress_unit = 1
    old_format = 1
    with open(infile) as fh:
        for line in fh:
            if line.startswith("#"):
                old_format = 0
            if old_format == 0:
                if line.startswith('#'):
                    if line.startswith("## Total Reads"):
                        reads_total = int(line.split(" = ")[1])
                    elif line.startswith("## Mapped Reads"):
                        reads_mapped = int(line.split(" = ")[1])
                    elif line.startswith("## Unmapped Reads"):
                        reads_unmapped = int(line.split(" = ")[1])
                    else:
                        pass
                else:
                    match = cov_line_re.search(line)
                    if match:
                        seqs_parsed += 1
                        name, read_cov, base_cov = match.group(1), int(match.group(2)), float(match.group(3))
                        if name not in set_of_blobs:
                            print BtLog.warn_d['2'] % (name, infile)
                        else:
                            read_cov_dict[name] = read_cov
                            base_cov_dict[name] = base_cov
            else:
                match = old_cov_line_re.search(line)
                if match:
                    seqs_parsed += 1
                    name, base_cov = match.group(1), float(match.group(2))
                    if name not in set_of_blobs:
                        print BtLog.warn_d['2'] % (name)
                    else:
                        base_cov_dict[name] = base_cov
            BtLog.progress(seqs_parsed, progress_unit, len(set_of_blobs))
    return base_cov_dict, reads_total, reads_mapped, reads_unmapped, read_cov_dict
def parseDict(infile, key, value):
    items = {}
    if infile:
        if not isfile(infile):
            BtLog.error('0', infile)
        k_idx = int(key)
        v_idx = int(value)
        with open(infile) as fh:
            for l in fh:
                temp = l.rstrip("\n").split()
                items[temp[k_idx]] = temp[v_idx]
    return items
def readFasta(infile):
    if not isfile(infile):
        BtLog.error('0', infile)
    with open(infile) as fh:
        header, seqs = '', []
        for l in fh:
            if l[0] == '>':
                if header:
                    yield header, ''.join(seqs)
                header, seqs = l[1:-1].split()[0], [] # Header is split at first whitespace
            else:
                seqs.append(l[:-1])
        if header: # guard added so an empty file does not yield an empty record
            yield header, ''.join(seqs)
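# Hedged usage sketch for the readFasta() generator; the records and the
# throwaway file name are invented (the demo writes into the current
# directory). Headers are truncated at the first whitespace.
def _demo_read_fasta():
    with open('demo.fasta', 'w') as fh:
        fh.write(">contig_1 description\nACGT\nACGT\n>contig_2\nNNNN\n")
    records = list(readFasta('demo.fasta'))
    assert records == [('contig_1', 'ACGTACGT'), ('contig_2', 'NNNN')]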
def parseCatColour(infile):
    catcolour_dict = {}
    if infile:
        if not isfile(infile):
            BtLog.error('0', infile)
        with open(infile) as fh:
            for l in fh:
                try:
                    seq_name, category = l.rstrip("\n").split(",")
                    catcolour_dict[seq_name] = category
                except ValueError: # line does not have exactly two comma-separated fields
                    BtLog.error('23', infile)
    return catcolour_dict
def plotBar(self, cov_lib, out_f):
    fig, ax_main, ax_group, x_pos_main, x_pos_group = self.setupPlot('readcov')
    ax_main_data = {'labels': [], 'values': [], 'colours': []}
    ax_group_data = {'labels': [], 'values': [], 'colours': []}
    reads_total = self.cov_libs_total_reads_dict[cov_lib]
    reads_mapped = self.stats['all']['reads_mapped'][cov_lib]
    reads_unmapped = reads_total - reads_mapped
    ax_main_data['labels'].append('Unmapped (assembly)')
    ax_main_data['values'].append(reads_unmapped/reads_total)
    ax_main_data['colours'].append(DGREY)
    ax_main_data['labels'].append('Mapped (assembly)')
    ax_main_data['values'].append(reads_mapped/reads_total)
    ax_main_data['colours'].append(DGREY)
    if (self.refcov_dict):
        if cov_lib in self.refcov_dict:
            reads_total_ref = self.refcov_dict[cov_lib]['reads_total']
            reads_mapped_ref = self.refcov_dict[cov_lib]['reads_mapped']
            reads_unmapped_ref = reads_total_ref - reads_mapped_ref
            ax_main_data['labels'].append('Unmapped (ref)')
            ax_main_data['values'].append(reads_unmapped_ref/reads_total_ref)
            ax_main_data['colours'].append(DGREY)
            ax_main_data['labels'].append('Mapped (ref)')
            ax_main_data['values'].append(reads_mapped_ref/reads_total_ref)
            ax_main_data['colours'].append(DGREY)
        else:
            BtLog.error('40', cov_lib)
    # mapped plotted groups
    for group in self.plot_order:
        ax_group_data['labels'].append(group)
        ax_group_data['values'].append(self.stats[group]['reads_mapped_perc'][cov_lib])
        ax_group_data['colours'].append(self.colours[group])
    rect_group = ax_group.bar(x_pos_group, ax_group_data['values'], width=0.5,
                              tick_label=ax_group_data['labels'], align='center', color=ax_group_data['colours'])
    for rect_g in rect_group:
        height_g = float(rect_g.get_height())
        ax_group.text(rect_g.get_x() + rect_g.get_width()/2., 0.005 + height_g, '{:.2f}%'.format(height_g*100),
                      ha='center', va='bottom', fontsize=LEGEND_FONTSIZE)
    rect_main = ax_main.bar(x_pos_main, ax_main_data['values'], width=0.5,
                            tick_label=ax_main_data['labels'], align='center', color=ax_main_data['colours'])
    for rect_m in rect_main:
        height_m = float(rect_m.get_height())
        ax_main.text(rect_m.get_x() + rect_m.get_width()/2., 0.005 + height_m, '{:.2f}%'.format(height_m*100),
                     ha='center', va='bottom', fontsize=LEGEND_FONTSIZE)
    ax_main.set_xticklabels(ax_main_data['labels'], rotation=45, ha='center', fontsize=LEGEND_FONTSIZE)
    ax_group.set_xticklabels(ax_group_data['labels'], rotation=45, ha='center', fontsize=LEGEND_FONTSIZE)
    out_f = "%s.read_cov.%s" % (out_f, cov_lib)
    print BtLog.status_d['8'] % "%s.%s" % (out_f, self.format)
    fig.tight_layout()
    fig.savefig("%s.%s" % (out_f, self.format), format=self.format)
    plt.close(fig)
def parseReferenceCov(infile):
    refcov_dict = {}
    if infile:
        if not isfile(infile):
            BtLog.error('0', infile)
        with open(infile) as fh:
            for l in fh:
                try:
                    cov_lib, reads_total_ref, reads_mapped_ref = l.split(",")
                    refcov_dict[cov_lib] = {'reads_total': int(reads_total_ref),
                                            'reads_mapped': int(reads_mapped_ref)}
                except ValueError: # line does not have exactly three comma-separated fields
                    BtLog.error('21', infile)
    return refcov_dict
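# Hedged example of the refcov CSV that parseReferenceCov() expects, one
# '<cov_lib>,<reads_total>,<reads_mapped>' line per library; the values and
# the throwaway file name are invented for this sketch.
def _demo_parse_reference_cov():
    with open('demo.refcov', 'w') as fh:
        fh.write("bam0,1000,900\n")
    assert parseReferenceCov('demo.refcov') == {'bam0': {'reads_total': 1000, 'reads_mapped': 900}}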
def parseCmdLabels(labels):
    label_d = {}
    name, groups = '', ''
    if labels:
        try:
            for label in labels:
                name, groups = str(label).split("=")
                if "," in groups:
                    for group in groups.split(","):
                        label_d[group] = name
                else:
                    label_d[groups] = name
        except ValueError: # label is not in 'name=group' format
            BtLog.error('17', labels)
    return label_d
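# Hedged example of the --label syntax parseCmdLabels() parses:
# 'name=group' or 'name=group1,group2'. The labels below are illustrative.
def _demo_parse_cmd_labels():
    label_d = parseCmdLabels(['Bacteria=Proteobacteria,Firmicutes', 'Host=Nematoda'])
    assert label_d == {'Proteobacteria': 'Bacteria', 'Firmicutes': 'Bacteria', 'Nematoda': 'Host'}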
def parseSam(infile, set_of_blobs, no_base_cov_flag):
    if not isfile(infile):
        BtLog.error('0', infile)
    base_cov_dict = {blob: [] for blob in set_of_blobs}
    read_cov_dict = {blob: 0 for blob in set_of_blobs}
    cigar_match_re = re.compile(r"(\d+)[MX=]") # digits before M, X or =; was r"(\d+)M|X|=", where bare X/= matches yielded empty groups
    reads_total = 0
    reads_mapped = 0
    if not (no_base_cov_flag):
        with open(infile) as fh:
            for line in fh:
                if line.startswith("@"): # skip header lines
                    pass
                else:
                    reads_total += 1
                    match = line.split()
                    if not match[2] == '*':
                        reads_mapped += 1
                        try:
                            base_cov_dict[match[2]].append(sum([int(matching) for matching in cigar_match_re.findall(match[5])]))
                            read_cov_dict[match[2]] += 1
                        except KeyError: # reference name not in set_of_blobs
                            print BtLog.warn_d['2'] % (match[2])
    else:
        with open(infile) as fh:
            for line in fh:
                if line.startswith("@"): # skip header lines
                    pass
                else:
                    reads_total += 1
                    match = line.split()
                    if not match[2] == '*':
                        reads_mapped += 1
                        try:
                            read_cov_dict[match[2]] += 1
                        except KeyError: # reference name not in set_of_blobs
                            print BtLog.warn_d['2'] % (match[2])
    base_cov_dict = {seq_name: sum(base_covs) for seq_name, base_covs in base_cov_dict.items()}
    return base_cov_dict, reads_total, reads_mapped, read_cov_dict
def checkCas(infile):
    print BtLog.status_d['12']
    if not isfile(infile):
        BtLog.error('0', infile)
    if not (which('clc_mapping_info')):
        BtLog.error('20')
    seqs_total_re = re.compile(r"\s+Contigs\s+(\d+)")
    reads_total_re = re.compile(r"\s+Reads\s+(\d+)")
    reads_mapped_re = re.compile(r"\s+Mapped reads\s+(\d+)\s+(\d+\.\d+)\s+\%") # dot escaped; was named reads_mapping_re
    seqs_total, reads_total, reads_mapped = 0, 0, 0
    output = ''
    command = "clc_mapping_info -s " + infile
    for line in runCmd(command=command):
        output += line
    seqs_total = int(seqs_total_re.search(output).group(1))
    reads_mapped = int(reads_mapped_re.search(output).group(1))
    reads_total = int(reads_total_re.search(output).group(1))
    print BtLog.status_d['11'] % ('{:,}'.format(reads_mapped), '{:,}'.format(reads_total),
                                  '{0:.1%}'.format(reads_mapped/reads_total))
    return seqs_total, reads_total, reads_mapped
def main():
    args = docopt(__doc__)
    fasta_f = args['--infile']
    bam_fs = args['--bam']
    cas_fs = args['--cas']
    sam_fs = args['--sam']
    prefix = args['--output']
    no_base_cov_flag = args['--no_base_cov']

    # Make covLibs
    cov_libs = [BtCore.CovLibObj('bam' + str(idx), 'bam', lib_f) for idx, lib_f in enumerate(bam_fs)] + \
               [BtCore.CovLibObj('sam' + str(idx), 'sam', lib_f) for idx, lib_f in enumerate(sam_fs)] + \
               [BtCore.CovLibObj('cas' + str(idx), 'cas', lib_f) for idx, lib_f in enumerate(cas_fs)]
    if not (cov_libs):
        BtLog.error('31')
    blobDb = BtCore.BlobDb('cov')
    blobDb.version = blobtools.__version__
    blobDb.parseFasta(fasta_f, None)
    blobDb.parseCoverage(covLibObjs=cov_libs, no_base_cov=no_base_cov_flag)
def checkBam(infile):
    print BtLog.status_d['10']
    if not isfile(infile):
        BtLog.error('0', infile)
    if not which('samtools'):
        BtLog.error('7')
    reads_mapped_re = re.compile(r"(\d+)\s\+\s\d+\smapped")
    reads_secondary_re = re.compile(r"(\d+)\s\+\s\d+\ssecondary")
    reads_supplementary_re = re.compile(r"(\d+)\s\+\s\d+\ssupplementary")
    reads_total_re = re.compile(r"(\d+)\s\+\s\d+\sin total")
    reads_total, reads_mapped = 0, 0
    output = ''
    command = "samtools flagstat " + infile
    for line in runCmd(command=command):
        output += line
    reads_mapped = int(reads_mapped_re.search(output).group(1))
    reads_secondary = int(reads_secondary_re.search(output).group(1))
    reads_supplementary = int(reads_supplementary_re.search(output).group(1))
    reads_mapped = reads_mapped - reads_secondary - reads_supplementary # was '- reads_secondary' twice
    reads_total = int(reads_total_re.search(output).group(1)) - reads_secondary - reads_supplementary
    # check whether there are reads in BAM
    if not reads_total or not reads_mapped:
        BtLog.error('29', infile) # was BtLog.error('29' % infile); '29' has no format specifier
    print BtLog.status_d['11'] % ('{:,}'.format(reads_mapped),
                                  '{:,}'.format(reads_total),
                                  '{0:.1%}'.format(reads_mapped/reads_total))
    return reads_total, reads_mapped
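# Hedged sketch of the 'samtools flagstat' lines the regexes above target;
# the counts are invented. Secondary and supplementary alignments are
# subtracted so that the totals refer to primary alignments only.
def _demo_flagstat_regexes():
    import re
    output = ("1000 + 0 in total (QC-passed reads + QC-failed reads)\n"
              "20 + 0 secondary\n"
              "5 + 0 supplementary\n"
              "900 + 0 mapped (90.00% : N/A)\n")
    assert int(re.search(r"(\d+)\s\+\s\d+\smapped", output).group(1)) == 900
    assert int(re.search(r"(\d+)\s\+\s\d+\sin total", output).group(1)) == 1000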
def readTax(infile, set_of_blobs):
    '''
    If more fields need to be parsed:
    - change hit_line_re
    - catch matches in variables
    - add as key-value pairs to hitDict
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    hit_line_re = re.compile(r"^(\S+)\s+(\d+)[\;?\d+]*\s+(\d+\.*\d*)")
    with open(infile) as fh:
        for line in fh:
            match = hit_line_re.search(line)
            if match:
                hitDict = {
                    'name': match.group(1),
                    'taxId': match.group(2), # kept as string; conversion to int is handled downstream
                    'score': float(match.group(3))
                }
                if hitDict['name'] not in set_of_blobs:
                    BtLog.error('19', hitDict['name'], infile)
                if hitDict['taxId'] == 'N/A':
                    BtLog.error('22', infile)
                yield hitDict
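# Hedged example of a taxified hits line that readTax() parses,
# '<sequence_id>\t<taxid>\t<bitscore>'; the values are illustrative.
def _demo_read_tax_line():
    import re
    hit_line_re = re.compile(r"^(\S+)\s+(\d+)[\;?\d+]*\s+(\d+\.*\d*)")
    match = hit_line_re.search("contig_1\t9606\t421.0")
    assert (match.group(1), match.group(2), float(match.group(3))) == ('contig_1', '9606', 421.0)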
def getPlotData(self, rank, min_length, hide_nohits, taxrule, c_index, catcolour_dict):
    data_dict = {}
    max_cov = 0.0
    min_cov = 1000.0
    cov_lib_dict = self.covLibs
    cov_lib_names_l = self.covLibs.keys() # does not include cov_sum
    if len(cov_lib_names_l) > 1:
        # more than one cov_lib, cov_sum_lib has to be created
        cov_lib_dict['covsum'] = CovLibObj('covsum', 'covsum', 'Sum of cov in %s' % basename(self.title)).__dict__ # ugly
        cov_lib_dict['covsum']['reads_total'] = sum([self.covLibs[x]['reads_total'] for x in self.covLibs])
        cov_lib_dict['covsum']['reads_mapped'] = sum([self.covLibs[x]['reads_mapped'] for x in self.covLibs])
        cov_lib_dict['covsum']['cov_sum'] = sum([self.covLibs[x]['cov_sum'] for x in self.covLibs])
        cov_lib_dict['covsum']['mean_cov'] = cov_lib_dict['covsum']['cov_sum']/self.seqs
    for blob in self.dict_of_blobs.values():
        name, gc, length, group = blob['name'], blob['gc'], blob['length'], ''
        if (catcolour_dict):
            # annotation with categories specified in catcolour
            group = str(catcolour_dict[name])
        elif (c_index):
            # annotation with c_index instead of taxonomic group
            if taxrule not in self.taxrules:
                BtLog.error('11', taxrule, self.taxrules)
            else:
                group = str(blob['taxonomy'][taxrule][rank]['c_index'])
        else:
            # annotation with taxonomic group
            if not (taxrule) or taxrule not in self.taxrules:
                print BtLog.warn_d['9'] % (taxrule, self.taxrules) # was missing 'print'
            if taxrule in blob['taxonomy']:
                group = str(blob['taxonomy'][taxrule][rank]['tax'])
        if not group in data_dict:
            data_dict[group] = {
                'name': [],
                'length': [],
                'gc': [],
                'covs': {covLib: [] for covLib in cov_lib_dict.keys()}, # includes cov_sum if it exists
                'reads_mapped': {covLib: 0 for covLib in cov_lib_dict.keys()}, # includes cov_sum if it exists
                'count': 0,
                'count_hidden': 0,
                'count_visible': 0,
                'span': 0,
                'span_hidden': 0,
                'span_visible': 0,
            }
        data_dict[group]['count'] = data_dict[group].get('count', 0) + 1
        data_dict[group]['span'] = data_dict[group].get('span', 0) + int(length)
        if ((hide_nohits) and group == 'no-hit') or length < min_length: # hidden
            data_dict[group]['count_hidden'] = data_dict[group].get('count_hidden', 0) + 1
            data_dict[group]['span_hidden'] = data_dict[group].get('span_hidden', 0) + int(length)
        else: # visible
            data_dict[group]['count_visible'] = data_dict[group].get('count_visible', 0) + 1
            data_dict[group]['span_visible'] = data_dict[group].get('span_visible', 0) + int(length)
            data_dict[group]['name'].append(name)
            data_dict[group]['length'].append(length)
            data_dict[group]['gc'].append(gc)
            cov_sum = 0.0
            reads_mapped_sum = 0
            for cov_lib in sorted(cov_lib_names_l):
                cov = float(blob['covs'][cov_lib])
                if cov < 0.02:
                    cov = 0.02
                if cov < min_cov:
                    min_cov = cov
                # increase max_cov
                if cov > max_cov:
                    max_cov = cov
                # add cov of blob to group
                data_dict[group]['covs'][cov_lib].append(cov)
                cov_sum += cov
                # add readcov
                if cov_lib in blob['read_cov']:
                    reads_mapped = blob['read_cov'][cov_lib]
                    data_dict[group]['reads_mapped'][cov_lib] += reads_mapped
                    reads_mapped_sum += reads_mapped
            if len(cov_lib_names_l) > 1:
                if cov_sum < 0.02:
                    cov_sum = 0.02
                data_dict[group]['covs']['covsum'].append(cov_sum)
                if cov_sum > max_cov:
                    max_cov = cov_sum
                if (reads_mapped_sum):
                    data_dict[group]['reads_mapped']['covsum'] += reads_mapped_sum
    return data_dict, min_cov, max_cov, cov_lib_dict
def plotScatter(self, cov_lib, info_flag, out_f):
    fig, axScatter, axHistx, axHisty, axLegend, top_bins, right_bins = self.setupPlot(self.plot)
    # empty handles for big legend
    legend_handles = []
    legend_labels = []
    # marker size scaled by biggest blob (size in points^2)
    max_length = max(array(self.stats['all']['length'])) # length of biggest blob
    max_marker_size = 12500 # marker size for biggest blob, i.e. area of 12500^2 pixel
    for idx, group in enumerate(self.plot_order):
        idx += 1
        lw, alpha = 0.5, 0.8
        if group == 'no-hit':
            alpha = 0.5
        group_length_array = array(self.stats[group]['length'])
        if len(group_length_array) > 0 and group not in self.exclude_groups:
            colour = self.colours[group]
            group_x_array = ''
            group_y_array = ''
            if self.plot == 'blobplot':
                group_x_array = array(self.stats[group]['gc'])
                group_y_array = array(self.stats[group]['covs'][cov_lib])
            elif self.plot == 'covplot':
                group_x_array = array(self.stats[group]['covs'][cov_lib])
                group_y_array = array([self.cov_y_dict.get(name, 0.02) for name in self.stats[group]['name']])
            else:
                BtLog.error('34', self.plot)
            marker_size_array = []
            if (self.ignore_contig_length): # no scaling
                if group == 'no-hit':
                    s = 20
                else:
                    s = 100
                marker_size_array = [s for length in group_length_array]
            else: # scaling by max_length
                marker_size_array = [(length/max_length)*max_marker_size for length in group_length_array]
            # generate label for legend
            group_span_in_mb = round(self.stats[group]['span_visible']/1000000, 2)
            group_number_of_seqs = self.stats[group]['count_visible']
            group_n50 = self.stats[group]['n50']
            fmt_seqs = "{:,}".format(group_number_of_seqs)
            fmt_span = "{:,}".format(group_span_in_mb)
            fmt_n50 = "{:,}".format(group_n50)
            label = "%s (%s;%sMB;%snt)" % (group, fmt_seqs, fmt_span, fmt_n50)
            if (info_flag):
                print BtLog.info_d['0'] % (group, fmt_seqs, fmt_span, fmt_n50)
            legend_handles.append(Line2D([0], [0], linewidth=0.5, linestyle="none", marker="o", alpha=1,
                                         markersize=24, markeredgecolor=BLACK, markerfacecolor=colour))
            legend_labels.append(label)
            weights_array = None
            if self.hist_type == "span":
                weights_array = group_length_array/1000
            axHistx.hist(group_x_array, weights=weights_array, color=colour, bins=top_bins, histtype='step', lw=3)
            axHisty.hist(group_y_array, weights=weights_array, color=colour, bins=right_bins, histtype='step', orientation='horizontal', lw=3)
            axScatter.scatter(group_x_array, group_y_array, color=colour, s=marker_size_array, lw=lw, alpha=alpha, edgecolor=BLACK, label=label)
            axLegend.axis('off')
            if (self.multiplot):
                fig_m, axScatter_m, axHistx_m, axHisty_m, axLegend_m, top_bins, right_bins = self.setupPlot(self.plot)
                legend_handles_m = []
                legend_labels_m = []
                legend_handles_m.append(Line2D([0], [0], linewidth=0.5, linestyle="none", marker="o", alpha=1,
                                               markersize=24, markeredgecolor=BLACK, markerfacecolor=colour))
                legend_labels_m.append(label)
                axHistx_m.hist(group_x_array, weights=weights_array, color=colour, bins=top_bins, histtype='step', lw=3)
                axHisty_m.hist(group_y_array, weights=weights_array, color=colour, bins=right_bins, histtype='step', orientation='horizontal', lw=3)
                axScatter_m.scatter(group_x_array, group_y_array, color=colour, s=marker_size_array, lw=lw, alpha=alpha, edgecolor=BLACK, label=label)
                axLegend_m.axis('off')
                axLegend_m.legend(legend_handles_m, legend_labels_m, loc=6, numpoints=1, fontsize=LEGEND_FONTSIZE, frameon=True)
                plot_ref_legend(axScatter_m, max_length, max_marker_size, self.ignore_contig_length)
                m_out_f = "%s.%s.%s.%s" % (out_f, cov_lib, idx, group.replace("/", "_").replace(" ", "_"))
                fig_m = plot_legend(fig_m, axLegend_m, m_out_f, self.legend_flag, self.format, self.cumulative_flag)
                print BtLog.status_d['8'] % "%s.%s" % (m_out_f, self.format)
                fig_m.savefig("%s.%s" % (m_out_f, self.format), format=self.format)
                plt.close(fig_m)
            elif (self.cumulative_flag):
                axLegend.legend(legend_handles, legend_labels, loc=6, numpoints=1, fontsize=LEGEND_FONTSIZE, frameon=True)
                plot_ref_legend(axScatter, max_length, max_marker_size, self.ignore_contig_length)
                m_out_f = "%s.%s.%s.%s" % (out_f, cov_lib, idx, group.replace("/", "_").replace(" ", "_"))
                fig.add_axes(axLegend)
                fig = plot_legend(fig, axLegend, m_out_f, self.legend_flag, self.format, self.cumulative_flag)
                if not (self.no_title):
                    fig.suptitle(out_f, fontsize=35, verticalalignment='top')
                print BtLog.status_d['8'] % "%s.%s" % (m_out_f, self.format)
                fig.savefig("%s.%s" % (m_out_f, self.format), format=self.format)
            else:
                pass
    plot_ref_legend(axScatter, max_length, max_marker_size, self.ignore_contig_length)
    axLegend.legend(legend_handles, legend_labels, numpoints=1, fontsize=LEGEND_FONTSIZE, frameon=True, loc=6)
    out_f = "%s.%s" % (out_f, cov_lib)
    fig.add_axes(axLegend)
    fig = plot_legend(fig, axLegend, out_f, self.legend_flag, self.format, self.cumulative_flag)
    if not (self.no_title):
        fig.suptitle(out_f, fontsize=35, verticalalignment='top')
    print BtLog.status_d['8'] % "%s.%s" % (out_f, self.format)
    fig.savefig("%s.%s" % (out_f, self.format), format=self.format)
    plt.close(fig)
def parseNodesDB(**kwargs):
    '''
    Parsing names.dmp and nodes.dmp into the 'nodes_db' dict of dicts that
    gets JSON'ed into blobtools/data/nodes_db.json if this file does not
    exist. nodes_db.json is used if neither "--names" and "--nodes" nor
    "--db" is specified.
    '''
    nodesDB = {}
    names_f = kwargs['names']
    nodes_f = kwargs['nodes']
    nodesDB_f = kwargs['nodesDB']
    nodesDB_default = kwargs['nodesDBdefault']
    if (nodes_f and names_f):
        if not isfile(names_f):
            BtLog.error('0', names_f)
        if not isfile(nodes_f):
            BtLog.error('0', nodes_f)
        print BtLog.status_d['3'] % (nodes_f, names_f)
        try:
            nodesDB = readNamesNodes(names_f, nodes_f)
        except Exception: # bare 'except:' narrowed so SystemExit is not swallowed
            BtLog.error('3', nodes_f, names_f)
    elif (nodesDB_f):
        if not isfile(nodesDB_f):
            BtLog.error('0', nodesDB_f)
        print BtLog.status_d['4'] % (nodesDB_f)
        try:
            nodesDB = readNodesDB(nodesDB_f)
        except Exception:
            BtLog.error('27', nodesDB_f)
    elif (nodesDB_default):
        if not isfile(nodesDB_default):
            BtLog.error('28')
        print BtLog.status_d['4'] % (nodesDB_default)
        try:
            nodesDB = readNodesDB(nodesDB_default)
        except Exception:
            BtLog.error('27', nodesDB_default)
        nodesDB_f = nodesDB_default
    # Write nodesDB if not available
    if not isfile(nodesDB_default):
        writeNodesDB(nodesDB, nodesDB_default)
    return nodesDB, nodesDB_f
def main():
    args = docopt(__doc__)
    blast_f = args['--blast']
    diamond_f = args['--diamond']
    uniref_f = args['--uniref']
    rnacentral_f = args['--rnacentral']
    swissprot_f = args['--swissprot']
    taxid = args['--taxid']
    force = args['--force']
    prefix = args['--out']
    out_f, hit_f, map_f, taxid_d = None, None, None, {}

    # Check that either blast_f OR diamond_f is specified
    if not (bool(blast_f) + bool(diamond_f) == 1):
        BtLog.error('26')
    elif blast_f:
        hit_f = blast_f
    elif diamond_f:
        hit_f = diamond_f
    else:
        pass

    # Check whether a taxID or a mapping file is supplied
    if (taxid):
        try:
            taxid = int(taxid)
        except ValueError: # was TypeError; int() of a non-numeric string raises ValueError
            BtLog.error('26')
        out_f = BtIO.getOutFile(hit_f, prefix, "tax_%s.out" % taxid)
        taxid_d = defaultdict(lambda: taxid)
    elif (bool(uniref_f) + bool(rnacentral_f) + bool(swissprot_f) == 1):
        if uniref_f:
            print BtLog.status_d['1'] % ("ID-to-taxID Mapping file", uniref_f)
            taxid_d = BtIO.parseDict(uniref_f, 0, 1)
            out_f = BtIO.getOutFile(hit_f, prefix, "uniref.out")
            map_f = uniref_f
        elif rnacentral_f:
            print BtLog.status_d['1'] % ("ID-to-taxID Mapping file", rnacentral_f)
            taxid_d = BtIO.parseDict(rnacentral_f, 0, 3)
            out_f = BtIO.getOutFile(hit_f, prefix, "rnacentral.out")
            map_f = rnacentral_f
        elif swissprot_f:
            print BtLog.status_d['1'] % ("ID-to-taxID Mapping file", swissprot_f)
            taxid_d = BtIO.parseDict(swissprot_f, 0, 1)
            out_f = BtIO.getOutFile(hit_f, prefix, "swissprot.out")
            map_f = swissprot_f
        else:
            pass
    else:
        BtLog.error('41')

    output = []
    print BtLog.status_d['1'] % ("hits file", hit_f)
    with open(hit_f) as fh:
        for idx, l in enumerate(fh):
            query_id, bitscore, tax_id, subject_id, rest = None, None, None, None, None
            line = l.rstrip("\n").split()
            query_id = line[0]
            if blast_f:
                bitscore = line[2]
                tax_id = line[1]
                subject_id = line[4]
                rest = "\t".join(line[2:])
            elif diamond_f:
                bitscore = line[11]
                subject_id = line[1]
                rest = "\t".join(line[1:])
            if swissprot_f:
                subject_id = subject_id.split("|")[1]
            if blast_f and not tax_id == "N/A" and not force:
                # do not overwrite existing taxIDs unless --force is given
                print BtLog.warn_d['10'] % (idx + 1, line[0], line[1])
                output.append("%s\t%s\t%s\t%s" % (query_id, tax_id, bitscore, rest))
            else:
                try:
                    tax_id = taxid_d[subject_id]
                except KeyError:
                    print BtLog.warn_d['12'] % (subject_id, map_f) # 'print' was missing, so the warning was a no-op expression
                    tax_id = "N/A"
                output.append("%s\t%s\t%s\t%s" % (query_id, tax_id, bitscore, rest))
    if output:
        with open(out_f, "w") as fh:
            print BtLog.status_d['24'] % out_f
            fh.write("\n".join(output))
def main():
    #print data_dir
    args = docopt(__doc__)
    blobdb_f = args['--input']
    prefix = args['--out']
    ranks = args['--rank']
    taxrule = args['--taxrule']
    hits_flag = args['--hits']
    seq_list_f = args['--list']
    concoct = args['--concoct']
    cov = args['--cov']
    notable = args['--notable']
    experimental = args['--experimental']

    # Does blobdb_f exist ?
    if not isfile(blobdb_f):
        BtLog.error('0', blobdb_f)
    out_f = BtIO.getOutFile(blobdb_f, prefix, None)

    # Are ranks sane ?
    if 'all' in ranks:
        temp_ranks = RANKS[0:-1]
        ranks = temp_ranks[::-1]
    else:
        for rank in ranks:
            if rank not in RANKS:
                BtLog.error('9', rank)

    # Does seq_list file exist?
    seqs = []
    if (seq_list_f):
        if isfile(seq_list_f):
            seqs = BtIO.parseList(seq_list_f)
        else:
            BtLog.error('0', seq_list_f)

    # Load BlobDb
    blobDb = BtCore.BlobDb('new')
    print BtLog.status_d['9'] % (blobdb_f)
    blobDb.load(blobdb_f)
    blobDb.version = blobtools.__version__

    # Is taxrule sane and was it computed?
    if (blobDb.hitLibs) and taxrule not in blobDb.taxrules:
        BtLog.error('11', taxrule, blobDb.taxrules)

    # view(s)
    viewObjs = []
    print BtLog.status_d['14']
    if not (notable):
        tableView = BtCore.ViewObj(name="table", out_f=out_f, suffix="table.txt", body=[])
        viewObjs.append(tableView)
    if (experimental):
        experimentalView = BtCore.ExperimentalViewObj(name="experimental", view_dir=out_f)
        viewObjs.append(experimentalView)
    if (concoct):
        concoctTaxView = BtCore.ViewObj(name="concoct_tax", out_f=out_f, suffix="concoct_taxonomy_info.csv", body=dict())
        viewObjs.append(concoctTaxView)
        concoctCovView = BtCore.ViewObj(name="concoct_cov", out_f=out_f, suffix="concoct_coverage_info.tsv", body=[])
        viewObjs.append(concoctCovView)
    if (cov):
        for cov_lib_name, covLibDict in blobDb.covLibs.items():
            out_f = BtIO.getOutFile(covLibDict['f'], prefix, None)
            covView = BtCore.ViewObj(name="covlib", out_f=out_f, suffix="cov", body=[])
            blobDb.view(viewObjs=[covView], ranks=None, taxrule=None, hits_flag=None, seqs=None, cov_libs=[cov_lib_name], progressbar=True)
    if (viewObjs):
        blobDb.view(viewObjs=viewObjs, ranks=ranks, taxrule=taxrule, hits_flag=hits_flag, seqs=seqs, cov_libs=[], progressbar=True)
    print BtLog.status_d['19']
def parseBamForFilter(infile, outfile, include, exclude, gzip, do_sort, keep_sorted, sort_threads):
    '''
    checkBam returns reads_total and reads_mapped
    parse BAM to extract read pairs
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    if do_sort:
        command = 'samtools sort -@ %s -n -O bam -T temp -o %s.readsorted.bam %s' % (sort_threads, infile, infile) # sort_threads was embedded literally in the original command string
        runCmd(command=command, wait=True)
        infile = "%s.readsorted.bam" % infile

    reads_total, reads_mapped = checkBam(infile)
    progress_unit = int(reads_mapped/1000)
    command = "samtools view -f 1 -F 1024 -F 256 -F 2048 %s" % infile
    seen_reads = 0
    read_pair_count, read_pair_seqs, read_pair_out_fs = init_read_pairs(outfile, include, exclude)
    used_fhs = {}
    iterator = runCmd(command=command)
    read_pair_type = None
    if include:
        sequence_to_type_dict = defaultdict(lambda: 'Ex')
        for incl in include:
            sequence_to_type_dict[incl] = 'In'
        sequence_to_type_dict['*'] = 'Un'
    elif exclude:
        sequence_to_type_dict = defaultdict(lambda: 'In')
        for excl in exclude:
            sequence_to_type_dict[excl] = 'Ex'
        sequence_to_type_dict['*'] = 'Un'
    else:
        sequence_to_type_dict = defaultdict(lambda: 'In')
        sequence_to_type_dict['*'] = 'Un'
    for l in iterator:
        read1 = l.split()
        try:
            seen_reads += 2
            read2 = next(iterator).split()
            read_pair_type = "".join(sorted([sequence_to_type_dict[read1[2]], sequence_to_type_dict[read2[2]]]))
            print_bam(read_pair_out_fs, read_pair_type, read1, read2)
            read_pair_seqs[read_pair_type] += get_read_pair_seqs(read1, read2)
            read_pair_count[read_pair_type] += 1
            BtLog.progress(seen_reads, progress_unit, reads_total)
            if seen_reads % progress_unit == 0:
                used_fhs = write_read_pair_seqs(used_fhs, read_pair_out_fs, read_pair_seqs)
                read_pair_seqs = {read_pair_type: tuple() for read_pair_type in read_pair_count}
        except StopIteration: # mate is missing, i.e. an odd number of reads
            print BtLog.warn_d['11']
    used_fhs = write_read_pair_seqs(used_fhs, read_pair_out_fs, read_pair_seqs)
    close_fhs(used_fhs)

    # info log
    info_string = []
    info_string.append(('Total pairs', "{:,}".format(int(seen_reads/2)), '{0:.1%}'.format(1.00)))
    for read_pair_type, count in read_pair_count.items():
        info_string.append((read_pair_type + ' pairs', "{:,}".format(count), '{0:.1%}'.format(count/int(seen_reads/2))))
    info_out_f = getOutFile(outfile, None, "info.txt")
    with open(info_out_f, 'w') as info_fh:
        print BtLog.status_d['24'] % info_out_f
        info_fh.write(get_table(info_string))

    # gzip
    if gzip:
        if not which('gzip'):
            BtLog.error('43')
        for out_f in used_fhs:
            print BtLog.status_d['25'] % out_f
            runCmd(command="gzip -f " + out_f, wait=True)
    if not int(reads_total) == int(seen_reads):
        print BtLog.warn_d['3'] % (reads_total, seen_reads)
    if do_sort and not keep_sorted:
        os.remove(infile)
    return 1
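# Hedged sketch of the pair-type classification used above: each mate's
# reference name maps to 'In'/'Ex'/'Un', and the sorted concatenation names
# the output category ('InIn', 'ExIn', 'InUn', ...). The contig name below
# is illustrative.
def _demo_pair_type():
    from collections import defaultdict
    sequence_to_type_dict = defaultdict(lambda: 'Ex')
    sequence_to_type_dict['contig_1'] = 'In'
    sequence_to_type_dict['*'] = 'Un' # '*' is the SAM reference name of an unmapped read
    pair = sorted([sequence_to_type_dict['contig_1'], sequence_to_type_dict['*']])
    assert "".join(pair) == 'InUn'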
def check_input(args):
    blobdb_f = args['--infile']
    rank = args['--rank']
    c_index = args['--cindex']
    min_length = int(args['--length'])
    multiplot = args['--multiplot']
    hide_nohits = args['--nohit']
    out_prefix = args['--out']
    max_group_plot = int(args['--plotgroups'])
    sort_order = args['--sort']
    sort_first = args['--sort_first']
    taxrule = args['--taxrule']
    hist_type = args['--hist']
    no_title = args['--notitle']
    ignore_contig_length = args['--noscale']
    labels = args['--label']
    colour_f = args['--colours']
    exclude_groups = args['--exclude']
    format = args['--format']
    no_plot_blobs = args['--noblobs']
    no_plot_reads = args['--noreads']
    refcov_f = args['--refcov']
    catcolour_f = args['--catcolour']
    legend_flag = args['--legend']
    cumulative_flag = args['--cumulative']
    cov_lib_selection = args['--lib']

    # Convert sort_first to a list
    if sort_first:
        args['--sort_first'] = sort_first.split(',')
    else:
        args['--sort_first'] = ()

    if 'blobplot' in args or 'covplot' in args:
        # Are ranks sane ?
        if rank not in BtTax.RANKS:
            BtLog.error('9', rank)
        # Is taxrule provided?
        if taxrule not in BtTax.TAXRULES:
            BtLog.error('8', taxrule)
        # Are sort_order and hist_type sane?
        if not sort_order in ['span', 'count']:
            BtLog.error('14', sort_order)
        if not hist_type in ['span', 'count']:
            BtLog.error('15', hist_type)
        if (catcolour_f) and (c_index):
            BtLog.error('24')
        if (cumulative_flag) and (multiplot):
            BtLog.error('32')
    return args