def readBam(infile, set_of_blobs):
    """Tally base and read coverage per sequence from the mapped reads of a BAM.

    Runs ``samtools view -F 4`` on *infile* and, for each alignment line,
    sums the lengths of all ``M`` operations of the CIGAR field.

    Args:
        infile: path of the BAM file, appended to the samtools command line.
        set_of_blobs: container of known sequence names; a read whose
            reference name is not in it triggers warning '2' and is not
            added to the coverage dicts.

    Returns:
        Tuple ``(base_cov_dict, reads_total, parsed_reads, read_cov_dict)``.
    """
    reads_total, reads_mapped = checkBam(infile)
    progress_unit = int(reads_mapped) // 1000 + 1  # +1 keeps the unit above zero
    base_cov_dict = {}
    read_cov_dict = {}
    cigar_match_re = re.compile(r"(\d+)M")  # only gets digits before M's
    # execute samtools to get only mapped reads
    command = "samtools view -F 4 " + infile
    # ADD flag picard -F 1028 to not consider optical duplicates
    # only one counter since only yields mapped reads
    parsed_reads = 0
    for line in runCmd(command):
        fields = line.split("\t")
        # Fix: the original compared the list itself to 11 instead of its
        # length, so truncated lines were never filtered out.
        if len(fields) >= 11:
            seq_name = fields[2]
            base_cov = sum(int(length)
                           for length in cigar_match_re.findall(fields[5]))
            if base_cov:
                parsed_reads += 1
                if seq_name not in set_of_blobs:
                    print(BtLog.warn_d['2'] % (seq_name, infile))
                else:
                    base_cov_dict[seq_name] = base_cov_dict.get(seq_name, 0) + base_cov
                    read_cov_dict[seq_name] = read_cov_dict.get(seq_name, 0) + 1
        BtLog.progress(parsed_reads, progress_unit, reads_total)
    BtLog.progress(reads_total, progress_unit, reads_total)
    if not int(reads_mapped) == int(parsed_reads):
        # Fix: the warning table lives on BtLog; a bare ``warn_d`` is undefined.
        print(BtLog.warn_d['3'] % (reads_mapped, parsed_reads))
    return base_cov_dict, reads_total, parsed_reads, read_cov_dict
def readCas(infile, order_of_blobs):
    """Read per-contig coverage and read counts from a CAS mapping file.

    Parses the output of ``clc_mapping_info -n`` line by line. The first
    captured number is a 1-based contig index that is translated into a
    sequence name through *order_of_blobs*.

    Args:
        infile: path of the CAS file, appended to the command line.
        order_of_blobs: sequence names in assembly order (indexable).

    Returns:
        Tuple ``(cov_dict, reads_total, reads_mapped, read_cov_dict)``.
    """
    seqs_total, reads_total, reads_mapped = checkCas(infile)
    progress_unit = len(order_of_blobs) // 100
    cas_line_re = re.compile(r"\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+.\d{2})\s+(\d+)\s+(\d+.\d{2})")
    command = "clc_mapping_info -n " + infile
    cov_dict = {}
    read_cov_dict = {}
    seqs_parsed = 0
    # Run the command once; the original called runCmd a second time just to
    # test the result for truthiness.
    output = runCmd(command)
    if output:
        for line in output:
            cas_line_match = cas_line_re.search(line)
            if cas_line_match:
                idx = int(cas_line_match.group(1)) - 1  # -1 because index of contig list starts with zero
                try:
                    name = order_of_blobs[idx]
                    reads = int(cas_line_match.group(3))
                    cov = float(cas_line_match.group(6))
                except (IndexError, ValueError):
                    # Best effort as before: skip lines whose index is out of
                    # range or whose numbers do not convert.
                    pass
                else:
                    cov_dict[name] = cov
                    read_cov_dict[name] = reads
                    seqs_parsed += 1
            BtLog.progress(seqs_parsed, progress_unit, seqs_total)
    BtLog.progress(seqs_total, progress_unit, seqs_total)
    return cov_dict, reads_total, reads_mapped, read_cov_dict
def parseFasta(self, fasta_f, fasta_type):
    """Load the assembly FASTA into this object.

    Every record becomes a BlObj that is registered in ``dict_of_blobs`` and
    ``order_of_blobs`` while ``seqs``, ``length`` and ``n_count`` are
    accumulated. When *fasta_type* is truthy, a CovLibObj keyed by
    *fasta_type* is created and coverage parsed from each header is added.
    A repeated sequence name raises error '5'; an assembly with no sequences
    or zero length raises error '1'.
    """
    print(BtLog.status_d['1'] % ('FASTA', fasta_f))
    self.assembly_f = abspath(fasta_f)
    if fasta_type:
        # Set up CovLibObj for coverage in assembly header
        self.covLibs[fasta_type] = CovLibObj(fasta_type, fasta_type, fasta_f)

    for name, seq in BtIO.readFasta(fasta_f):
        blObj = BlObj(name, seq)
        if blObj.name in self.dict_of_blobs:
            BtLog.error('5', blObj.name)
            continue
        self.seqs += 1
        self.length += blObj.length
        self.n_count += blObj.n_count
        if fasta_type:
            header_cov = BtIO.parseCovFromHeader(fasta_type, blObj.name)
            self.covLibs[fasta_type].cov_sum += header_cov
            blObj.addCov(fasta_type, header_cov)
        self.order_of_blobs.append(blObj.name)
        self.dict_of_blobs[blObj.name] = blObj

    if self.seqs == 0 or self.length == 0:
        BtLog.error('1')
def readBam(infile, fasta_headers):
    """Count reads (and optionally aligned bases) per sequence in a BAM file.

    Streams ``samtools view -q <mq> -F 256 -F 4`` over *infile*. Reads whose
    reference name is missing from *fasta_headers* only trigger warning '2'.

    Returns:
        Tuple ``(base_cov_dict, read_cov_dict, reads_total, parsed_reads)``.
    """
    reads_total, reads_mapped = checkBam(infile)
    progress_unit = int(reads_total) // 1000
    cigar_match_re = re.compile(r"(\d+)M")  # only gets digits before M's
    base_cov_dict, read_cov_dict = {}, {}
    # NOTE(review): `mq` and `no_base_cov_flag` are not parameters of this
    # function; they are looked up in the enclosing scope - confirm where
    # they are defined.
    # execute samtools to get only mapped reads from primary alignment
    command = "samtools view -q " + str(mq) + " -F 256 -F 4 " + infile
    # only one counter since only yields mapped reads
    parsed_reads = 0
    for sam_line in runCmd(command):
        fields = sam_line.split("\t")
        seq_name = fields[2]
        if seq_name in fasta_headers:
            read_cov_dict[seq_name] = read_cov_dict.get(seq_name, 0) + 1
            if not no_base_cov_flag:
                base_cov = sum(int(length)
                               for length in cigar_match_re.findall(fields[5]))
                if base_cov:
                    base_cov_dict[seq_name] = base_cov_dict.get(seq_name, 0) + base_cov
        else:
            print(BtLog.warn_d['2'] % (seq_name, infile))
        parsed_reads += 1
        BtLog.progress(parsed_reads, progress_unit, reads_total)
    BtLog.progress(reads_total, progress_unit, reads_total)
    return base_cov_dict, read_cov_dict, reads_total, parsed_reads
def parseCatColour(catcolour_f):
    """Read a two-column CSV of ``sequence_name,category`` into a dict.

    Args:
        catcolour_f: path of the file to read.

    Returns:
        Dict mapping each sequence name to its category string. A line that
        does not split into exactly two comma-separated fields raises
        error '23'.
    """
    catcolour_dict = {}
    with open(catcolour_f) as fh:
        for line in fh:
            try:
                seq_name, category = line.rstrip("\n").split(",")
            except ValueError:
                # Only a wrong field count can fail here; a bare except would
                # also hide KeyboardInterrupt and programming errors.
                BtLog.error('23', catcolour_f)
            else:
                catcolour_dict[seq_name] = category
    return catcolour_dict
def writeNodesDB(nodesDB, nodesDB_f):
    """Dump *nodesDB* to *nodesDB_f* as tab-separated text.

    The first line is a ``# nodes_count = N`` header; every key other than
    ``'nodes_count'`` is written as ``node<TAB>rank<TAB>name<TAB>parent``.
    """
    nodes_count = nodesDB['nodes_count']
    written = 0
    with open(nodesDB_f, 'w') as fh:
        fh.write("# nodes_count = %s\n" % nodes_count)
        for node in nodesDB:
            if node == "nodes_count":
                continue
            written += 1
            BtLog.progress(written, 1000, nodes_count)
            entry = nodesDB[node]
            fh.write("%s\t%s\t%s\t%s\n" % (node, entry['rank'], entry['name'], entry['parent']))
def parseRefCov(refcov_f):
    """Read ``cov_lib,reads_total,reads_mapped`` lines into a nested dict.

    Args:
        refcov_f: path of the comma-separated reference coverage file.

    Returns:
        Dict mapping each coverage library name to
        ``{'reads_total': int, 'reads_mapped': int}``. A line with the wrong
        number of fields or non-integer counts raises error '21'.
    """
    refcov_dict = {}
    with open(refcov_f) as fh:
        for line in fh:
            try:
                cov_lib, reads_total_ref, reads_mapped_ref = line.split(",")
                # int() tolerates the trailing newline left on the last field
                entry = {
                    'reads_total': int(reads_total_ref),
                    'reads_mapped': int(reads_mapped_ref)
                }
            except ValueError:
                # Covers both a wrong field count and a non-numeric value.
                BtLog.error('21', refcov_f)
            else:
                refcov_dict[cov_lib] = entry
    return refcov_dict
def computeTaxonomy(self, taxrules, nodesDB):
    """Assign a taxonomy to every blob for each requested taxrule.

    Builds the lineages for ``self.set_of_taxIds`` from *nodesDB*, stores
    them with *taxrules* on the instance, then fills ``blObj.taxonomy``:
    blobs with hits get ``BtTax.taxRule(...)``, the others ``BtTax.noHit()``.
    """
    tree_lists = BtTax.getTreeList(self.set_of_taxIds, nodesDB)
    self.lineages = BtTax.getLineages(tree_lists, nodesDB)
    self.taxrules = taxrules
    for done, blObj in enumerate(self.dict_of_blobs.values(), 1):
        BtLog.progress(done, 100, self.seqs)
        for taxrule in taxrules:
            if blObj.hits:
                assignment = BtTax.taxRule(taxrule, blObj.hits, self.lineages)
            else:
                assignment = BtTax.noHit()
            blObj.taxonomy[taxrule] = assignment
def parseCovFile(cov_f):
    """Read a two-column ``name<TAB>coverage`` file into a dict.

    Args:
        cov_f: path of the tab-separated coverage file.

    Returns:
        Dict mapping sequence name to coverage as float; values below 0.02
        are stored as 0.02. A line that does not have exactly two fields or
        whose coverage is not numeric raises error '25'.
    """
    cov_dict = {}
    with open(cov_f) as fh:
        for line in fh:
            try:
                seq_name, raw_cov = line.rstrip("\n").split("\t")
                cov = float(raw_cov)
            except ValueError:
                # Wrong field count or a non-numeric coverage value.
                BtLog.error('25', cov_f)
            else:
                cov_dict[seq_name] = 0.02 if cov < 0.02 else cov
    return cov_dict
def readNodesDB(nodesDB_f):
    """Load a nodesDB text file into a dict of dicts.

    Lines starting with ``#`` are read as the ``# nodes_count = N`` header;
    N is only used as the total for the progress display. All other lines
    must hold four tab-separated fields: node, rank, name, parent.

    Args:
        nodesDB_f: path of the nodesDB file.

    Returns:
        Dict mapping node id to ``{'rank', 'name', 'parent'}``.

    Raises:
        ValueError: if a header has no integer after ``=`` or a data line
            does not have exactly four fields.
    """
    nodesDB = {}
    nodes_count = 0
    i = 0
    with open(nodesDB_f) as fh:
        for line in fh:
            if line.startswith("#"):
                # str.lstrip() takes a set of characters, not a prefix, so
                # the original only worked because digits are not in that
                # set. Take whatever follows '=' instead.
                nodes_count = int(line.partition("=")[2])
            else:
                i += 1
                node, rank, name, parent = line.rstrip("\n").split("\t")
                nodesDB[node] = {'rank': rank, 'name': name, 'parent': parent}
                BtLog.progress(i, 1000, nodes_count)
    return nodesDB
def parse_labels(labels):
    """Turn ``name=group1,group2`` strings into a group -> name mapping.

    Args:
        labels: iterable of label specifications, or a falsy value.

    Returns:
        Dict mapping each group to the name on the left of ``=``. Empty when
        *labels* is falsy. A specification without exactly one ``=`` raises
        error '17'.
    """
    label_d = {}
    if labels:
        try:
            for label in labels:
                name, groups = str(label).split("=")
                # split(",") on a value without commas yields that value
                # alone, so no separate single-group branch is needed.
                for group in groups.split(","):
                    label_d[group] = name
        except (TypeError, ValueError):
            # ValueError: wrong number of '='; TypeError: labels not iterable.
            BtLog.error('17', labels)
    return label_d
def checkBam(infile):
    """Report total and mapped read counts of a BAM via ``samtools flagstat``.

    Raises error '7' when samtools is not found with ``which``. The counts
    are pulled from the "in total" and "mapped" lines of the flagstat output
    and printed through status message '11'.

    Args:
        infile: path of the BAM file, appended to the command line.

    Returns:
        Tuple ``(reads_total, reads_mapped)`` of ints.

    Raises:
        AttributeError: if the flagstat output lacks either expected line.
    """
    print(BtLog.status_d['10'])
    if not which('samtools'):
        BtLog.error('7')
    reads_mapped_re = re.compile(r"(\d+)\s\+\s\d+\smapped")
    reads_total_re = re.compile(r"(\d+)\s\+\s\d+\sin total")
    command = "samtools flagstat " + infile
    output = ''.join(runCmd(command))
    reads_mapped = int(reads_mapped_re.search(output).group(1))
    reads_total = int(reads_total_re.search(output).group(1))
    # Force float division (int / int truncates under Python 2) and avoid a
    # ZeroDivisionError on a BAM without reads.
    mapped_fraction = float(reads_mapped) / reads_total if reads_total else 0.0
    print(BtLog.status_d['11'] % ('{:,}'.format(reads_mapped),
                                  '{:,}'.format(reads_total),
                                  '{0:.1%}'.format(mapped_fraction)))
    return reads_total, reads_mapped
def readCov(infile, set_of_blobs):
    """Parse a COV file in either the headered or the legacy layout.

    The file is treated as legacy (``name<TAB>base_cov``) until the first
    line starting with ``#`` is seen. From then on the ``# Total Reads`` and
    ``# Mapped Reads`` headers are read and data lines are expected as
    ``name<TAB>read_cov<TAB>base_cov``. Names missing from *set_of_blobs*
    trigger warning '2' but are still stored.

    Returns:
        Tuple ``(base_cov_dict, reads_total, reads_mapped, read_cov_dict)``.
    """
    legacy_line_re = re.compile(r"^(\S+)\t(\d+\.*\d*)")
    headered_line_re = re.compile(r"^(\S+)\t(\d+\.*\d*)\t(\d+\.*\d*)")
    skipped_headers = ("# Unmapped Reads", "# Parameters", "# contig_id")
    base_cov_dict, read_cov_dict = {}, {}
    reads_total = reads_mapped = 0
    seqs_parsed = 0
    progress_unit = 1
    legacy_format = True
    with open(infile) as fh:
        for line in fh:
            if line.startswith("#"):
                legacy_format = False
            if legacy_format:
                hit = legacy_line_re.search(line)
                if hit:
                    seqs_parsed += 1
                    name, base_cov = hit.group(1), float(hit.group(2))
                    if name not in set_of_blobs:
                        print(BtLog.warn_d['2'] % (name, infile))
                    base_cov_dict[name] = base_cov
            elif line.startswith("# Total Reads"):
                reads_total = int(line.split(" = ")[1])
            elif line.startswith("# Mapped Reads"):
                reads_mapped = int(line.split(" = ")[1])
            elif not line.startswith(skipped_headers):
                hit = headered_line_re.search(line)
                if hit:
                    seqs_parsed += 1
                    name, read_cov, base_cov = hit.group(1), int(hit.group(2)), float(hit.group(3))
                    if name not in set_of_blobs:
                        print(BtLog.warn_d['2'] % (name, infile))
                    read_cov_dict[name] = read_cov
                    base_cov_dict[name] = base_cov
            BtLog.progress(seqs_parsed, progress_unit, len(set_of_blobs))
    return base_cov_dict, reads_total, reads_mapped, read_cov_dict
def checkCas(infile):
    """Report contig and read counts of a CAS file via ``clc_mapping_info -s``.

    Raises error '20' when clc_mapping_info is not found with ``which``. The
    counts are pulled from the "Contigs", "Reads" and "Mapped reads" lines
    of the summary and printed through status message '11'.

    Args:
        infile: path of the CAS file, appended to the command line.

    Returns:
        Tuple ``(seqs_total, reads_total, reads_mapped)`` of ints.

    Raises:
        AttributeError: if the summary lacks one of the expected lines.
    """
    print(BtLog.status_d['12'])
    if not which('clc_mapping_info'):
        BtLog.error('20')
    seqs_total_re = re.compile(r"\s+Contigs\s+(\d+)")
    reads_total_re = re.compile(r"\s+Reads\s+(\d+)")
    reads_mapping_re = re.compile(r"\s+Mapped reads\s+(\d+)\s+(\d+.\d+)\s+\%")
    command = "clc_mapping_info -s " + infile
    output = ''.join(runCmd(command))
    seqs_total = int(seqs_total_re.search(output).group(1))
    reads_mapped = int(reads_mapping_re.search(output).group(1))
    reads_total = int(reads_total_re.search(output).group(1))
    # Force float division (int / int truncates under Python 2) and avoid a
    # ZeroDivisionError when no reads are reported.
    mapped_fraction = float(reads_mapped) / reads_total if reads_total else 0.0
    print(BtLog.status_d['11'] % ('{:,}'.format(reads_mapped),
                                  '{:,}'.format(reads_total),
                                  '{0:.1%}'.format(mapped_fraction)))
    return seqs_total, reads_total, reads_mapped
def readTax(infile, set_of_blobs):
    '''
    Yield one hit dict (keys 'name', 'taxId', 'score') per matching line.

    If more fields need to be parsed:
        - change hit_line_re
        - catch matches in variables
        - add as key-value pairs to hitDict
    '''
    hit_line_re = re.compile(r"^(\S+)\s+(\d+)[\;?\d+]*\s+(\d+\.*\d*)")  # TEST TEST , if not split it afterwards
    with open(infile) as fh:
        for line in fh:
            hit = hit_line_re.search(line)
            if not hit:
                continue
            name, tax_id, score = hit.group(1), hit.group(2), hit.group(3)
            hitDict = {
                'name': name,
                'taxId': tax_id,  # string because if int, conversion is a nightmare ...
                'score': float(score)
            }
            if name not in set_of_blobs:
                BtLog.error('19', name, infile)
            # NOTE(review): the pattern captures digits only for taxId, so
            # this comparison cannot be true for a matched line.
            if tax_id == 'N/A':
                BtLog.error('22', infile)
            yield hitDict
def parseCovFile(cov_f):
    """Read base coverage per sequence from a COV file of either layout.

    Until a line starting with ``#`` is seen the file is read as
    ``name<TAB>coverage``; afterwards coverage is taken from the third
    column (``name<TAB>read_cov<TAB>base_cov``). Header lines themselves
    carry no data.

    Args:
        cov_f: path of the tab-separated coverage file.

    Returns:
        Dict mapping sequence name to coverage as float; values below 0.02
        are stored as 0.02. A line with too few columns or a non-numeric
        coverage raises error '25'.
    """
    cov_dict = {}
    old_format = True
    with open(cov_f) as fh:
        for line in fh:
            if line.startswith("#"):
                old_format = False
                continue
            try:
                field = line.rstrip("\n").split("\t")
                cov = float(field[1] if old_format else field[2])
            except (IndexError, ValueError):
                # Missing column or non-numeric coverage value.
                BtLog.error('25', cov_f)
            else:
                cov_dict[field[0]] = 0.02 if cov < 0.02 else cov
    return cov_dict
def getNodesDB(**kwargs):
    '''
    Build the nodes dict of dicts, either from nodes.dmp and names.dmp
    (kwargs 'nodes' and 'names') or from an existing nodesDB file
    (kwarg 'nodesDB'). Raises error '3' when neither source is given.
    Returns the tuple (nodesDB, nodesDB_f).
    '''
    nodesDB = {}
    nodesDB_f = ''
    if kwargs['names'] and kwargs['nodes']:
        print(BtLog.status_d['3'] % (kwargs['nodes'], kwargs['names']))
        nodesDB = {}
        nodes_count = 0
        with open(kwargs['nodes']) as nodes_fh:
            for line in nodes_fh:
                cols = line.split("\t")
                node_id = cols[0]
                nodesDB[node_id] = {'parent': cols[2], 'rank': cols[4]}
                nodes_count += 1
        with open(kwargs['names']) as names_fh:
            for line in names_fh:
                cols = line.split("\t")
                if cols[6] == "scientific name":
                    nodesDB[cols[0]]['name'] = cols[2]
        nodesDB_f = kwargs['nodesDB']
        nodesDB['nodes_count'] = nodes_count
    elif kwargs['nodesDB']:
        print(BtLog.status_d['4'] % (kwargs['nodesDB']))
        nodesDB = readNodesDB(kwargs['nodesDB'])
        nodesDB_f = kwargs['nodesDB']
    else:
        BtLog.error('3')
    return nodesDB, nodesDB_f
def readCov(infile, set_of_blobs):
    """Parse a two-column ``name<TAB>coverage`` file into a dict.

    Names missing from *set_of_blobs* trigger warning '2' but are still
    stored. Lines that do not match the pattern are ignored.

    Returns:
        Dict mapping sequence name to coverage as float.
    """
    cov_line_re = re.compile(r"^(\S+)\t(\d+\.*\d*)")
    total_seqs = len(set_of_blobs)
    progress_unit = total_seqs // 100
    cov_dict = {}
    seqs_parsed = 0
    with open(infile) as fh:
        for line in fh:
            BtLog.progress(seqs_parsed, 10, len(set_of_blobs))
            hit = cov_line_re.search(line)
            if hit:
                seqs_parsed += 1
                name, cov = hit.group(1), float(hit.group(2))
                if name not in set_of_blobs:
                    print(BtLog.warn_d['2'] % (name, infile))
                cov_dict[name] = cov
            BtLog.progress(seqs_parsed, progress_unit, len(set_of_blobs))
    BtLog.progress(total_seqs, progress_unit, total_seqs)
    return cov_dict
def parseNodesDB(**kwargs):
    '''
    Parsing names.dmp and nodes.dmp into the 'nodes_db' dict of dicts that
    gets JSON'ed into blobtools/data/nodes_db.json if this file
    does not exist. Nodes_db.json is used if neither "--names" and "--nodes"
    nor "--db" is specified. If all three are specified and "--db" does not
    exist, then write 'nodes_db' to file specified by "--db". If all three
    are specified and "--db" exists, error out.

    Keyword arguments read: 'names', 'nodes', 'nodesDB', 'nodesDBdefault'.
    Returns the tuple (nodesDB, nodesDB_f).
    '''
    nodesDB = {}
    names_f = kwargs['names']
    nodes_f = kwargs['nodes']
    nodesDB_f = kwargs['nodesDB']
    nodesDB_default = kwargs['nodesDBdefault']
    if nodes_f and names_f:
        if not isfile(names_f):
            BtLog.error('0', names_f)
        if not isfile(nodes_f):
            BtLog.error('0', nodes_f)
        if nodesDB_f:
            if isfile(nodesDB_f):
                BtLog.error('47', nodesDB_f)
            # Fix: the message was formatted but never printed.
            print(BtLog.status_d['27'] % (nodesDB_f, nodes_f, names_f))
        else:
            print(BtLog.status_d['3'] % (nodes_f, names_f))
        try:
            nodesDB = readNamesNodes(names_f, nodes_f)
        except Exception:
            # Any parsing failure is reported as error '3'; unlike a bare
            # except, SystemExit and KeyboardInterrupt still propagate.
            BtLog.error('3', nodes_f, names_f)
    elif nodesDB_f:
        if not isfile(nodesDB_f):
            BtLog.error('0', nodesDB_f)
        print(BtLog.status_d['4'] % (nodesDB_f))
        try:
            nodesDB = readNodesDB(nodesDB_f)
        except Exception:
            BtLog.error('27', nodesDB_f)
    elif nodesDB_default:
        if not isfile(nodesDB_default):
            BtLog.error('28')
        print(BtLog.status_d['4'] % (nodesDB_default))
        try:
            nodesDB = readNodesDB(nodesDB_default)
        except Exception:
            BtLog.error('27', nodesDB_default)
    # Write nodesDB if names, nodes, nodesDB all given and nodesDB does not
    # exist. Otherwise, write to nodesDB_default if it does not exist, unless
    # nodesDB given, then do nothing with nodesDB_default.
    if nodes_f and names_f and nodesDB_f:
        print(BtLog.status_d['28'] % nodesDB_f)
        writeNodesDB(nodesDB, nodesDB_f)
    elif not nodesDB_f and not isfile(nodesDB_default):
        nodesDB_f = nodesDB_default
        print(BtLog.status_d['5'] % nodesDB_f)
        writeNodesDB(nodesDB, nodesDB_f)
    return nodesDB, nodesDB_f
def main():
    """Command-line entry point that writes the requested views of a BlobDB.

    Reads the docopt arguments, validates the input file, ranks and optional
    sequence list, loads the BlobDB and then emits the table, experimental,
    concoct and per-library coverage views that were asked for.
    """
    args = docopt(__doc__)
    blobdb_f = args['--input']
    prefix = args['--out']
    ranks = args['--rank']
    taxrule = args['--taxrule']
    hits_flag = args['--hits']
    seq_list_f = args['--list']
    concoct = args['--concoct']
    cov = args['--cov']
    notable = args['--notable']
    experimental = args['--experimental']

    # Does blobdb_f exist ?
    if not isfile(blobdb_f):
        BtLog.error('0', blobdb_f)

    out_f = BtIO.getOutFile(blobdb_f, prefix, None)

    # Are ranks sane ?
    if 'all' in ranks:
        ranks = RANKS[0:-1][::-1]
    else:
        for rank in ranks:
            if rank not in RANKS:
                BtLog.error('9', rank)

    # Does seq_list file exist?
    seqs = []
    if seq_list_f:
        if not isfile(seq_list_f):
            BtLog.error('0', seq_list_f)
        else:
            seqs = BtIO.parseList(seq_list_f)

    # Load BlobDb
    blobDb = BtCore.BlobDb('new')
    print(BtLog.status_d['9'] % (blobdb_f))
    blobDb.load(blobdb_f)
    blobDb.version = blobtools.__version__

    # Is taxrule sane and was it computed?
    if (blobDb.hitLibs) and taxrule not in blobDb.taxrules:
        BtLog.error('11', taxrule, blobDb.taxrules)

    def view_suffix(base):
        # With more than one hit library the taxrule is part of the file name.
        if len(blobDb.hitLibs) > 1:
            return "%s.%s" % (taxrule, base)
        return base

    # view(s)
    viewObjs = []
    print(BtLog.status_d['14'])
    if not notable:
        viewObjs.append(BtCore.ViewObj(name="table", out_f=out_f,
                                       suffix=view_suffix("table.txt"), body=[]))
    if experimental:
        meta = {}
        if isfile(experimental):
            meta = BtIO.readYaml(experimental)
        viewObjs.append(BtCore.ExperimentalViewObj(name="experimental", view_dir=out_f,
                                                   blobDb=blobDb, meta=meta))
    if concoct:
        viewObjs.append(BtCore.ViewObj(name="concoct_tax", out_f=out_f,
                                       suffix=view_suffix("concoct_taxonomy_info.csv"),
                                       body=dict()))
        viewObjs.append(BtCore.ViewObj(name="concoct_cov", out_f=out_f,
                                       suffix=view_suffix("concoct_coverage_info.tsv"),
                                       body=[]))
    if cov:
        for cov_lib_name, covLibDict in blobDb.covLibs.items():
            out_f = BtIO.getOutFile(covLibDict['f'], prefix, None)
            covView = BtCore.ViewObj(name="covlib", out_f=out_f, suffix="cov", body=[])
            blobDb.view(viewObjs=[covView], ranks=None, taxrule=None, hits_flag=None,
                        seqs=None, cov_libs=[cov_lib_name], progressbar=True)
    if viewObjs:
        blobDb.view(viewObjs=viewObjs, ranks=ranks, taxrule=taxrule, hits_flag=hits_flag,
                    seqs=seqs, cov_libs=[], progressbar=True)
    print(BtLog.status_d['19'])
for name in readFasta(infile): fasta_order.append(name) fasta_dict[name] = 0.0 return fasta_dict, fasta_order if __name__ == '__main__': main_dir = dirname(__file__) #print data_dir args = docopt(__doc__) assembly_f = args['--infile'] cov_fs = args['--cov'] fasta_dict = {} fasta_order = [] if not isfile(assembly_f): BtLog.error('0', assembly_f) else: fasta_dict, fasta_order = parseFasta(assembly_f) for cov_f in cov_fs: if not isfile(cov_f): BtLog.error('0', cov_f) else: lib_cov_dict = BtPlot.parseCovFile(cov_f) for name in fasta_order: fasta_dict[name] = fasta_dict.get(name, 0.0) + lib_cov_dict[name] for name in fasta_order: print "%s\t%s" % (name, fasta_dict[name])
def main():
    """Command-line entry point that adds a taxID column to a hit file.

    Each line of the hit file is split on whitespace; the query id, subject
    id and score are taken from the configured columns. The taxID is either
    the custom taxid given on the command line or looked up from the mapping
    file ("N/A" when the subject id is missing there). Output lines are
    ``qseqid<TAB>tax_id<TAB>score<TAB>sseqid``.
    """
    args = docopt(__doc__)
    out_f, taxid_d = None, {}
    hit_f = args['--hit_file']
    hit_col_qseqid = args['--hit_column_qseqid']
    hit_col_sseqid = args['--hit_column_sseqid']
    hit_col_score = args['--hit_column_score']
    map_f = args['--taxid_mapping_file']
    map_col_sseqid = args['--map_col_sseqid']
    map_col_taxid = args['--map_col_taxid']
    custom_taxid = args['--custom_taxid']
    prefix = args['--out']

    try:
        hit_col_qseqid = int(hit_col_qseqid)
        hit_col_sseqid = int(hit_col_sseqid)
        hit_col_score = int(hit_col_score)
    except ValueError:
        # Fix: the original applied '%' to the literal code string '41',
        # which raises TypeError instead of reporting the error.
        # NOTE(review): called without arguments like the other '41' call
        # below; confirm whether template '41' accepts the column names.
        BtLog.error('41')

    # Remember the choice before conversion so a taxid of 0 still counts.
    use_custom_taxid = bool(custom_taxid)
    if use_custom_taxid:
        try:
            custom_taxid = int(custom_taxid)
        except (TypeError, ValueError):
            # int() raises ValueError for a non-numeric string; the original
            # caught TypeError only.
            BtLog.error('26')
        out_f = BtIO.getOutFile(hit_f, prefix, "taxID_%s.out" % custom_taxid)
        taxid_d = defaultdict(lambda: custom_taxid)
    elif map_f:
        if map_col_sseqid and map_col_taxid:
            try:
                map_col_sseqid = int(map_col_sseqid)
                map_col_taxid = int(map_col_taxid)
            except ValueError:
                BtLog.error('44')
            print(BtLog.status_d['1'] % ("Mapping file", map_f))
            taxid_d = BtIO.parseDict(map_f, map_col_sseqid, map_col_taxid)
            out_f = BtIO.getOutFile(hit_f, prefix, "taxified.out")
        else:
            BtLog.error('44')
    else:
        BtLog.error('41')

    output = []
    print(BtLog.status_d['1'] % ("similarity search result", hit_f))
    with open(hit_f) as fh:
        for line in fh:
            col = line.rstrip("\n").split()
            qseqid = col[hit_col_qseqid]
            sseqid = col[hit_col_sseqid]
            score = col[hit_col_score]
            if use_custom_taxid:
                tax_id = taxid_d[sseqid]
            else:
                if sseqid not in taxid_d:
                    # Fix: the warning was formatted but never printed.
                    print(BtLog.warn_d['12'] % (sseqid, map_f))
                tax_id = taxid_d.get(sseqid, "N/A")
            output.append("%s\t%s\t%s\t%s" % (qseqid, tax_id, score, sseqid))
    if output:
        with open(out_f, "w") as fh:
            print(BtLog.status_d['24'] % out_f)
            fh.write("\n".join(output) + "\n")
def getPlotData(self, rank, min_length, hide_nohits, taxrule, c_index, catcolour_dict):
    """Group all blobs and collect the per-group data used for plotting.

    Each blob is assigned to a group: its category from *catcolour_dict* if
    that is given, else its c_index for *taxrule*/*rank* if *c_index* is
    set, else its taxon for *taxrule*/*rank*. A blob is hidden when
    *hide_nohits* is set and its group is 'no-hit', or when it is shorter
    than *min_length*; hidden blobs only contribute to the count/span
    tallies.

    Returns:
        Tuple ``(data_dict, min_cov, max_cov, cov_lib_dict)`` where
        ``cov_lib_dict`` is ``self.covLibs`` itself (with a 'covsum' entry
        added when more than one coverage library exists).
    """
    data_dict = {}
    read_cov_dict = {}  # never used below
    max_cov = 0.0
    min_cov = 1000.0
    # Alias, not a copy: adding 'covsum' below also adds it to self.covLibs.
    cov_lib_dict = self.covLibs
    # NOTE(review): this excludes 'covsum' only if keys() returns a snapshot
    # and 'covsum' is not already in self.covLibs from an earlier call -
    # confirm.
    cov_lib_names_l = self.covLibs.keys()  # does not include cov_sum
    if len(cov_lib_names_l) > 1:
        # more than one cov_lib, cov_sum_lib has to be created
        cov_lib_dict['covsum'] = CovLibObj(
            'covsum', 'covsum',
            'Sum of cov in %s' % basename(self.title)).__dict__  # ugly
        # These sums iterate self.covLibs, which at this point already holds
        # the freshly created 'covsum' entry.
        cov_lib_dict['covsum']['reads_total'] = sum(
            [self.covLibs[x]['reads_total'] for x in self.covLibs])
        cov_lib_dict['covsum']['reads_mapped'] = sum(
            [self.covLibs[x]['reads_mapped'] for x in self.covLibs])
        cov_lib_dict['covsum']['cov_sum'] = sum(
            [self.covLibs[x]['cov_sum'] for x in self.covLibs])
        cov_lib_dict['covsum'][
            'mean_cov'] = cov_lib_dict['covsum']['cov_sum'] / self.seqs
    for blob in self.dict_of_blobs.values():
        name, gc, length, group = blob['name'], blob['gc'], blob[
            'length'], ''
        if (catcolour_dict
                ):  # annotation with categories specified in catcolour
            group = str(catcolour_dict[name])
        elif (c_index
              ):  # annotation with c_index instead of taxonomic group
            if taxrule not in self.taxrules:
                BtLog.error('11', taxrule, self.taxrules)
            else:
                group = str(blob['taxonomy'][taxrule][rank]['c_index'])
        else:  # annotation with taxonomic group
            if not (taxrule) or taxrule not in self.taxrules:
                # NOTE(review): this expression only formats the warning; it
                # is neither printed nor stored.
                BtLog.warn_d['9'] % (taxrule, self.taxrules)
            if taxrule in blob['taxonomy']:
                group = str(blob['taxonomy'][taxrule][rank]['tax'])
        if not group in data_dict:
            # First blob of this group: create its empty record.
            data_dict[group] = {
                'name': [],
                'length': [],
                'gc': [],
                'covs': {covLib: []
                         for covLib in cov_lib_dict.keys()
                         },  # includes cov_sum if it exists
                'reads_mapped': {covLib: 0
                                 for covLib in cov_lib_dict.keys()
                                 },  # includes cov_sum if it exists
                'count': 0,
                'count_hidden': 0,
                'count_visible': 0,
                'span': 0,
                'span_hidden': 0,
                'span_visible': 0,
            }
        data_dict[group]['count'] = data_dict[group].get('count', 0) + 1
        data_dict[group]['span'] = data_dict[group].get('span',
                                                        0) + int(length)
        if ((hide_nohits)
                and group == 'no-hit') or length < min_length:  # hidden
            data_dict[group]['count_hidden'] = data_dict[group].get(
                'count_hidden', 0) + 1
            data_dict[group]['span_hidden'] = data_dict[group].get(
                'span_hidden', 0) + int(length)
        else:  # visible
            data_dict[group]['count_visible'] = data_dict[group].get(
                'count_visible', 0) + 1
            data_dict[group]['span_visible'] = data_dict[group].get(
                'span_visible', 0) + int(length)
            data_dict[group]['name'].append(name)
            data_dict[group]['length'].append(length)
            data_dict[group]['gc'].append(gc)
            cov_sum = 0.0
            reads_mapped_sum = 0
            for cov_lib in sorted(cov_lib_names_l):
                cov = float(blob['covs'][cov_lib])
                # Coverage is floored at 0.1 before it is recorded.
                if cov < 0.1:
                    cov = 0.1
                if cov < min_cov:
                    min_cov = cov
                # increase max_cov
                if cov > max_cov:
                    max_cov = cov
                # add cov of blob to group
                data_dict[group]['covs'][cov_lib].append(cov)
                cov_sum += cov
                # add readcov
                if cov_lib in blob['read_cov']:
                    reads_mapped = blob['read_cov'][cov_lib]
                    data_dict[group]['reads_mapped'][
                        cov_lib] += reads_mapped
                    reads_mapped_sum += reads_mapped
            if len(cov_lib_names_l) > 1:
                if cov_sum <= 0.1 * len(
                        cov_lib_names_l):  # puts no-cov contigs at 0.1
                    cov_sum = 0.1
                data_dict[group]['covs']['covsum'].append(cov_sum)
                if cov_sum > max_cov:
                    max_cov = cov_sum
                if (reads_mapped_sum):
                    data_dict[group]['reads_mapped'][
                        'covsum'] += reads_mapped_sum
    return data_dict, min_cov, max_cov, cov_lib_dict
def view(self, **kwargs):
    """Fill the given view objects from the blobs and write them out.

    Keyword arguments (all required):
        viewObjs: view objects; each is handled according to its ``name``
            ('table', 'concoct_cov', 'concoct_tax', 'covlib', 'experimental').
        ranks: ranks passed to the table and concoct taxonomy views.
        taxrule: taxrule passed to the table and concoct taxonomy views.
        hits_flag: passed through to the table header and lines.
        seqs: sequence names to include; all of ``order_of_blobs`` if falsy.
        cov_libs: coverage library names; all of ``covLibs`` if falsy.
        progressbar: when truthy, progress is reported through BtLog.

    After all bodies are filled, ``output()`` is called on every view.
    """
    # arguments
    viewObjs = kwargs['viewObjs']
    ranks = kwargs['ranks']
    taxrule = kwargs['taxrule']
    hits_flag = kwargs['hits_flag']
    seqs = kwargs['seqs']
    cov_libs = kwargs['cov_libs']
    progress_bar = kwargs['progressbar']
    # Default sequences if no subset
    if not seqs:
        seqs = self.order_of_blobs
    # Default cov_libs if no subset
    cov_lib_names = cov_libs
    if not cov_libs:
        cov_lib_names = [covLib for covLib in self.covLibs]
    tax_lib_names = [taxLib for taxLib in sorted(self.hitLibs)]
    lineages = self.lineages
    # setup
    for viewObj in viewObjs:
        if viewObj.name == 'table':
            viewObj.header = self.getTableHeader(taxrule, ranks, hits_flag, cov_lib_names)
        if viewObj.name == 'concoct_cov':
            viewObj.header = self.getConcoctCovHeader(cov_lib_names)
        if viewObj.name == 'covlib':
            viewObj.header = self.getCovHeader(cov_lib_names)
        if viewObj.name == 'experimental':
            viewObj.covs = {cov_lib: [] for cov_lib in cov_lib_names}
            viewObj.covs["covsum"] = []
            # Fix: the original looped with the name `taxrule`, overwriting
            # the requested taxrule that the other views use afterwards.
            for known_rule in self.taxrules:
                viewObj.tax[known_rule] = {rank: [] for rank in BtTax.RANKS}
    # bodies
    for i, seq in enumerate(seqs):
        if progress_bar:
            BtLog.progress(i, 1000, len(seqs))
        blob = self.dict_of_blobs[seq]
        for viewObj in viewObjs:
            if viewObj.name == 'table':
                viewObj.body.append(
                    self.getTableLine(blob, taxrule, ranks, hits_flag,
                                      cov_lib_names, tax_lib_names, lineages))
            if viewObj.name == 'concoct_cov':
                viewObj.body.append(self.getConcoctCovLine(blob, cov_lib_names))
            if viewObj.name == 'experimental':
                viewObj.names.append(blob['name'])
                viewObj.gc.append(blob['gc'])
                viewObj.length.append(blob['length'])
                cov_sum = 0.0
                for cov_lib in blob['covs']:
                    viewObj.covs[cov_lib].append(blob['covs'][cov_lib])
                    cov_sum += blob['covs'][cov_lib]
                viewObj.covs['covsum'].append(cov_sum)
                # Own loop names here as well, for the same reason as above.
                for blob_rule in blob['taxonomy']:
                    for blob_rank in blob['taxonomy'][blob_rule]:
                        viewObj.tax[blob_rule][blob_rank].append(
                            blob['taxonomy'][blob_rule][blob_rank]['tax'])
            if viewObj.name == 'concoct_tax':
                for rank in ranks:
                    if rank not in viewObj.body:
                        viewObj.body[rank] = []
                    viewObj.body[rank].append(
                        self.getConcoctTaxLine(blob, rank, taxrule))
            if viewObj.name == 'covlib':
                viewObj.body.append(self.getCovLine(blob, cov_lib_names))
    if progress_bar:
        BtLog.progress(len(seqs), 1000, len(seqs))
    for viewObj in viewObjs:
        viewObj.output()
def validate_input_create(main_dir, args):
    '''
    Accepts:
        - main_dir
        - docopt args
    Returns:
        - title
        - fasta_f
        - fasta_type
        - cov_libs
        - hit_libs
        - taxrules
        - nodesDB_f
        - nodes_f
        - names_f
        - out_f
    '''
    ASSEMBLY_TYPES = [None, 'spades', 'soap', 'abyss', 'velvet']

    fasta_f = args['--infile']
    fasta_type = args['--type']
    sam_fs = args['--sam']
    bam_fs = args['--bam']
    cov_fs = args['--cov']
    cas_fs = args['--cas']
    hit_fs = args['--taxfile']

    out_f = args['--out']
    if out_f:
        out_f = "%s.%s" % (os.path.basename(out_f), "BlobDB.json")
    else:
        out_f = "BlobDB.json"

    nodesDB_f = args['--db']
    names_f = args['--names']
    nodes_f = args['--nodes']
    taxrules = args['--taxrule']
    title = args['--title'] or out_f

    # Do files exist ?
    candidates = [fasta_f] + sam_fs + bam_fs + cov_fs + cas_fs + [names_f] + [nodes_f] + hit_fs
    for candidate in candidates:
        if candidate is None:
            continue
        if not os.path.isfile(candidate):
            BtLog.error('0', candidate)

    # Is taxonomy provided?
    if nodesDB_f == "data/nodesDB.txt":
        nodesDB_f = os.path.join(main_dir, nodesDB_f)
    if not os.path.isfile(nodesDB_f) and not (names_f and nodes_f):
        BtLog.error('3')
    if not hit_fs:
        BtLog.error('18')

    # can FASTA parser deal with assemblies
    if fasta_type not in ASSEMBLY_TYPES:
        BtLog.error('2', ",".join(ASSEMBLY_TYPES[1:]))

    # Is coverage provided?
    if not (fasta_type or bam_fs or sam_fs or cov_fs or cas_fs):
        BtLog.error('1')

    cov_libs = []
    for lib_type, lib_fs in (('bam', bam_fs), ('sam', sam_fs), ('cas', cas_fs), ('cov', cov_fs)):
        for idx, lib_f in enumerate(lib_fs):
            cov_libs.append(bt.CovLibObj(lib_type + str(idx), lib_type, lib_f))

    hit_libs = []
    for idx, lib_f in enumerate(hit_fs):
        hit_libs.append(bt.hitLibObj('tax' + str(idx), 'tax', lib_f))

    return title, fasta_f, fasta_type, cov_libs, hit_libs, taxrules, nodesDB_f, nodes_f, names_f, out_f
def check_input(args):
    """Validate the plotting arguments and normalise ``--sort_first``.

    ``--sort_first`` is replaced by the list of its comma-separated items,
    or by an empty tuple when it is falsy. When the args contain a
    'blobplot' or 'covplot' key, rank, taxrule, sort order, histogram type
    and mutually exclusive options are checked and reported via BtLog.error.

    Returns:
        The same *args* mapping, modified in place.
    """
    rank = args['--rank']
    c_index = args['--cindex']
    multiplot = args['--multiplot']
    sort_order = args['--sort']
    sort_first = args['--sort_first']
    taxrule = args['--taxrule']
    hist_type = args['--hist']
    catcolour_f = args['--catcolour']
    cumulative_flag = args['--cumulative']

    # Convert sort_first to a list
    args['--sort_first'] = sort_first.split(',') if sort_first else ()

    if 'blobplot' in args or 'covplot' in args:
        allowed_measures = ['span', 'count']
        # Are ranks sane ?
        if rank not in BtTax.RANKS:
            BtLog.error('9', rank)
        # is taxrule provided?
        if taxrule not in BtTax.TAXRULES:
            BtLog.error('8', taxrule)
        # Are sort_order and hist_type sane?
        if sort_order not in allowed_measures:
            BtLog.error('14', sort_order)
        if hist_type not in allowed_measures:
            BtLog.error('15', hist_type)
        if catcolour_f and c_index:
            BtLog.error('24')
        if cumulative_flag and multiplot:
            BtLog.error('32')
    return args
def main():
    """Command-line entry point that builds a BlobDB and writes it as JSON.

    Reads the docopt arguments, parses the assembly FASTA, loads the
    taxonomy nodes, parses hits and computes taxonomy when hit files are
    given, parses the coverage libraries and finally dumps the BlobDB.
    """
    args = docopt(__doc__)
    fasta_f = args['--infile']
    fasta_type = args['--type']
    bam_fs = args['--bam']
    cov_fs = args['--cov']
    cas_fs = args['--cas']
    hit_fs = args['--hitsfile']
    prefix = args['--out']
    nodesDB_f = args['--db']
    names_f = args['--names']
    estimate_cov_flag = not args['--calculate_cov']
    nodes_f = args['--nodes']
    taxrules = args['--taxrule']
    try:
        min_bitscore_diff = float(args['--min_diff'])
        min_score = float(args['--min_score'])
    except ValueError:
        # Fix: the original wrote `except ValueError():`, which names an
        # exception instance instead of the class and so never handled the
        # conversion failure.
        BtLog.error('45')
    tax_collision_random = args['--tax_collision_random']
    title = args['--title']

    # outfile
    out_f = BtIO.getOutFile("blobDB", prefix, "json")
    if not title:
        title = out_f

    # coverage
    if not fasta_type and not bam_fs and not cov_fs and not cas_fs:
        BtLog.error('1')
    cov_libs = [BtCore.CovLibObj('bam' + str(idx), 'bam', lib_f) for idx, lib_f in enumerate(bam_fs)] + \
               [BtCore.CovLibObj('cas' + str(idx), 'cas', lib_f) for idx, lib_f in enumerate(cas_fs)] + \
               [BtCore.CovLibObj('cov' + str(idx), 'cov', lib_f) for idx, lib_f in enumerate(cov_fs)]

    # taxonomy
    hit_libs = [BtCore.HitLibObj('tax' + str(idx), 'tax', lib_f)
                for idx, lib_f in enumerate(hit_fs)]

    # Create BlobDB object
    blobDb = BtCore.BlobDb(title)
    blobDb.version = interface.__version__

    # Parse FASTA
    blobDb.parseFasta(fasta_f, fasta_type)

    # Parse nodesDB OR names.dmp, nodes.dmp
    nodesDB_default = join(dirname(abspath(__file__)), "../data/nodesDB.txt")
    nodesDB, nodesDB_f = BtIO.parseNodesDB(nodes=nodes_f, names=names_f,
                                           nodesDB=nodesDB_f,
                                           nodesDBdefault=nodesDB_default)
    blobDb.nodesDB_f = nodesDB_f

    # Parse similarity hits
    if hit_libs:
        blobDb.parseHits(hit_libs)
        if not taxrules:
            if len(hit_libs) > 1:
                taxrules = ['bestsum', 'bestsumorder']
            else:
                taxrules = ['bestsum']
        blobDb.computeTaxonomy(taxrules, nodesDB, min_score, min_bitscore_diff,
                               tax_collision_random)
    else:
        print(BtLog.warn_d['0'])

    # Parse coverage
    blobDb.parseCoverage(covLibObjs=cov_libs, estimate_cov=estimate_cov_flag,
                         prefix=prefix)

    # Generating BlobDB and writing to file
    print(BtLog.status_d['7'] % out_f)
    BtIO.writeJson(blobDb.dump(), out_f)
def parseNodesDB(**kwargs):
    '''
    Load the taxonomy nodes from names.dmp/nodes.dmp (kwargs 'names' and
    'nodes'), else from the file given as 'nodesDB', else from the default
    file given as 'nodesDBdefault'. If the default file does not exist it
    is written from the loaded data. Returns (nodesDB, nodesDB_f).
    '''
    names_f = kwargs['names']
    nodes_f = kwargs['nodes']
    nodesDB_f = kwargs['nodesDB']
    nodesDB_default = kwargs['nodesDBdefault']
    nodesDB = {}

    if nodes_f and names_f:
        for dump_f in (names_f, nodes_f):
            if not isfile(dump_f):
                BtLog.error('0', dump_f)
        print(BtLog.status_d['3'] % (nodes_f, names_f))
        try:
            nodesDB = readNamesNodes(names_f, nodes_f)
        except:
            BtLog.error('3', nodes_f, names_f)
    elif nodesDB_f:
        if not isfile(nodesDB_f):
            BtLog.error('0', nodesDB_f)
        print(BtLog.status_d['4'] % (nodesDB_f))
        try:
            nodesDB = readNodesDB(nodesDB_f)
        except:
            BtLog.error('27', nodesDB_f)
    elif nodesDB_default:
        if not isfile(nodesDB_default):
            BtLog.error('28')
        print(BtLog.status_d['4'] % (nodesDB_default))
        try:
            nodesDB = readNodesDB(nodesDB_default)
        except:
            BtLog.error('27', nodesDB_default)
        nodesDB_f = nodesDB_default

    # Write nodesDB if not available
    default_exists = isfile(nodesDB_default)
    if not default_exists:
        writeNodesDB(nodesDB, nodesDB_default)
    return nodesDB, nodesDB_f
def parseBamForFilter(infile, include_unmapped, outfile, include, exclude,
                      gzip, do_sort, keep_sorted, sort_threads):
    '''
    Parse a BAM file and extract read pairs, grouped by where the mates map.

    infile           -- BAM file to read
    include_unmapped -- forwarded to init_read_pairs()
    outfile          -- output prefix, forwarded to init_read_pairs() and
                        used for the '<outfile>...info.txt' summary
    include          -- sequence names typed 'In' (all others become 'Ex')
    exclude          -- sequence names typed 'Ex' (all others become 'In');
                        only consulted when 'include' is empty
    gzip             -- accepted for interface compatibility; not used here
    do_sort          -- name-sort the BAM with samtools before parsing
    keep_sorted      -- keep the name-sorted BAM instead of deleting it
    sort_threads     -- value passed to 'samtools sort -@'

    Reads mapping to '*' are typed 'Un'. Each pair is labelled with the
    sorted concatenation of both mate types. Returns 1.
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    if do_sort:
        # The thread count must be interpolated; previously the literal
        # text 'sort_threads' was handed to samtools.
        command = blobtools.SAMTOOLS + ' sort -@ %s -n -O bam -T temp -o %s.readsorted.bam %s' % (
            sort_threads, infile, infile)
        runCmd(command=command, wait=True)
        infile = "%s.readsorted.bam" % infile

    progress_unit = int(100000)
    #if progress_flag:
    #    reads_total, reads_mapped = checkBam(infile)
    # NOTE(review): flags presumably keep paired reads (-f 1) and drop
    # secondary/supplementary alignments (-F 256/-F 2048) - confirm against
    # the samtools version in use.
    command = blobtools.SAMTOOLS + " view -f 1 -F 256 -F 2048 %s" % infile
    pair_count_by_type, pair_seqs_by_type, out_fs_by_type = init_read_pairs(
        outfile, include_unmapped, include, exclude)

    # Decide the type of listed sequences and of everything else.
    if include:
        default_type, listed_names, listed_type = 'Ex', include, 'In'
    elif exclude:
        default_type, listed_names, listed_type = 'In', exclude, 'Ex'
    else:
        default_type, listed_names, listed_type = 'In', (), 'In'
    sequence_to_type_dict = defaultdict(lambda: default_type)
    for seq_name in listed_names:
        sequence_to_type_dict[seq_name] = listed_type
    # '*' is assigned last so it is always 'Un', even if it was listed.
    sequence_to_type_dict['*'] = 'Un'

    seen_reads = 0
    print(BtLog.status_d['26'] % infile)
    # All lines are collected first so the total is known for progress output.
    sam_lines = list(runCmd(command=command))
    print(BtLog.status_d['22'] % infile)
    reads_total = len(sam_lines)

    # Consecutive lines are treated as the two mates of a pair.
    for i in range(0, reads_total, 2):
        read1 = sam_lines[i].split()
        try:
            seen_reads += 2
            read2 = sam_lines[i + 1].split()
            read_pair_type = "".join(
                sorted([
                    sequence_to_type_dict[read1[2]],
                    sequence_to_type_dict[read2[2]]
                ]))
            BtLog.progress(seen_reads, progress_unit, reads_total)
            if read_pair_type in pair_seqs_by_type:
                pair_seqs_by_type[read_pair_type].append(
                    get_read_pair_seqs(read1, read2))
                pair_count_by_type[read_pair_type] += 1
        except IndexError:
            # Missing mate or a line with fewer than three fields.
            print(BtLog.warn_d['11'])
        #print_bam(read_pair_out_fs, read_pair_type, read1, read2) # this prints SAM files for debugging
    if not seen_reads == reads_total:
        BtLog.progress(reads_total, progress_unit, reads_total)
    write_read_pair_seqs(pair_count_by_type, pair_seqs_by_type, out_fs_by_type)

    # info log
    total_pairs = seen_reads // 2
    info_string = [('Total pairs', "{:,}".format(total_pairs), '{0:.1%}'.format(1.00))]
    for read_pair_type, count in pair_count_by_type.items():
        # True division (integer division gave 0% on Python 2) and a guard
        # against an empty BAM, which used to raise ZeroDivisionError.
        fraction = float(count) / total_pairs if total_pairs else 0.0
        info_string.append((read_pair_type + ' pairs',
                            "{:,}".format(count),
                            '{0:.1%}'.format(fraction)))
    info_out_f = getOutFile(outfile, None, "info.txt")
    with open(info_out_f, 'w') as info_fh:
        print(BtLog.status_d['24'] % info_out_f)
        info_fh.write(get_table(info_string))

    if do_sort and not keep_sorted:
        # At this point infile is the temporary '<infile>.readsorted.bam'.
        os.remove(infile)
    return 1
taxrule = args['--taxrule'] hist_type = args['--hist'] plot_title = args['--title'] ignore_contig_length = args['--noscale'] #labels = args['--label'] #colour_f = args['--colours'] #exclude_groups = args['--exclude'] format = args['--format'] #no_plot_blobs = args['--noblobs'] #no_plot_reads = args['--noreads'] #refcov_f = args['--refcov'] #catcolour_f = args['--catcolour'] # Does blobdb_f exist ? if not isfile(blobdb_f): BtLog.error('0', blobdb_f) # Does cov_f exist ? if not isfile(cov_f): BtLog.error('0', cov_f) # parse cov file in dict cov_dict = BtPlot.parseCovFile(cov_f) # Are ranks sane ? if rank not in RANKS: BtLog.error('9', rank) # Are sort_order and hist_type sane? if not sort_order in ['span', 'count']: BtLog.error('14', sort_order) if not hist_type in ['span', 'count']:
def plotScatter(self, cov_lib, info_flag, out_f):
    """Draw the scatter plot with marginal histograms for one coverage library.

    cov_lib   -- key into self.stats[group]['covs'] selecting the library
    info_flag -- when true, print one info line per plotted group
    out_f     -- output prefix; the final figure is saved as
                 '<out_f>.<cov_lib>.<self.format>'

    Groups are drawn in self.plot_order; empty groups and groups listed in
    self.exclude_groups are skipped. With self.multiplot every group is also
    saved to its own figure; otherwise, with self.cumulative_flag, the shared
    figure is saved after each group is added.
    """
    fig, axScatter, axHistx, axHisty, axLegend, top_bins, right_bins = self.setupPlot(self.plot)
    # empty handles for big legend
    legend_handles = []
    legend_labels = []
    # marker size scaled by biggest blob (size in points^2)
    max_length = max(array(self.stats['all']['length']))  # length of biggest blob
    max_marker_size = 12500  # marker size for biggest blob
    for idx, group in enumerate(self.plot_order, 1):
        lw, alpha = 0.5, 0.8
        if group == 'no-hit':
            alpha = 0.5
        group_length_array = array(self.stats[group]['length'])
        if len(group_length_array) == 0 or group in self.exclude_groups:
            continue

        colour = self.colours[group]
        # 'other' is outlined dark grey, every other group white.
        edge_colour = DGREY if group == 'other' else WHITE

        if self.plot == 'blobplot':
            group_x_array = array(self.stats[group]['gc'])
            group_y_array = array(self.stats[group]['covs'][cov_lib])
        elif self.plot == 'covplot':
            group_x_array = array(self.stats[group]['covs'][cov_lib])
            group_y_array = array([self.cov_y_dict.get(name, 0.02) for name in self.stats[group]['name']])
        else:
            BtLog.error('34', self.plot)

        if (self.ignore_contig_length):  # no scaling
            fixed_size = 20 if group == "no-hit" else 100
            marker_size_array = [fixed_size] * len(group_length_array)
        else:  # scaling by max_length
            marker_size_array = [(length/max_length)*max_marker_size for length in group_length_array]

        # generate label for legend
        group_span_in_mb = round(self.stats[group]['span_visible']/1000000, 2)
        group_number_of_seqs = self.stats[group]['count_visible']
        group_n50 = self.stats[group]['n50']
        fmt_seqs = "{:,}".format(group_number_of_seqs)
        fmt_span = "{:,}".format(group_span_in_mb)
        fmt_n50 = "{:,}".format(group_n50)
        label = "%s (%s;%sMB;%snt)" % (group, fmt_seqs, fmt_span, fmt_n50)
        if (info_flag):
            print(BtLog.info_d['0'] % (group, fmt_seqs, fmt_span, fmt_n50))
        legend_handles.append(Line2D([0], [0], linewidth=0.5, linestyle="none", marker="o", alpha=1, markersize=24, markeredgecolor=edge_colour, markerfacecolor=colour))
        legend_labels.append(label)

        # Histograms are weighted by length only for the 'span' type.
        weights_array = None
        if self.hist_type == "span":
            weights_array = group_length_array/1000

        axHistx.hist(group_x_array, weights=weights_array, color=colour, bins=top_bins, histtype='step', lw=3)
        axHisty.hist(group_y_array, weights=weights_array, color=colour, bins=right_bins, histtype='step', orientation='horizontal', lw=3)
        axScatter.scatter(group_x_array, group_y_array, color=colour, s=marker_size_array, lw=lw, alpha=alpha, edgecolor=edge_colour, label=label)
        axLegend.axis('off')

        if (self.multiplot):
            # Separate figure holding only this group. top_bins/right_bins
            # are rebound here, as in the original implementation.
            fig_m, axScatter_m, axHistx_m, axHisty_m, axLegend_m, top_bins, right_bins = self.setupPlot(self.plot)
            # The per-group legend marker keeps a white edge for every group,
            # including 'other' (unchanged from the original).
            legend_handles_m = [Line2D([0], [0], linewidth=0.5, linestyle="none", marker="o", alpha=1, markersize=24, markeredgecolor=WHITE, markerfacecolor=colour)]
            legend_labels_m = [label]
            axHistx_m.hist(group_x_array, weights=weights_array, color=colour, bins=top_bins, histtype='step', lw=3)
            axHisty_m.hist(group_y_array, weights=weights_array, color=colour, bins=right_bins, histtype='step', orientation='horizontal', lw=3)
            axScatter_m.scatter(group_x_array, group_y_array, color=colour, s=marker_size_array, lw=lw, alpha=alpha, edgecolor=edge_colour, label=label)
            axLegend_m.axis('off')
            axLegend_m.legend(legend_handles_m, legend_labels_m, loc=6, numpoints=1, fontsize=LEGEND_FONTSIZE, frameon=True)
            plot_ref_legend(axScatter_m, max_length, max_marker_size, self.ignore_contig_length)
            m_out_f = "%s.%s.%s.%s" % (out_f, cov_lib, idx, group.replace("/", "_").replace(" ", "_"))
            fig_m = plot_legend(fig_m, axLegend_m, m_out_f, self.legend_flag, self.format, self.cumulative_flag)
            m_out_path = "%s.%s" % (m_out_f, self.format)
            # Format the path first instead of chaining '%' operators.
            print(BtLog.status_d['8'] % m_out_path)
            fig_m.savefig(m_out_path, format=self.format)
            plt.close(fig_m)
        elif (self.cumulative_flag):
            # Snapshot of the shared figure after adding this group.
            axLegend.legend(legend_handles, legend_labels, loc=6, numpoints=1, fontsize=LEGEND_FONTSIZE, frameon=True)
            plot_ref_legend(axScatter, max_length, max_marker_size, self.ignore_contig_length)
            m_out_f = "%s.%s.%s.%s" % (out_f, cov_lib, idx, group.replace("/", "_").replace(" ", "_"))
            fig.add_axes(axLegend)
            fig = plot_legend(fig, axLegend, m_out_f, self.legend_flag, self.format, self.cumulative_flag)
            if not (self.no_title):
                fig.suptitle(out_f, fontsize=35, verticalalignment='top')
            m_out_path = "%s.%s" % (m_out_f, self.format)
            print(BtLog.status_d['8'] % m_out_path)
            fig.savefig(m_out_path, format=self.format)

    # Final figure containing all plotted groups.
    plot_ref_legend(axScatter, max_length, max_marker_size, self.ignore_contig_length)
    axLegend.legend(legend_handles, legend_labels, numpoints=1, fontsize=LEGEND_FONTSIZE, frameon=True, loc=6)
    out_f = "%s.%s" % (out_f, cov_lib)
    fig.add_axes(axLegend)
    fig = plot_legend(fig, axLegend, out_f, self.legend_flag, self.format, self.cumulative_flag)
    if not (self.no_title):
        fig.suptitle(out_f, fontsize=35, verticalalignment='top')
    out_path = "%s.%s" % (out_f, self.format)
    print(BtLog.status_d['8'] % out_path)
    fig.savefig(out_path, format=self.format)
    plt.close(fig)
# Output file name: '<prefix>.BlobDB.json', or 'BlobDB.json' without prefix.
if (out_f):
    out_f = "%s.%s" % (out_f, "BlobDB.json")
else:
    out_f = "%s" % ("BlobDB.json")
nodesDB_f = args['--db']
names_f = args['--names']
nodes_f = args['--nodes']
taxrules = args['--taxrule']
# Default title: FASTA file name without directory and without its last
# extension. splitext on the basename keeps the name of extension-less files
# (previously an empty title) and ignores dots in directory names.
title = args['--title'] if (args['--title']) else os.path.splitext(os.path.basename(fasta_f))[0]

# Do files exist ?
files = [x for x in [fasta_f] + sam_fs + bam_fs + cov_fs + cas_fs + [names_f] + [nodes_f] + hit_fs if x is not None]
for f in files:
    if not os.path.isfile(f):
        BtLog.error('0', f)

# Is taxonomy provided?
if nodesDB_f == "data/nodesDB.txt":
    # The default nodesDB path is relative to the program directory.
    nodesDB_f = os.path.join(main_dir, nodesDB_f)
if not os.path.isfile(nodesDB_f) and not ((names_f) and (nodes_f)):
    BtLog.error('3')
if not (hit_fs):
    BtLog.error('18')

# can FASTA parser deal with assemblies
if fasta_type not in ASSEMBLY_TYPES:
    BtLog.error('2', ",".join(ASSEMBLY_TYPES[1:]))

# Is coverage provided?
# Accepted values for tax rules and taxonomic ranks; the last RANKS entry,
# 'all', is a shortcut that is expanded to every other rank below.
TAXRULES = ['bestsum', 'bestsumorder']
RANKS = ['species', 'genus', 'family', 'order', 'phylum', 'superkingdom', 'all']

main_dir = dirname(__file__)
#print data_dir
# Command-line arguments parsed from the module docstring.
args = docopt(__doc__)
blobdb_f = args['--input']
out_f = args['--out']
ranks = args['--rank']
taxrule = args['--taxrule']
hits_flag = args['--hits']
seq_list = args['--list']

# Does blobdb_f exist ?
if not isfile(blobdb_f):
    BtLog.error('0', blobdb_f)

# Are ranks sane ?
for rank in ranks:
    if rank not in RANKS:
        BtLog.error('9', rank)
if 'all' in ranks:
    # Replace the request by every real rank (everything except 'all').
    ranks = RANKS[0:-1]

# Is list a list of sequence names or a file?
seqs = []
if (seq_list):
    if isfile(seq_list):
        # An existing file is read with BtIO.parseList.
        seqs = BtIO.parseList(seq_list)
    elif "," in seq_list:
        # Otherwise a comma-separated string is split into names.
        # NOTE(review): a single name without a comma leaves seqs empty in
        # the code visible here - confirm whether a fallback follows.
        seqs = seq_list.split(",")