def parseCas(infile, order_of_blobs):
    '''Parse per-contig read counts and coverage from a CLC CAS file.

    Runs "clc_mapping_info -n" on the CAS file and extracts, for each
    contig index reported, the number of mapped reads (column 3) and the
    coverage (column 6).

    Returns (cov_dict, reads_total, reads_mapped, read_cov_dict), where
    the dicts are keyed by contig name via order_of_blobs.
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    seqs_total, reads_total, reads_mapped = checkCas(infile)
    progress_unit = int(len(order_of_blobs) / 100)
    cas_line_re = re.compile(r"\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+.\d{2})\s+(\d+)\s+(\d+.\d{2})")
    command = "clc_mapping_info -n " + infile
    cov_dict = {}
    read_cov_dict = {}
    seqs_parsed = 0
    # BUG FIX: the original called runCmd twice (once for the truthiness
    # check, once to iterate), executing the external mapper a second time.
    cas_output = runCmd(command=command)
    if cas_output:
        for line in cas_output:
            cas_line_match = cas_line_re.search(line)
            if cas_line_match:
                # -1 because index of contig list starts with zero
                idx = int(cas_line_match.group(1)) - 1
                try:
                    name = order_of_blobs[idx]
                    reads = int(cas_line_match.group(3))
                    cov = float(cas_line_match.group(6))
                    cov_dict[name] = cov
                    read_cov_dict[name] = reads
                    seqs_parsed += 1
                except IndexError:
                    # contig index not present in order_of_blobs: skip line
                    pass
                BtLog.progress(seqs_parsed, progress_unit, seqs_total)
    return cov_dict, reads_total, reads_mapped, read_cov_dict
def subselect_cov_libs(self, cov_lib_dict, cov_lib_selection):
    '''Resolve a coverage-library selection into a list of covlib names.

    The selection may be 'covsum', one covlib name, or a comma-separated
    list of names; an empty selection means "all covlibs". Unknown names
    trigger BtLog.error('33') with a listing of the available covlibs.
    '''
    chosen = []
    selection_invalid = False
    if not cov_lib_selection:
        # no explicit selection -> every covlib in the dict
        chosen = cov_lib_dict.keys()
    elif cov_lib_selection == 'covsum':
        chosen = ['covsum']
    elif "," in cov_lib_selection:
        chosen = cov_lib_selection.split(",")
        selection_invalid = not set(chosen).issubset(set(cov_lib_dict.keys()))
    else:
        chosen = [cov_lib_selection]
        selection_invalid = cov_lib_selection not in cov_lib_dict
    if selection_invalid:
        # build a human-readable listing of the available covlibs
        listing = []
        for covlib in cov_lib_dict:
            lib_f = cov_lib_dict[covlib]['f']
            if not lib_f:
                lib_f = "sum of coverages from all covlibs"
            listing.append("\t\t%s : %s" % (covlib, lib_f))
        BtLog.error('33', "\n".join(listing))
    return chosen
def parseCovFromHeader(fasta_type, header):
    '''
    Returns the coverage from the header of a FASTA
    sequence depending on the assembly type
    '''
    ASSEMBLY_TYPES = [None, 'spades', 'velvet', 'platanus']
    if not fasta_type in ASSEMBLY_TYPES:
        BtLog.error('2', ",".join(ASSEMBLY_TYPES[1:]))
    if fasta_type == 'spades':
        # SPAdes headers look like "NODE_1_length_100_cov_12.5"
        # (removed an unused duplicate re.findall of the same pattern)
        spades_match_re = re.compile(r"_cov_(\d+\.*\d*)")
        return float(spades_match_re.findall(header)[0])
    elif fasta_type == 'velvet':
        # velvet: coverage is the last "_"-separated field of the header
        return float(header.split("_")[-1])
    #elif fasta_type == 'abyss' or fasta_type == 'soap':
    #    temp = header.split(" ")
    #    return float(temp[2]/(temp[1]+1-75))
    elif fasta_type == 'platanus':
        temp = header.rstrip("\n").split("_")
        if len(temp) >= 3:
            return float(temp[2].replace("cov", ""))  # scaffold/scaffoldBubble/contig
        else:
            return float(temp[1].replace("cov", ""))  # gapClosed
    else:
        # fasta_type is None: no coverage is encoded in the header
        return None
def parseJson(infile):
    '''http://artem.krylysov.com/blog/2015/09/29/benchmark-python-json-libraries/'''
    # Load a BlobDB JSON file, preferring the fastest available JSON parser
    # (ujson > simplejson > stdlib json), and return the de-unicoded object.
    if not isfile(infile):
        BtLog.error('0', infile)
    import time
    start = time.time()
    json_parser = ''
    with open(infile, 'r') as fh:
        print BtLog.status_d['15']
        json_string = fh.read()
    try:
        import ujson as json # fastest
        json_parser = 'ujson'
        print BtLog.status_d['16'] % json_parser
    except ImportError:
        try:
            import simplejson as json # fast
            json_parser = 'simplejson'
        except ImportError:
            import json # default
            json_parser = 'json'
        print BtLog.status_d['17'] % json_parser
    try:
        # decode("ascii") implies a Python 2 byte string; ValueError covers malformed JSON
        obj = json.loads(json_string.decode("ascii"))
    except ValueError:
        BtLog.error('37', infile, "BlobDB")
    # byteify converts unicode strings in the parsed object back to byte strings
    data = byteify(obj)
    print BtLog.status_d['20'] % (time.time() - start)
    return data
def set_format_scatterplot(axScatter, **kwargs):
    '''Configure scales, limits, ticks and grid of a scatter subplot.

    kwargs['plot'] selects the layout: 'blobplot' (GC on the linear x
    axis, log coverage on y) or 'covplot' (log coverage on both axes);
    any other value is an error. Returns the configured axis.
    '''
    min_x, max_x = None, None
    min_y, max_y = None, None
    if kwargs['plot'] == 'blobplot':
        min_x, max_x = 0, 1
        major_xticks = MultipleLocator(0.2)
        minor_xticks = AutoMinorLocator(20)
        # pad the coverage axis below/above so points sit inside the frame
        min_y, max_y = kwargs['min_cov'] * 0.1, kwargs['max_cov'] + 100
        axScatter.set_yscale('log')
        axScatter.set_xscale('linear')
        axScatter.xaxis.set_major_locator(major_xticks)
        axScatter.xaxis.set_minor_locator(minor_xticks)
    elif kwargs['plot'] == 'covplot':
        min_x, max_x = kwargs['min_cov'] * 0.1, kwargs['max_cov'] + 100
        min_y, max_y = kwargs['min_cov'] * 0.1, kwargs['max_cov'] + 100
        axScatter.set_yscale('log')
        axScatter.set_xscale('log')
    else:
        # BUG FIX: was BtLog.error('34' % kwargs['plot']) — "%" on a string
        # with no placeholder raises TypeError; pass the plot type as a
        # separate argument like every other BtLog.error call in this file.
        BtLog.error('34', kwargs['plot'])
    axScatter.set_xlim((min_x, max_x))
    axScatter.set_ylim(
        (min_y, max_y)
    )  # This sets the max-Coverage so that all libraries + sum are at the same scale
    axScatter.grid(True, which="major", lw=2., color=WHITE, linestyle='-')
    axScatter.set_axisbelow(True)
    axScatter.xaxis.labelpad = 20
    axScatter.yaxis.labelpad = 20
    # hide the lowest y tick label so it does not collide with the x axis
    axScatter.yaxis.get_major_ticks()[0].label1.set_visible(False)
    axScatter.tick_params(axis='both', which='both', direction='out')
    return axScatter
def checkBam(infile):
    '''Run "samtools flagstat" on a BAM file and return (reads_total, reads_mapped).

    Exits via BtLog.error if the file is missing or contains no (mapped) reads.
    '''
    print BtLog.status_d['10']
    if not isfile(infile):
        BtLog.error('0', infile)
    reads_mapped_re = re.compile(r"(\d+)\s\+\s\d+\smapped")
    #reads_secondary_re = re.compile(r"(\d+)\s\+\s\d+\ssecondary")
    #reads_supplementary_re = re.compile(r"(\d+)\s\+\s\d+\ssupplementary")
    reads_total_re = re.compile(r"(\d+)\s\+\s\d+\sin total")
    reads_total, reads_mapped = 0, 0
    output = ''
    command = blobtools.SAMTOOLS + " flagstat " + infile
    # accumulate the full flagstat output before applying the regexes
    for line in runCmd(command=command):
        output += line
    reads_mapped = int(reads_mapped_re.search(output).group(1))
    #reads_secondary = int(reads_secondary_re.search(output).group(1))
    #reads_supplementary = int(reads_supplementary_re.search(output).group(1))
    #reads_mapped = reads_mapped - reads_secondary - reads_supplementary
    reads_total = int(reads_total_re.search(output).group(1)) # - reads_secondary - reads_supplementary
    # check whether there are reads in BAM
    if not reads_total or not reads_mapped:
        BtLog.error('29', infile)
    print BtLog.status_d['11'] % ('{:,}'.format(reads_mapped), \
        '{:,}'.format(reads_total), '{0:.1%}'.format(reads_mapped/reads_total))
    return reads_total, reads_mapped
def parseFasta(self, fasta_f, fasta_type):
    '''Read the assembly FASTA into BlObj entries on this BlobDb.

    If fasta_type is set, per-contig coverage is parsed from the sequence
    headers and accumulated in a CovLibObj keyed by fasta_type.
    Exits via BtLog.error on duplicate names or an empty assembly.
    '''
    print BtLog.status_d['1'] % ('FASTA', fasta_f)
    self.assembly_f = abspath(fasta_f)
    if (fasta_type):
        # Set up CovLibObj for coverage in assembly header
        self.covLibs[fasta_type] = CovLibObj(fasta_type, fasta_type, fasta_f)
    for name, seq in BtIO.readFasta(fasta_f):
        blObj = BlObj(name, seq)
        if not blObj.name in self.dict_of_blobs:
            self.seqs += 1
            self.length += blObj.length
            self.n_count += blObj.n_count
            if (fasta_type):
                cov = BtIO.parseCovFromHeader(fasta_type, blObj.name)
                self.covLibs[fasta_type].cov_sum += cov
                blObj.addCov(fasta_type, cov)
            self.order_of_blobs.append(blObj.name)
            self.dict_of_blobs[blObj.name] = blObj
        else:
            # duplicate sequence name in the FASTA file
            BtLog.error('5', blObj.name)
    if self.seqs == 0 or self.length == 0:
        BtLog.error('1')
def readTax(infile, set_of_blobs):
    '''
    If more fields need to be parsed:
    - change hit_line_re
    - catch matches in variables
    - add as key-value pairs to hitDict
    '''
    hit_line_re = re.compile(r"^(\S+)\s+(\d+)[\;?\d+]*\s+(\d+\.*\d*)") # TEST TEST , if not split it afterwards
    with open(infile) as fh:
        for row in fh:
            hit = hit_line_re.search(row)
            if not hit:
                continue
            hitDict = {
                'name': hit.group(1),
                # string because if int, conversion is a nightmare ...
                'taxId': hit.group(2),
                'score': float(hit.group(3)),
            }
            if hitDict['name'] not in set_of_blobs:
                BtLog.error('19', hitDict['name'], infile)
            if hitDict['taxId'] == 'N/A':
                BtLog.error('22', infile)
            yield hitDict
def parseList(infile):
    '''Return the lines of infile as a list, newline-stripped.'''
    if not isfile(infile):
        BtLog.error('0', infile)
    with open(infile) as fh:
        return [line.rstrip("\n") for line in fh]
def parseSet(infile):
    '''Return the lines of infile as a set, with newlines and any
    leading FASTA ">" stripped.'''
    if not isfile(infile):
        BtLog.error('0', infile)
    with open(infile) as fh:
        return {line.rstrip("\n").lstrip(">") for line in fh}
def readYaml(infile):
    '''Parse a YAML file and return the resulting object.

    Exits via BtLog.error if the file is missing or is not valid YAML.
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    data = None
    with open(infile) as fh:
        # renamed from 'str', which shadowed the builtin
        yaml_string = "".join(fh.readlines())
    try:
        data = yaml.load(yaml_string)
    except yaml.YAMLError:
        BtLog.error('37', infile, "yaml")
    # BUG FIX: the original parsed the YAML but never returned it
    return data
def parseCatColour(catcolour_f):
    '''Read a "seq_name,category" CSV into a {seq_name: category} dict.

    Malformed lines trigger BtLog.error('23').
    '''
    catcolour_dict = {}
    with open(catcolour_f) as fh:
        for row in fh:
            try:
                seq_id, cat = row.rstrip("\n").split(",")
                catcolour_dict[seq_id] = cat
            except:
                BtLog.error('23', catcolour_f)
    return catcolour_dict
def parseColours(infile):
    '''Read "name,colour" pairs from infile (if given) into a dict.

    A falsy infile yields an empty dict; a missing file is an error.
    '''
    items = {}
    if infile:
        if not isfile(infile):
            BtLog.error('0', infile)
        with open(infile) as fh:
            for row in fh:
                fields = row.rstrip("\n").split(",")
                items[fields[0]] = fields[1]
    return items
def parseBam(infile, set_of_blobs, no_base_cov_flag):
    '''
    checkBam returns reads_total and reads_mapped
    base_cov_dict is list of coverages for each contigs, since list appending should be faster
    '''
    # Streams "samtools view" output and tallies, per contig: the number of
    # aligned reads (read_cov_dict) and — unless no_base_cov_flag — the sum of
    # aligned bases from CIGAR M/X/= operations (base_cov_dict).
    if not isfile(infile):
        BtLog.error('0', infile)
    reads_total, reads_mapped = checkBam(infile)
    progress_unit = int(reads_mapped / 1000)
    base_cov_dict = {blob: [] for blob in set_of_blobs}
    #base_cov_dict = {blob : 0 for blob in set_of_blobs}
    read_cov_dict = {blob: 0 for blob in set_of_blobs}
    cigar_match_re = re.compile(r"(\d+)M|X|=") # only gets digits before M,X,='s
    # execute samtools to get only mapped reads (no optial duplicates, no 2nd-ary alignment)
    command = blobtools.SAMTOOLS + " view -F 1024 -F 4 -F 256 " + infile
    seen_reads = 0
    #import time
    #start = time.time()
    if not (no_base_cov_flag):
        for line in runCmd(command=command):
            seen_reads += 1
            # SAM columns: match[2] = reference name, match[5] = CIGAR string
            match = line.split()
            try:
                base_cov_dict[match[2]].append(sum([int(matching) for matching in cigar_match_re.findall(match[5])]))
                #base_cov_dict[match[2]] += sum([int(matching) for matching in cigar_match_re.findall(match[5])])
                read_cov_dict[match[2]] += 1
            except:
                # reference name not present in set_of_blobs
                print BtLog.warn_d['2'] % (match[2])
            BtLog.progress(seen_reads, progress_unit, reads_mapped)
    else:
        # read-count-only mode: skip the per-base CIGAR accounting
        for line in runCmd(command=command):
            seen_reads += 1
            match = line.split()
            try:
                read_cov_dict[match[2]] += 1
            except:
                print BtLog.warn_d['2'] % (match[2])
            BtLog.progress(seen_reads, progress_unit, reads_mapped)
    if not int(reads_mapped) == int(seen_reads):
        # flagstat and the filtered view disagree; trust what was actually seen
        print BtLog.warn_d['3'] % (reads_mapped, seen_reads)
        reads_mapped = seen_reads
    # collapse the per-read base counts into one total per contig
    base_cov_dict = {seq_name: sum(base_covs) for seq_name, base_covs in base_cov_dict.items()}
    #end = time.time()
    #print (end-start)
    return base_cov_dict, reads_total, reads_mapped, read_cov_dict
def parseCov(infile, set_of_blobs):
    '''Parse a COV file into per-contig base/read coverage dicts.

    Supports two formats: the old two-column "name<TAB>base_cov" layout and
    the new layout with "#" comment headers plus
    "name<TAB>read_cov<TAB>base_cov" rows. The format is detected from the
    first "#" line encountered.

    Returns (base_cov_dict, reads_total, reads_mapped, reads_unmapped,
    read_cov_dict).
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    old_cov_line_re = re.compile(r"^(\S+)\t(\d+\.*\d*)")
    base_cov_dict = {}
    cov_line_re = re.compile(r"^(\S+)\t(\d+\.*\d*)\t(\d+\.*\d*)")
    reads_total = 0
    reads_mapped = 0
    reads_unmapped = 0
    read_cov_dict = {}
    seqs_parsed = 0
    progress_unit = 1
    old_format = 1
    with open(infile) as fh:
        for line in fh:
            # any "#" line marks the file as new-format
            if line.startswith("#"):
                old_format = 0
            if old_format == 0:
                if line.startswith('#'):
                    # "## <label> = <int>" summary headers
                    if line.startswith("## Total Reads"):
                        reads_total = int(line.split(" = ")[1])
                    elif line.startswith("## Mapped Reads"):
                        reads_mapped = int(line.split(" = ")[1])
                    elif line.startswith("## Unmapped Reads"):
                        reads_unmapped = int(line.split(" = ")[1])
                    else:
                        pass
                else:
                    match = cov_line_re.search(line)
                    if match:
                        seqs_parsed += 1
                        name, read_cov, base_cov = match.group(1), int(match.group(2)), float(match.group(3))
                        if name not in set_of_blobs:
                            print BtLog.warn_d['2'] % (name)
                        else:
                            read_cov_dict[name] = read_cov
                            base_cov_dict[name] = base_cov
            else:
                match = old_cov_line_re.search(line)
                if match:
                    seqs_parsed += 1
                    name, base_cov = match.group(1), float(match.group(2))
                    if name not in set_of_blobs:
                        print BtLog.warn_d['2'] % (name)
                    else:
                        base_cov_dict[name] = base_cov
            BtLog.progress(seqs_parsed, progress_unit, len(set_of_blobs))
    #BtLog.progress(len(set_of_blobs), progress_unit, len(set_of_blobs))
    return base_cov_dict, reads_total, reads_mapped, reads_unmapped, read_cov_dict
def parseDict(infile, key, value):
    '''Read whitespace-separated columns key and value of infile into a
    {key_column: value_column} dict; a falsy infile yields an empty dict.'''
    items = {}
    if infile:
        if not isfile(infile):
            BtLog.error('0', infile)
        with open(infile) as fh:
            k_idx, v_idx = int(key), int(value)
            for row in fh:
                fields = row.rstrip("\n").split()
                items[fields[k_idx]] = fields[v_idx]
    return items
def parseCovFile(cov_f):
    '''Read a "name<TAB>coverage" file into a dict; coverages below 0.02
    are floored to 0.02 (keeps points visible on log-scaled plots).'''
    cov_dict = {}
    with open(cov_f) as fh:
        for row in fh:
            try:
                seq_id, cov_value = row.rstrip("\n").split("\t")
                cov_dict[seq_id] = 0.02 if float(cov_value) < 0.02 else float(cov_value)
            except:
                BtLog.error('25', cov_f)
    return cov_dict
def parseCatColour(infile):
    '''Read a "seq_name,category" CSV (if given) into a dict.

    A falsy infile yields an empty dict; a missing file or malformed
    line is an error.
    '''
    catcolour_dict = {}
    if infile:
        if not isfile(infile):
            BtLog.error('0', infile)
        with open(infile) as fh:
            for row in fh:
                try:
                    seq_id, cat = row.rstrip("\n").split(",")
                    catcolour_dict[seq_id] = cat
                except:
                    BtLog.error('23', infile)
    return catcolour_dict
def parseRefCov(refcov_f):
    '''Parse "covlib,reads_total,reads_mapped" lines into
    {covlib: {'reads_total': int, 'reads_mapped': int}}.'''
    refcov_dict = {}
    with open(refcov_f) as fh:
        for row in fh:
            try:
                lib, total_reads, mapped_reads = row.split(",")
                refcov_dict[lib] = {
                    'reads_total': int(total_reads),
                    'reads_mapped': int(mapped_reads),
                }
            except:
                BtLog.error('21', refcov_f)
    return refcov_dict
def plotBar(self, cov_lib, out_f):
    '''Draw a read-coverage bar figure for one covlib.

    ax_main shows mapped/unmapped read fractions for the assembly (and
    for a reference if refcov data is available); ax_group shows the
    mapped percentage per taxonomic group. The figure is saved as
    "<out_f>.read_cov.<cov_lib>.<format>".
    '''
    fig, ax_main, ax_group, x_pos_main, x_pos_group = self.setupPlot('readcov')
    ax_main_data = {'labels' : [], 'values' : [], 'colours' : [] }
    ax_group_data = {'labels' : [], 'values' : [], 'colours' : [] }
    reads_total = self.cov_libs_total_reads_dict[cov_lib]
    reads_mapped = self.stats['all']['reads_mapped'][cov_lib]
    reads_unmapped = reads_total - self.stats['all']['reads_mapped'][cov_lib]
    ax_main_data['labels'].append('Unmapped (assembly)')
    ax_main_data['values'].append(reads_unmapped/reads_total)
    ax_main_data['colours'].append(DGREY)
    ax_main_data['labels'].append('Mapped (assembly)')
    ax_main_data['values'].append(reads_mapped/reads_total)
    ax_main_data['colours'].append(DGREY)
    if (self.refcov_dict):
        # optionally add reference mapped/unmapped bars for comparison
        if cov_lib in self.refcov_dict:
            reads_total_ref = self.refcov_dict[cov_lib]['reads_total']
            reads_mapped_ref = self.refcov_dict[cov_lib]['reads_mapped']
            reads_unmapped_ref = reads_total_ref - reads_mapped_ref
            ax_main_data['labels'].append('Unmapped (ref)')
            ax_main_data['values'].append(reads_unmapped_ref/reads_total_ref)
            ax_main_data['colours'].append(DGREY)
            ax_main_data['labels'].append('Mapped (ref)')
            ax_main_data['values'].append(reads_mapped_ref/reads_total_ref)
            ax_main_data['colours'].append(DGREY)
        else:
            BtLog.error('40', cov_lib)
    # mapped plotted groups
    for group in self.plot_order:
        ax_group_data['labels'].append(group)
        ax_group_data['values'].append(self.stats[group]['reads_mapped_perc'][cov_lib])
        ax_group_data['colours'].append(self.colours[group])
    rect_group = ax_group.bar(x_pos_group, ax_group_data['values'], width = 0.5, tick_label=ax_group_data['labels'],
                              align='center', color = ax_group_data['colours'])
    # annotate each group bar with its percentage
    for rect_g in rect_group:
        height_g = float(rect_g.get_height())
        ax_group.text(rect_g.get_x() + rect_g.get_width()/2., 0.005 + height_g,
                      '{:.2f}%'.format(height_g*100), ha='center', va='bottom', fontsize=LEGEND_FONTSIZE)
    rect_main = ax_main.bar(x_pos_main, ax_main_data['values'], width = 0.5, tick_label=ax_main_data['labels'],
                            align='center', color = ax_main_data['colours'])
    # annotate each main bar with its percentage
    for rect_m in rect_main:
        height_m = float(rect_m.get_height())
        ax_main.text(rect_m.get_x() + rect_m.get_width()/2., 0.005 + height_m,
                     '{:.2f}%'.format(height_m*100), ha='center', va='bottom', fontsize=LEGEND_FONTSIZE)
    ax_main.set_xticklabels(ax_main_data['labels'], rotation=45, ha='center', fontsize=LEGEND_FONTSIZE)
    ax_group.set_xticklabels(ax_group_data['labels'], rotation=45, ha='center', fontsize=LEGEND_FONTSIZE)
    #figsuptitle = fig.suptitle(out_f, verticalalignment='top')
    out_f = "%s.read_cov.%s" % (out_f, cov_lib)
    print(BtLog.status_d['8'] % "%s.%s" % (out_f, self.format))
    fig.tight_layout()
    #fig.savefig("%s.%s" % (out_f, self.format), format=self.format, bbox_extra_artists=(figsuptitle,))
    fig.savefig("%s.%s" % (out_f, self.format), format=self.format)
    plt.close(fig)
def readFasta(infile):
    '''Yield (name, sequence) tuples from a FASTA file.

    The name is the header up to the first whitespace (without the '>');
    sequences are uppercased.
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    with open(infile) as fh:
        header = ''
        chunks = []
        for line in fh:
            if line[0] == '>':
                if header:
                    yield header, ''.join(chunks).upper()
                # Header is split at first whitespace
                header = line[1:-1].split()[0]
                chunks = []
            else:
                chunks.append(line[:-1])
        yield header, ''.join(chunks).upper()
def checkBam(infile):
    '''Run "samtools flagstat" and return (reads_total, reads_mapped).

    Exits via BtLog.error('7') if samtools is not on PATH.
    '''
    print BtLog.status_d['10']
    if not (which('samtools')):
        BtLog.error('7')
    reads_mapped_re = re.compile(r"(\d+)\s\+\s\d+\smapped")
    reads_total_re = re.compile(r"(\d+)\s\+\s\d+\sin total")
    reads_total, reads_mapped = 0, 0
    output = ''
    command = "samtools flagstat " + infile
    # accumulate the full flagstat output before applying the regexes
    for line in runCmd(command):
        output += line
    reads_mapped = int(reads_mapped_re.search(output).group(1))
    reads_total = int(reads_total_re.search(output).group(1))
    # NOTE(review): reads_mapped/reads_total is integer division on Python 2
    # unless true division is imported at module level — verify the printed
    # percentage is not truncated to 0%.
    print BtLog.status_d['11'] % ('{:,}'.format(reads_mapped), '{:,}'.format(reads_total), '{0:.1%}'.format(reads_mapped/reads_total))
    return reads_total, reads_mapped
def parse_labels(labels):
    '''Parse "name=group[,group...]" strings into a {group: name} dict.

    A falsy labels argument yields an empty dict; malformed entries
    trigger BtLog.error('17').
    '''
    label_d = {}
    if (labels):
        try:
            for label in labels:
                name, groups = str(label).split("=")
                # split handles both the single-group and comma-list cases
                for group in groups.split(","):
                    label_d[group] = name
        except:
            BtLog.error('17', labels)
    return label_d
def parseCmdLabels(labels):
    '''Parse command-line "name=group[,group...]" labels into {group: name}.

    A falsy labels argument yields an empty dict; malformed entries
    trigger BtLog.error('17').
    '''
    label_d = {}
    if labels:
        try:
            for entry in labels:
                lib_name, group_field = str(entry).split("=")
                if "," in group_field:
                    for grp in group_field.split(","):
                        label_d[grp] = lib_name
                else:
                    label_d[group_field] = lib_name
        except:
            BtLog.error('17', labels)
    return label_d
def main(): args = docopt(__doc__) #print(args) bam_f = args['--bam'] include_f = args['--include'] exclude_f = args['--exclude'] out_prefix = args['--out'] read_format = args['--read_format'] if not read_format in set(['fq', 'fa']): sys.exit("[X] Read format must be fq or fa!") noninterleaved = args['--noninterleaved'] include_unmapped = True if args['--exclude_unmapped']: include_unmapped = False out_f = BtIO.getOutFile(bam_f, out_prefix, None) if include_f and exclude_f: print(BtLog.error('43')) elif include_f: sequence_list = BtIO.parseList(include_f) BtIO.parseBamForFilter(bam_f, include_unmapped, noninterleaved, out_f, sequence_list, None, read_format) elif exclude_f: sequence_list = BtIO.parseList(exclude_f) BtIO.parseBamForFilter(bam_f, include_unmapped, noninterleaved, out_f, None, sequence_list, read_format) else: BtIO.parseBamForFilter(bam_f, include_unmapped, noninterleaved, out_f, None, None, read_format)
def parseReferenceCov(infile):
    '''Parse an optional "covlib,reads_total,reads_mapped" file into
    {covlib: {'reads_total': int, 'reads_mapped': int}}.

    A falsy infile yields an empty dict.
    '''
    refcov_dict = {}
    if infile:
        if not isfile(infile):
            BtLog.error('0', infile)
        with open(infile) as fh:
            for row in fh:
                try:
                    lib, total_reads, mapped_reads = row.split(",")
                    refcov_dict[lib] = {
                        'reads_total': int(total_reads),
                        'reads_mapped': int(mapped_reads),
                    }
                except:
                    BtLog.error('21', infile)
    return refcov_dict
def parseSam(infile, set_of_blobs, no_base_cov_flag):
    '''Parse a SAM file into per-contig base and read coverage.

    Counts total and mapped reads; unless no_base_cov_flag, also sums
    aligned bases from CIGAR M/X/= operations per contig.

    Returns (base_cov_dict, reads_total, reads_mapped, read_cov_dict).
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    base_cov_dict = {blob: [] for blob in set_of_blobs}
    read_cov_dict = {blob: 0 for blob in set_of_blobs}
    cigar_match_re = re.compile(r"(\d+)M|X|=") # only gets digits before M,X,='s
    reads_total = 0
    reads_mapped = 0
    if not (no_base_cov_flag):
        with open(infile) as fh:
            for line in fh:
                # "@" lines are SAM header lines
                if line.startswith("@"):
                    pass
                else:
                    reads_total += 1
                    # SAM columns: match[2] = reference name ('*' = unmapped),
                    # match[5] = CIGAR string
                    match = line.split()
                    if not match[2] == '*':
                        reads_mapped += 1
                        try:
                            base_cov_dict[match[2]].append(sum([int(matching) for matching in cigar_match_re.findall(match[5])]))
                            read_cov_dict[match[2]] += 1
                        except:
                            # reference name not present in set_of_blobs
                            print BtLog.warn_d['2'] % (match[2])
    else:
        # read-count-only mode: skip the per-base CIGAR accounting
        with open(infile) as fh:
            for line in fh:
                if line.startswith("@"):
                    pass
                else:
                    reads_total += 1
                    match = line.split()
                    if not match[2] == '*':
                        reads_mapped += 1
                        try:
                            read_cov_dict[match[2]] += 1
                        except:
                            print BtLog.warn_d['2'] % (match[2])
    # collapse the per-read base counts into one total per contig
    base_cov_dict = {seq_name: sum(base_covs) for seq_name, base_covs in base_cov_dict.items()}
    return base_cov_dict, reads_total, reads_mapped, read_cov_dict
def main():
    '''Entry point: build a coverage-only BlobDb from BAM/CAS libraries.'''
    args = docopt(__doc__)
    fasta_f = args['--infile']
    bam_fs = args['--bam']
    cas_fs = args['--cas']
    prefix = args['--output']
    # coverage is estimated unless --calculate_cov requests exact counting
    estimate_cov_flag = True if not args['--calculate_cov'] else False
    # Make covLibs
    cov_libs = [BtCore.CovLibObj('bam' + str(idx), 'bam', lib_f) for idx, lib_f in enumerate(bam_fs)] + \
               [BtCore.CovLibObj('cas' + str(idx), 'cas', lib_f) for idx, lib_f in enumerate(cas_fs)]
    if not (cov_libs):
        BtLog.error('31')
    blobDb = BtCore.BlobDb('cov')
    blobDb.version = interface.__version__
    blobDb.parseFasta(fasta_f, None)
    blobDb.parseCoverage(covLibObjs=cov_libs, estimate_cov=estimate_cov_flag, prefix=prefix)
def checkCas(infile):
    '''Run "clc_mapping_info -s" on a CAS file and return
    (seqs_total, reads_total, reads_mapped).

    Exits via BtLog.error('20') if clc_mapping_info is not on PATH.
    '''
    print BtLog.status_d['12']
    if not (which('clc_mapping_info')):
        BtLog.error('20')
    seqs_total_re = re.compile(r"\s+Contigs\s+(\d+)")
    reads_total_re = re.compile(r"\s+Reads\s+(\d+)")
    reads_mapping_re = re.compile(r"\s+Mapped reads\s+(\d+)\s+(\d+.\d+)\s+\%")
    seqs_total, reads_total, reads_mapping, mapping_rate = 0, 0, 0, 0.0
    output = ''
    command = "clc_mapping_info -s " + infile
    # accumulate the full summary output before applying the regexes
    for line in runCmd(command):
        output += line
    seqs_total = int(seqs_total_re.search(output).group(1))
    reads_mapped = int(reads_mapping_re.search(output).group(1))
    reads_total = int(reads_total_re.search(output).group(1))
    print BtLog.status_d['11'] % ('{:,}'.format(reads_mapped), '{:,}'.format(reads_total), '{0:.1%}'.format(reads_mapped/reads_total))
    return seqs_total, reads_total, reads_mapped
def checkBam(infile):
    '''Run "samtools flagstat" and return (reads_total, reads_mapped).

    Exits via BtLog.error('7') if samtools is not on PATH.
    '''
    print BtLog.status_d['10']
    if not (which('samtools')):
        BtLog.error('7')
    reads_mapped_re = re.compile(r"(\d+)\s\+\s\d+\smapped")
    reads_total_re = re.compile(r"(\d+)\s\+\s\d+\sin total")
    reads_total, reads_mapped = 0, 0
    output = ''
    command = "samtools flagstat " + infile
    # accumulate the full flagstat output before applying the regexes
    for line in runCmd(command):
        output += line
    reads_mapped = int(reads_mapped_re.search(output).group(1))
    reads_total = int(reads_total_re.search(output).group(1))
    print BtLog.status_d['11'] % ('{:,}'.format(reads_mapped), '{:,}'.format(reads_total), '{0:.1%}'.format(reads_mapped / reads_total))
    return reads_total, reads_mapped
def main():
    '''Entry point: build a coverage-only BlobDb from BAM/SAM/CAS libraries.'''
    args = docopt(__doc__)
    fasta_f = args['--infile']
    bam_fs = args['--bam']
    cas_fs = args['--cas']
    sam_fs = args['--sam']
    prefix = args['--output']
    no_base_cov_flag = args['--no_base_cov']
    # Make covLibs
    cov_libs = [BtCore.CovLibObj('bam' + str(idx), 'bam', lib_f) for idx, lib_f in enumerate(bam_fs)] + \
               [BtCore.CovLibObj('sam' + str(idx), 'sam', lib_f) for idx, lib_f in enumerate(sam_fs)] + \
               [BtCore.CovLibObj('cas' + str(idx), 'cas', lib_f) for idx, lib_f in enumerate(cas_fs)]
    if not (cov_libs):
        BtLog.error('31')
    blobDb = BtCore.BlobDb('cov')
    blobDb.version = blobtools.__version__
    blobDb.parseFasta(fasta_f, None)
    blobDb.parseCoverage(covLibObjs=cov_libs, no_base_cov=no_base_cov_flag, prefix=prefix)
def checkCas(infile):
    '''Run "clc_mapping_info -s" on a CAS file and return
    (seqs_total, reads_total, reads_mapped).

    Exits via BtLog.error('20') if clc_mapping_info is not on PATH.
    '''
    print BtLog.status_d['12']
    if not (which('clc_mapping_info')):
        BtLog.error('20')
    seqs_total_re = re.compile(r"\s+Contigs\s+(\d+)")
    reads_total_re = re.compile(r"\s+Reads\s+(\d+)")
    reads_mapping_re = re.compile(r"\s+Mapped reads\s+(\d+)\s+(\d+.\d+)\s+\%")
    seqs_total, reads_total, reads_mapping, mapping_rate = 0, 0, 0, 0.0
    output = ''
    command = "clc_mapping_info -s " + infile
    # accumulate the full summary output before applying the regexes
    for line in runCmd(command):
        output += line
    seqs_total = int(seqs_total_re.search(output).group(1))
    reads_mapped = int(reads_mapping_re.search(output).group(1))
    reads_total = int(reads_total_re.search(output).group(1))
    print BtLog.status_d['11'] % ('{:,}'.format(reads_mapped), '{:,}'.format(reads_total), '{0:.1%}'.format(reads_mapped / reads_total))
    return seqs_total, reads_total, reads_mapped
def mapping(): out_f, hit_f, map_f, taxid_d = None, None, None, {} hit_f = megablast_output #hit file: BLAST similarity search result (TSV format) map_f = "/home/nancy/assembly_app/blobtools/blobtools-master/taxon_n" #mapping file (TSV format), in which one column lists a sequence ID (of a subject) and another the NCBI TaxID map_col_sseqid = "0" #column of mapping file containing sequence IDs (of the subject) map_col_taxid = "2" #column of mapping file containing the TaxID of the subject hit_col_qseqid = "0" #column of the hit file containing query ID hit_col_sseqid = "1" #column of the hit file containing subject ID hit_col_score = "11" #column of the hit file containing (bit)score try: hit_col_qseqid = int(hit_col_qseqid) hit_col_sseqid = int(hit_col_sseqid) hit_col_score = int(hit_col_score) except ValueError: BtLog.error('41' % ( "--hit_column_qseqid, --hit_column_sseqid and --hit_column_score" )) if map_f: if map_col_sseqid and map_col_taxid: try: map_col_sseqid = int(map_col_sseqid) map_col_taxid = int(map_col_taxid) except ValueError: BtLog.error('44') print BtLog.status_d['1'] % ("Mapping file", map_f) taxid_d = BtIO.parseDict(map_f, map_col_sseqid, map_col_taxid) out_f = BtIO.getOutFile("taxified", hit_f, "out") else: BtLog.error('44') else: BtLog.error('41') output = [] print BtLog.status_d['1'] % ("similarity search result", hit_f) with open(hit_f) as fh: for idx, line in enumerate(fh): col = line.rstrip("\n").split() qseqid = col[hit_col_qseqid] sseqid = col[hit_col_sseqid] score = col[hit_col_score] tax_id = None if sseqid not in taxid_d: BtLog.warn_d['12'] % (sseqid, map_f) tax_id = taxid_d.get(sseqid, "N/A") output.append("%s\t%s\t%s\t%s" % (qseqid, tax_id, score, sseqid)) if output: with open(out_f, "w") as fh: print BtLog.status_d['24'] % out_f fh.write("\n".join(output) + "\n")
def readTax(infile, set_of_blobs):
    '''
    If more fields need to be parsed:
    - change hit_line_re
    - catch matches in variables
    - add as key-value pairs to hitDict
    '''
    hit_line_re = re.compile(r"^(\S+)\s+(\d+)[\;?\d+]*\s+(\d+\.*\d*)") # TEST TEST , if not split it afterwards
    with open(infile) as fh:
        for entry in fh:
            m = hit_line_re.search(entry)
            if m:
                # taxId stays a string because if int, conversion is a nightmare ...
                name, tax_id, score = m.group(1), m.group(2), float(m.group(3))
                if name not in set_of_blobs:
                    BtLog.error('19', name, infile)
                if tax_id == 'N/A':
                    BtLog.error('22', infile)
                yield {'name': name, 'taxId': tax_id, 'score': score}
def parseCovFile(cov_f):
    '''Read a COV file into {seq_name: coverage}, flooring coverage at 0.02.

    Handles both the old two-column and the new three-column layout; any
    "#" line switches the parser to new-format mode (coverage in the
    third column instead of the second).
    '''
    cov_dict = {}
    old_format = 1
    seq_name = ''
    cov = 0.0
    with open(cov_f) as fh:
        for row in fh:
            if row.startswith("#"):
                # "#" header lines only occur in the new format
                old_format = 0
                continue
            try:
                columns = row.rstrip("\n").split("\t")
                seq_name = columns[0]
                cov = columns[1] if old_format else columns[2]
                # floor tiny coverages so they remain visible on log plots
                cov_dict[seq_name] = 0.02 if float(cov) < 0.02 else float(cov)
            except:
                BtLog.error('25', cov_f)
    return cov_dict
def getNodesDB(**kwargs):
    '''
    Parsing names.dmp and nodes.dmp into the 'nodes_db' dict of dicts that
    gets JSON'ed into blobtools/data/nodes_db.json if this file does not
    exist. This file is used if neither "--names" and "--nodes" nor "--db"
    is specified.
    '''
    nodesDB = {}
    nodesDB_f = ''
    if (kwargs['names'] and kwargs['nodes']):
        print BtLog.status_d['3'] % (kwargs['nodes'], kwargs['names'])
        nodesDB = {}
        nodes_count = 0
        with open(kwargs['nodes']) as fh:
            for line in fh:
                # dmp fields are read at indices 0/2/4 — presumably because
                # columns are separated by "\t|\t" (verify against the dumps)
                nodes_col = line.split("\t")
                node = {}
                node_id = nodes_col[0]
                node['parent'] = nodes_col[2]
                node['rank'] = nodes_col[4]
                nodesDB[node_id] = node
                nodes_count += 1
        with open(kwargs['names']) as fh:
            for line in fh:
                names_col = line.split("\t")
                # only keep the scientific name for each taxon
                if names_col[6] == "scientific name":
                    nodesDB[names_col[0]]['name'] = names_col[2]
        nodesDB_f = kwargs['nodesDB']
        nodesDB['nodes_count'] = nodes_count
    elif(kwargs['nodesDB']):
        # fall back to a previously generated nodesDB file
        print BtLog.status_d['4'] % (kwargs['nodesDB'])
        nodesDB = readNodesDB(kwargs['nodesDB'])
        nodesDB_f = kwargs['nodesDB']
    else:
        BtLog.error('3')
    return nodesDB, nodesDB_f
# NOTE(review): this chunk begins mid-function — the loop below is the tail
# of a FASTA-parsing helper whose "def" line lies outside this view;
# indentation reconstructed at function-body level.
    # collect contig names in file order and initialise coverage at 0.0
    for name in readFasta(infile):
        fasta_order.append(name)
        fasta_dict[name] = 0.0
    return fasta_dict, fasta_order

if __name__ == '__main__':
    main_dir = dirname(__file__)
    #print data_dir
    args = docopt(__doc__)
    assembly_f = args['--infile']
    cov_fs = args['--cov']
    fasta_dict = {}
    fasta_order = []
    if not isfile(assembly_f):
        BtLog.error('0', assembly_f)
    else:
        fasta_dict, fasta_order = parseFasta(assembly_f)
    for cov_f in cov_fs:
        if not isfile(cov_f):
            BtLog.error('0', cov_f)
        else:
            # sum coverage across all cov files for each contig
            lib_cov_dict = BtPlot.parseCovFile(cov_f)
            for name in fasta_order:
                fasta_dict[name] = fasta_dict.get(name, 0.0) + lib_cov_dict[name]
    # emit "name<TAB>summed_coverage" in assembly order
    for name in fasta_order:
        print "%s\t%s" % (name, fasta_dict[name])
# Module-level constants and CLI argument handling for the view step.
TAXRULES = ['bestsum', 'bestsumorder']
RANKS = ['species', 'genus', 'family', 'order', 'phylum', 'superkingdom', 'all']
main_dir = dirname(__file__)
#print data_dir
args = docopt(__doc__)
blobdb_f = args['--input']
out_f = args['--out']
ranks = args['--rank']
taxrule = args['--taxrule']
hits_flag = args['--hits']
seq_list = args['--list']
# Does blobdb_f exist ?
if not isfile(blobdb_f):
    BtLog.error('0', blobdb_f)
# Are ranks sane ?
for rank in ranks:
    if rank not in RANKS:
        BtLog.error('9', rank)
if 'all' in ranks:
    # 'all' expands to every concrete rank (drops the 'all' sentinel itself)
    ranks = RANKS[0:-1]
# Is list a list of sequence names or a file?
seqs = []
if (seq_list):
    if isfile(seq_list):
        seqs = BtIO.parseList(seq_list)
    elif "," in seq_list:
        seqs = seq_list.split(",")
def validate_input_create(main_dir, args):
    '''
    Accepts:
    - main_dir
    - docopt args
    Returns:
    - title
    - fasta_f
    - fasta_type
    - cov_libs
    - hit_libs
    - nodesDB_f
    - taxrules
    - out_f
    '''
    ASSEMBLY_TYPES = [None, 'spades', 'soap', 'abyss', 'velvet']
    fasta_f = args['--infile']
    fasta_type = args['--type']
    sam_fs = args['--sam']
    bam_fs = args['--bam']
    cov_fs = args['--cov']
    cas_fs = args['--cas']
    hit_fs = args['--taxfile']
    out_f = args['--out']
    if (out_f):
        out_f = "%s.%s" % (os.path.basename(out_f), "BlobDB.json")
    else:
        out_f = "%s" % ("BlobDB.json")
    nodesDB_f = args['--db']
    names_f = args['--names']
    nodes_f = args['--nodes']
    taxrules = args['--taxrule']
    title = args['--title'] if (args['--title']) else out_f
    # Do files exist ?
    files = [x for x in list([fasta_f] + sam_fs + bam_fs + cov_fs + cas_fs + [names_f] + [nodes_f] + hit_fs) if x is not None]
    for f in files:
        if not os.path.isfile(f):
            BtLog.error('0', f)
    # Is taxonomy provided?
    if nodesDB_f == "data/nodesDB.txt":
        # default DB path is resolved relative to the installation directory
        nodesDB_f = os.path.join(main_dir, nodesDB_f)
    if not os.path.isfile(nodesDB_f) and not ((names_f) and (nodes_f)):
        BtLog.error('3')
    if not (hit_fs):
        BtLog.error('18')
    # can FASTA parser deal with assemblies
    if not fasta_type in ASSEMBLY_TYPES:
        BtLog.error('2', ",".join(ASSEMBLY_TYPES[1:]))
    # Is coverage provided?
    if not (fasta_type) and not bam_fs and not sam_fs and not cov_fs and not cas_fs:
        BtLog.error('1')
    # one CovLibObj per coverage file, named by type + index
    cov_libs = [bt.CovLibObj('bam' + str(idx), 'bam', lib_f) for idx, lib_f in enumerate(bam_fs)] + \
               [bt.CovLibObj('sam' + str(idx), 'sam', lib_f) for idx, lib_f in enumerate(sam_fs)] + \
               [bt.CovLibObj('cas' + str(idx), 'cas', lib_f) for idx, lib_f in enumerate(cas_fs)] + \
               [bt.CovLibObj('cov' + str(idx), 'cov', lib_f) for idx, lib_f in enumerate(cov_fs)]
    hit_libs = [bt.hitLibObj('tax' + str(idx), 'tax', lib_f) for idx, lib_f in enumerate(hit_fs)]
    return title, fasta_f, fasta_type, cov_libs, hit_libs, taxrules, nodesDB_f, nodes_f, names_f, out_f
# Module-level CLI argument handling and validation for a plotting step.
# NOTE(review): fragment — it ends mid-statement (the final "if" has no body
# in this view); the remainder lies outside this chunk.
taxrule = args['--taxrule']
hist_type = args['--hist']
plot_title = args['--title']
ignore_contig_length = args['--noscale']
#labels = args['--label']
#colour_f = args['--colours']
#exclude_groups = args['--exclude']
format = args['--format']
#no_plot_blobs = args['--noblobs']
#no_plot_reads = args['--noreads']
#refcov_f = args['--refcov']
#catcolour_f = args['--catcolour']
# Does blobdb_f exist ?
if not isfile(blobdb_f):
    BtLog.error('0', blobdb_f)
# Does cov_f exist ?
if not isfile(cov_f):
    BtLog.error('0', cov_f)
# parse cov file in dict
cov_dict = BtPlot.parseCovFile(cov_f)
# Are ranks sane ?
if rank not in RANKS:
    BtLog.error('9', rank)
# Are sort_order and hist_type sane?
if not sort_order in ['span', 'count']:
    BtLog.error('14', sort_order)
if not hist_type in ['span', 'count']:
# NOTE(review): fragment — these statements belong to a validate/create
# routine whose "def" line (and the fasta_f/sam_fs/... assignments) lie
# outside this view; indentation reconstructed at function-body level.
    if (out_f):
        out_f = "%s.%s" % (out_f, "BlobDB.json")
    else:
        out_f = "%s" % ("BlobDB.json")
    nodesDB_f = args['--db']
    names_f = args['--names']
    nodes_f = args['--nodes']
    taxrules = args['--taxrule']
    # default title: the FASTA basename without its extension
    title = args['--title'] if (args['--title']) else os.path.basename(".".join(fasta_f.split('.')[0:-1]))
    # Do files exist ?
    files = [x for x in list([fasta_f] + sam_fs + bam_fs + cov_fs + cas_fs + [names_f] + [nodes_f] + hit_fs) if x is not None]
    for f in files:
        if not os.path.isfile(f):
            BtLog.error('0', f)
    # Is taxonomy provided?
    if nodesDB_f == "data/nodesDB.txt":
        # default DB path is resolved relative to the installation directory
        nodesDB_f = os.path.join(main_dir, nodesDB_f)
    if not os.path.isfile(nodesDB_f) and not ((names_f) and (nodes_f)):
        BtLog.error('3')
    if not (hit_fs):
        BtLog.error('18')
    # can FASTA parser deal with assemblies
    if not fasta_type in ASSEMBLY_TYPES:
        BtLog.error('2', ",".join(ASSEMBLY_TYPES[1:]))
    # Is coverage provided?