def MLSTdb(args):
    """Build reference-allele and allele-lookup files from an allele FASTA.

    ``args`` is handed to ``getParams``; the returned mapping supplies
    ``database``, ``refset``, ``alleleFasta``, ``refstrain``, ``max_iden``,
    ``min_iden``, ``coverage``, ``paralog`` and ``relaxEnd``.

    * ``alleleFasta`` (and ``refstrain``, when given) is read from disk if it
      names an existing file, otherwise it is parsed as in-memory FASTA text.
    * Only alleles whose ``value_id`` is an all-digit positive number and
      whose ``fieldname`` contains no ``/`` are kept.
    * When ``refset`` is not None, ``buildReference`` is run; when ``refset``
      is also truthy the reference alleles are written to that path.
    * When ``database`` is truthy, a header-less CSV is written there: one
      row per allele, indexed by ``get_md5(allele['value'])`` with the
      columns ``fieldname`` and integer ``value_id``.

    Returns:
        ``(allele_text, refAlleles)``. Both are ``''`` when ``refset`` is
        None (previously ``allele_text`` was unbound in that case and the
        ``return`` raised).
    """
    params = getParams(args)
    database = params['database']
    refset = params['refset']
    alleleFasta = params['alleleFasta']
    refstrain = params['refstrain']
    max_iden, min_iden = params['max_iden'], params['min_iden']
    coverage, paralog, relaxEnd = (params['coverage'], params['paralog'],
                                   params['relaxEnd'])

    if os.path.isfile(alleleFasta):
        alleles = readFasta(uopen(alleleFasta))
    else:
        alleles = readFasta(StringIO(alleleFasta))
    alleles = [
        allele for allele in alleles
        if allele['value_id'].isdigit() and int(allele['value_id']) > 0
        and allele['fieldname'].find('/') < 0
    ]

    # Defaults so the final return is valid even when no reference is built.
    allele_text, refAlleles = '', ''
    if refset is not None:
        if refstrain:
            if os.path.isfile(refstrain):
                references = readFasta(uopen(refstrain))
            else:
                references = readFasta(StringIO(refstrain))
        else:
            # No reference strain given: use the first allele seen per locus.
            loci, references = {}, []
            for allele in alleles:
                if allele['fieldname'] not in loci:
                    loci[allele['fieldname']] = 1
                    references.append(allele)
        allele_text, refAlleles = buildReference(alleles, references,
                                                 max_iden, min_iden, coverage,
                                                 paralog, relaxEnd)
        if refset:
            with open(str(refset), 'w') as fout:
                fout.write(refAlleles + '\n')
            logger('A file of reference alleles has been generated: {0}'.format(
                refset))

    if database:
        digests = [get_md5(allele['value']) for allele in alleles]
        records = [[allele['fieldname'], int(allele['value_id'])]
                   for allele in alleles]
        # to_csv creates/truncates the file itself; no separate open() needed.
        pd.DataFrame(records, index=digests).to_csv(database, header=False)
        logger('A lookup table of all alleles has been generated: {0}'.format(
            database))
    return allele_text, refAlleles
def get_allele_info(allele_file):
    """Return per-locus allele statistics, cached in ``<allele_file>.stat``.

    If ``<allele_file>.stat`` exists it is loaded as JSON and returned
    without reading the FASTA. Otherwise every allele in ``allele_file`` is
    classified and the result is written to that cache file.

    Allele names are split on the last ``_`` into ``locus`` and
    ``allele_id``. Each allele is encoded as
    ``int(allele_id) * 1000000 + len(sequence) * 10 + pseudo`` where
    ``pseudo`` is 2 (frameshift), 3 (premature), 4 (no start), 5 (no stop)
    or 6 (intact).

    Returns:
        dict mapping locus -> {allele_id: encoded integer}.
    """
    stat_file = allele_file + '.stat'
    if os.path.isfile(stat_file):
        # Context managers make sure the cache handle is closed/flushed.
        with open(stat_file) as fin:
            return json.load(fin)

    alleles = readFasta(allele_file)
    allele_aa = transeq(alleles)
    allele_stat = {}
    for n, s in alleles.items():
        locus, allele_id = n.rsplit('_', 1)
        if locus not in allele_stat:
            allele_stat[locus] = {}
        if len(s) % 3 > 0:
            pseudo = 2  # frameshift
        else:
            # transeq output is looked up under '<name>_1'.
            aa = allele_aa.get(n + '_1', 'A')
            if aa[:-1].find('X') >= 0:
                pseudo = 3  # premature
            elif s[:3] not in ('ATG', 'GTG', 'TTG'):
                pseudo = 4  # no start
            elif aa[-1] != 'X':
                pseudo = 5  # no stop
            else:
                pseudo = 6  # intact
        allele_stat[locus][allele_id] = (int(allele_id) * 1000000 +
                                         len(s) * 10 + pseudo)

    with open(stat_file, 'w') as fout:
        json.dump(allele_stat, fout)
    return allele_stat
def do_polish_with_SNPs(self, reference, snp_file):
    """Apply a list of known changes to ``reference`` and write a FASTA.

    ``snp_file`` is a whitespace-separated text file (``''`` means "no
    changes"). Per line: the first field is the contig name, the second a
    1-based site, and the last the change:

    * ``+XXX`` inserts ``XXX`` before the site,
    * ``-XXX`` deletes ``len(XXX)`` bases starting at the site,
    * anything else replaces the single base at the site.

    Changes are applied per contig in reverse file order, so earlier
    coordinates stay valid as long as the file is sorted by site
    (assumption, not checked here).

    The parsed changes are stored on ``self.snps`` and the polished
    sequences are written to ``<prefix>.fasta`` (``prefix`` is a module-level
    name) in 100-column lines.

    Returns:
        The path ``'<prefix>.fasta'``.
    """
    sequence = readFasta(reference)
    snps = {n: [] for n in sequence}
    if snp_file != '':
        with open(snp_file) as fin:
            for line in fin:
                part = line.strip().split()
                if not part:
                    # Blank (e.g. trailing) lines used to raise IndexError.
                    continue
                snps[part[0]].append([int(part[1]), part[-1]])
    self.snps = snps

    # Work on mutable lists so slices can grow/shrink in place.
    for n in list(sequence):
        sequence[n] = list(sequence[n])

    for cont, sites in snps.items():
        bases = sequence[cont]
        for site, base in reversed(sites):
            if base.startswith('+'):
                bases[site - 1:site - 1] = base[1:]
            elif base.startswith('-'):
                bases[site - 1:(site + len(base) - 2)] = []
            else:
                bases[site - 1] = base

    out_file = '{0}.fasta'.format(prefix)
    with open(out_file, 'w') as fout:
        for n, s in sorted(sequence.items()):
            s = ''.join(s)
            fout.write('>{0}\n{1}\n'.format(
                n, '\n'.join([s[site:(site + 100)]
                              for site in xrange(0, len(s), 100)])))
    return out_file
def addGenes(genes, gene_file):
    """Add CDS records from comma-separated FASTA files to ``genes``.

    Empty entries in ``gene_file`` are ignored. For every sequence accepted
    by ``checkCDS`` an entry keyed ``'<file prefix>:<name>'`` is stored,
    where the file prefix is the text before the first ``.`` of the file
    name as given. The stored value is
    ``[file, '', 0, 0, '+', <SHA-1 of the sequence as int>, sequence]``.

    ``genes`` is updated in place and also returned.
    """
    for fasta_path in filter(None, gene_file.split(',')):
        file_tag = fasta_path.partition('.')[0]
        records = readFasta(fasta_path)
        for seq_name in records:
            seq = records[seq_name]
            if not checkCDS(seq_name, seq):
                continue
            digest = hashlib.sha1(seq.encode('utf-8')).hexdigest()
            key = '{0}:{1}'.format(file_tag, seq_name)
            genes[key] = [fasta_path, '', 0, 0, '+', int(digest, 16), seq]
    return genes
def cgMLST(allele_profile, allele_file):
    """Summarise an allele profile table against its allele FASTA.

    ``allele_profile`` is a tab-separated table whose first row holds the
    column names; columns whose name starts with ``#`` are not loci. The
    first column of every following row is used as the genome name. Allele
    calls of ``-``, ``n`` or ``N`` and negative numbers are treated as 0.

    Returns:
        ``(genome_stat, locus_stat)`` where

        * ``genome_stat`` maps genome -> list (one item per locus) of
          ``allele_number * 10 + pseudo`` (``pseudo`` is 0 when the allele is
          not found in ``allele_file``),
        * ``locus_stat`` is a list of
          ``[locus, n_alleles, mean_len, min_len, max_len]``.
    """

    def get_allele_info(alleles):
        """Map locus -> {allele_id: [length, pseudo]} for all alleles."""
        allele_aa = transeq(alleles)
        allele_stat = {}
        # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
        for n, s in alleles.items():
            locus, allele_id = n.rsplit('_', 1)
            if locus not in allele_stat:
                allele_stat[locus] = {}
            if len(s) % 3 > 0:
                pseudo = 2  # frameshift
            else:
                aa = allele_aa.get(n + '_1', 'A')
                if aa[:-1].find('X') >= 0:
                    pseudo = 3  # premature
                elif s[:3] not in ('ATG', 'GTG', 'TTG'):
                    pseudo = 4  # no start
                elif aa[-1] != 'X':
                    pseudo = 5  # no stop
                else:
                    pseudo = 6  # intact
            allele_stat[locus][allele_id] = [len(s), pseudo]
        return allele_stat

    # DataFrame.as_matrix() was removed in pandas 1.0; .values is equivalent
    # and available in every pandas release.
    matrix = pd.read_csv(allele_profile, sep='\t', header=None,
                         dtype=str).values
    loci = np.array([not m.startswith('#') for m in matrix[0]])
    data = matrix[1:, loci]
    data[np.in1d(data, ['-', 'n', 'N']).reshape(data.shape)] = '0'
    data = data.astype(int)
    data[data < 0] = 0
    loci = matrix[0][loci]
    genomes = matrix[1:, 0]

    allele_stat = get_allele_info(readFasta(allele_file))

    genome_stat = {genome: [0 for l in loci] for genome in genomes}

    locus_stat = []
    for locus in loci:
        # Collect the allele lengths once instead of three times per locus.
        lengths = [v[0] for v in allele_stat[locus].values()]
        locus_stat.append([
            locus,
            len(allele_stat[locus]),
            np.mean(lengths),
            np.min(lengths),
            np.max(lengths),
        ])

    for g, d in zip(genomes, data):
        for i, dd in enumerate(d):
            genome_stat[g][i] = dd * 10 + allele_stat.get(loci[i], {}).get(
                str(dd), [0, 0])[-1]
    return genome_stat, locus_stat
def _pileup_base_counts(part):
    """Return ``(types, counts)`` of the base calls in one mpileup line.

    ``part`` is the tab-split line; the read-base columns are taken from
    index 4 in steps of 3 and concatenated. Read-start markers (``^`` plus
    the following character), ``*``, ``$``, ``+`` and ``-`` are removed, and
    for every number left in the string that many following characters (the
    indel sequence) are dropped before counting.
    """
    bases = ''.join(part[4::3])
    # Raw strings: '\*', '\$', '\+' and '\d' are invalid escapes in normal
    # string literals and trigger warnings on recent Python versions.
    bases = re.sub(r'[\*\$\+-]', '', re.sub(r'\^.', '', bases.upper()))
    bases = re.split(r'(\d+)', bases)
    for i in range(1, len(bases), 2):
        bases[i + 1] = bases[i + 1][int(bases[i]):]
    return np.unique(list(''.join(bases[::2])), return_counts=True)


def loadBam(prefix, reference, bams, sequences, snps):
    """Call a consensus with per-base qualities from ``samtools mpileup``.

    Pass 1 collects ``[major count, other count]`` for sites with at least 3
    base calls (restricted to sites where ``sequences[contig][pos] > 0`` or
    every fifth position), derives an average depth, and fits Gaussian
    mixtures with 1-5 components (20 fits each for 2-5), keeping the lowest
    BIC. The component with the largest minor-count share defines the
    per-read log-odds used for scoring.

    Pass 2 re-reads the pileup, sets each covered position of the reference
    to its most frequent base and assigns a quality between 1 and 40 when
    the depth is at least 3 and within a factor 3 of the average depth.

    The result is written to ``<prefix>.fastq``; progress and summary lines
    go to stdout. ``snps`` is accepted but not used here.

    Raises:
        ValueError: if pass 1 finds no usable site.
    """
    mpileup_cmd = 'samtools mpileup -ABQ0 {0}'.format(' '.join(bams)).split()

    # ---- pass 1: sample sites for the depth / mixture estimate ----
    sites = []
    p = subprocess.Popen(mpileup_cmd,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         universal_newlines=True)
    for line in p.stdout:
        part = line.strip().split('\t')
        s = int(part[1]) - 1
        if s % 100000 == 0:
            sys.stdout.write('# {0}\n'.format(s))
        if sequences[part[0]][s] > 0 or s % 5 == 0:
            types, cnts = _pileup_base_counts(part)
            if np.sum(cnts) >= 3:
                if types.size > 1:
                    cnts.sort()
                    sites.append([cnts[-1], np.sum(cnts[:-1])])
                else:
                    sites.append([cnts[0], 0])
    # Drain the remaining pipes and reap the child process.
    p.communicate()

    if not sites:
        # The original failed later with an obscure numpy axis error.
        raise ValueError(
            '{0}: no site with at least 3 reads found in the pileup'.format(
                prefix))

    sites = np.array(sites)
    ave_depth = np.max([np.median(np.sum(sites, 1)), 2.])
    sys.stdout.write(
        '{3}: Average read depth: {0}; Sites between {1} and {2} will be used for hybrid estimation.\n'
        .format(ave_depth, ave_depth / 2., ave_depth * 3., prefix))
    site_depth = np.sum(sites, 1)
    sites = sites[(ave_depth / 2. <= site_depth) &
                  (site_depth <= ave_depth * 3)]

    # ---- model selection by BIC ----
    m = GaussianMixture(n_components=1, covariance_type='tied')
    m.fit(sites)
    best_model = [m.bic(sites), m]
    for n_components in xrange(2, 6):
        sys.stdout.write('# Testing {0} components.\n'.format(n_components))
        for _ in xrange(20):
            # A fresh model per attempt, so a stored best model is never
            # refitted afterwards.
            m = GaussianMixture(n_components=n_components,
                                covariance_type='tied')
            m.fit(sites)
            bic = m.bic(sites)
            if bic < best_model[0]:
                best_model = [bic, m]
    m = best_model[1]

    mId = np.argmax(m.means_.T[1] / np.sum(m.means_, 1))
    sys.stdout.write(
        '{3}: Find {0} GMM components. The most divergent group is {1} and counts for {2} of total sites.\n'
        .format(m.n_components, m.means_[mId].tolist(), m.weights_[mId],
                prefix))
    mDiv = m.means_[mId][0] / np.sum(m.means_[mId])
    mDiv = 10 * np.log10([[mDiv, 1 - mDiv], [1 - mDiv, mDiv]])

    # ---- pass 2: consensus base and quality per position ----
    seq = {n: list(s) for n, s in readFasta(reference).items()}
    qual = {n: [0] * len(s) for n, s in seq.items()}
    lowQ, lowC, highQ = 0, 0, 0
    p = subprocess.Popen(mpileup_cmd,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         universal_newlines=True)
    for line in p.stdout:
        part = line.strip().split('\t')
        s = int(part[1]) - 1
        if s % 100000 == 0:
            sys.stdout.write('# {0}\n'.format(s))
        types, cnts = _pileup_base_counts(part)
        if types.size > 0:
            depth = np.sum(cnts)
            if cnts.size == 1:
                g, top = [cnts[0], 0], 0
            else:
                top = np.argmax(cnts)
                g = [cnts[top], depth - cnts[top]]
            seq[part[0]][s] = types[top]
            if depth >= 3 and depth / 3. <= ave_depth <= depth * 3.:
                q = min(
                    40,
                    max(
                        1,
                        int(
                            round(
                                np.sum(g * mDiv[0]) - np.sum(g * mDiv[1]),
                                0))))
                qual[part[0]][s] = q
                if q < 10:
                    lowQ += 1
                else:
                    highQ += 1
            else:
                lowC += 1
    p.communicate()

    qual = {n: ''.join([chr(ss + 33) for ss in s]) for n, s in qual.items()}
    with open(prefix + '.fastq', 'w') as fout:
        for n, s in seq.items():
            fout.write('@{0}\n{1}\n+\n{2}\n'.format(n, ''.join(s), qual[n]))
    sys.stdout.write(
        '{0}: {1} good sites; {2} low covered sites; {3} low quality sites;\n'.
        format(prefix, highQ, lowC, lowQ))
    return
def filt_genes(prefix, groups, global_file, conflicts, first_classes=None):
    """Select pan genes from candidate groups and write ``<prefix>.Prediction``.

    Args:
        prefix: output prefix; results go to ``'<prefix>.Prediction'``.
        groups: mapping gene -> 2-D array of candidate regions. Column 2 is
            multiplied by column 3 and rows are re-sorted by it in place.
            NOTE(review): column meanings are not visible here — confirm.
        global_file: passed through unchanged to ``filt_per_group``.
        conflicts: array of ``(a, b, code)`` rows; converted below into a
            symmetric dict-of-dicts ``conflicts[a][b] == code``.
        first_classes: passed through unchanged to ``get_gene``.

    Returns:
        The path ``'<prefix>.Prediction'``.

    Relies on the module-level ``params`` and ``pool``.
    """
    # Columns of each per-hit record that are written out (16 columns, with
    # 3, 4, 5, 10 and 15 dropped).
    outPos = np.ones(16, dtype=bool)
    outPos[[3, 4, 5, 10, 15]] = False
    # Turn the conflict rows into a symmetric lookup table.
    c2 = {c: {} for c in np.unique(conflicts.T[:2])}
    for c in conflicts:
        c2[c[0]][c[1]] = c2[c[1]][c[0]] = c[2]
    conflicts = c2
    clust_ref = readFasta(params['clust'])
    for gene, g in groups.items():
        g.T[2] *= g.T[3]
        # Stable sort, so ties keep their previous order.
        g[:] = g[np.argsort(-g.T[2], kind='mergesort')]
    used, results, run = {}, {}, {}
    group_id = 0
    with open('{0}.Prediction'.format(prefix), 'w') as fout:
        while len(groups) > 0:
            genes = get_gene(groups, first_classes, cnt=50)
            # NOTE(review): if get_gene keeps returning nothing while groups
            # is non-empty, this loop never ends — confirm that cannot happen.
            if len(genes) <= 0:
                continue
            to_run, to_run_id, min_score, min_rank = [], [], genes[-1][
                1], genes[0][2]
            genes = {gene: score for gene, score, min_rank in genes}
            if params['orthology'] in ('ml', 'nj'):
                for gene, score in genes.items():
                    if gene not in run:
                        mat = groups[gene]
                        # Score of each row relative to the first row of the
                        # same column-1 value (presumably the best hit per
                        # genome, given the sort above — confirm).
                        _, bestPerGenome, matInGenome = np.unique(
                            mat.T[1], return_index=True, return_inverse=True)
                        region_score = mat.T[2] / mat[
                            bestPerGenome[matInGenome], 2]
                        if region_score.size >= bestPerGenome.size * 2:
                            # Drop rows whose column-5 id conflicts with an
                            # earlier kept row.
                            used2, kept = set([]), np.ones(mat.shape[0],
                                                           dtype=bool)
                            for id, m in enumerate(mat):
                                if m[5] in used2:
                                    kept[id] = False
                                else:
                                    used2.update(conflicts.get(m[5], {}))
                            mat = mat[kept]
                            _, bestPerGenome, matInGenome = np.unique(
                                mat.T[1],
                                return_index=True,
                                return_inverse=True)
                            region_score = mat.T[2] / mat[
                                bestPerGenome[matInGenome], 2]
                        if region_score.size > bestPerGenome.size * 3 and len(
                                region_score) > 500:
                            # Very large groups: keep only rows above a
                            # relative-score cut-off.
                            region_score2 = sorted(region_score, reverse=True)
                            cut = region_score2[bestPerGenome.size * 3 - 1]
                            if cut >= params['clust_identity']:
                                cut = min(
                                    region_score2[bestPerGenome.size * 5]
                                    if len(region_score) >
                                    bestPerGenome.size * 5 else
                                    params['clust_identity'],
                                    1.0 - 0.6 * (1.0 - params['clust_identity']))
                            mat = mat[region_score >= cut]
                        to_run.append([mat, clust_ref[mat[0][0]], global_file])
                        to_run_id.append(gene)
                # Refine the selected groups in the worker pool.
                working_groups = pool.map(filt_per_group, to_run)
                #working_groups = [filt_per_group(d) for d in to_run]
                for gene, working_group in zip(to_run_id, working_groups):
                    groups[gene] = working_group
                    run[gene] = 1
            else:
                # NOTE(review): `mat` is not assigned in this branch before
                # it is used; it is either unbound or left over from an
                # earlier loop pass, and the results computed here are not
                # stored back into `groups`. Looks like a per-gene loop is
                # missing — confirm against the intended behaviour.
                _, bestPerGenome, matInGenome = np.unique(mat.T[1],
                                                          return_index=True,
                                                          return_inverse=True)
                region_score = mat.T[2] / mat[bestPerGenome[matInGenome], 2]
                mat[:] = mat[region_score >= params['clust_identity']]
                used2, kept = set([]), np.ones(mat.shape[0], dtype=bool)
                for id, m in enumerate(mat):
                    for mmm in m[6]:
                        if mmm[15] in used2:
                            kept[id] = False
                            break
                    if kept[id]:
                        used2 |= {mmm[15] for mmm in m[6]}
                mat = mat[kept]
                _, bestPerGenome, matInGenome = np.unique(mat.T[1],
                                                          return_index=True,
                                                          return_inverse=True)
            while len(genes):
                # Pick the candidate with the highest summed score over the
                # first row of each distinct column-1 value.
                score, gene = max([[
                    np.sum(groups[gene][np.unique(groups[gene].T[1],
                                                  return_index=True)[1]].T[2]),
                    gene
                ] for gene in genes])
                if score < min_score:
                    break
                mat = groups.pop(gene, [])
                genes.pop(gene)
                paralog, paralog2 = 0, 0
                supergroup = {}
                used2 = {}
                for m in mat:
                    gid = m[5]
                    conflict = used.get(gid, None)
                    if conflict is not None:
                        if not isinstance(conflict, int):
                            # Region already claimed by another gene: count
                            # votes for that gene's pan gene.
                            superC = results[conflict]
                            supergroup[superC] = supergroup.get(superC, 0) + 1
                        elif conflict > 0:
                            if m[6].shape[0] <= 1 and m[3] >= params[
                                    'clust_identity']:
                                paralog = 1
                                break
                            else:
                                paralog2 += 1
                        # Excluded from the output below (only rows with
                        # column 3 > 0 are written).
                        m[3] = -1
                    else:
                        for g2, gs in conflicts.get(gid, {}).items():
                            if gs == 1:
                                if g2 not in used:
                                    used2[g2] = m[0]
                            elif gs == 2:
                                used2[g2] = 1
                            else:
                                used[g2] = 0
                # Skip the gene when it looks paralogous (a hard hit, or at
                # least a third of its rows flagged).
                if paralog or paralog2 * 3 >= mat.shape[0]:
                    continue
                else:
                    used.update(used2)
                pangene = mat[0][0]
                if len(supergroup):
                    # Merge into the most-voted existing pan gene when it
                    # covers enough of the rows.
                    pg, pid = max(supergroup.items(), key=itemgetter(1))
                    if pid * 3 >= mat.shape[0] or (pid * 5 >= mat.shape[0]
                                                   and pid > 1):
                        pangene = pg
                results[mat[0][0]] = pangene
                logger(
                    '{4} / {5}: pan gene "{3}" : "{0}" picked from rank {1} and score {2}'
                    .format(mat[0][0], min_rank, score, pangene, len(results),
                            len(groups) + len(results)))
                # One output group per kept row; one line per record in
                # column 6 of that row.
                for grp in mat[mat.T[3] > 0]:
                    group_id += 1
                    for g in grp[6]:
                        fout.write('{0}\t{1}\t{2}\t{3}\t{4}\n'.format(
                            pangene, min_rank, group_id, grp[1],
                            '\t'.join(g[outPos].astype(str).tolist())))
    return '{0}.Prediction'.format(prefix)
def ortho(args):
    """Run the ortholog-prediction pipeline end to end.

    ``args`` is parsed with ``add_args`` and merged, together with
    ``externals``, into the module-level ``params``. A worker pool of
    ``params['n_thread']`` processes is stored in the module-level ``pool``.

    Every intermediate result is only computed when its entry in ``params``
    is missing/None; the generated file name is then stored back in
    ``params``:

    * ``old_prediction`` -> ``<prefix>.old_prediction.npz``
    * ``clust`` / ``uc``  -> from ``getClust``
    * ``self_bsn``       -> ``<prefix>.self_bsn.npy``
    * ``map_bsn`` / ``conflicts`` -> ``<prefix>.map_bsn.npz`` /
      ``<prefix>.conflicts.npz``
    * ``global``         -> ``<prefix>.global.npy``
    * ``prediction``     -> from ``filt_genes``

    Finally ``write_output`` is called with the prediction, the genomes, the
    genes and the previously stored predictions. Returns None.
    """
    global params, pool
    params.update(add_args(args).__dict__)
    params.update(externals)
    pool = Pool(params['n_thread'])

    genomes, genes = readGFF(params['GFFs'])
    genes = addGenes(genes, params['genes'])

    if params.get('old_prediction', None) is None:
        params['old_prediction'] = params['prefix'] + '.old_prediction.npz'
        old_predictions = {}
        for n, g in genes.items():
            if g[1] != '':
                old_predictions.setdefault(g[1], []).append(
                    [n, g[2], g[3], g[4]])
        old_predictions = {
            gene: np.array(sorted(g), dtype=object)
            for gene, g in old_predictions.items()
        }
        np.savez_compressed(params['old_prediction'], **old_predictions)
        # Only the dict is dropped; deleting the loop variables as well
        # raised when `genes` was empty.
        del old_predictions

    genomes, genes, encodes = encodeNames(genomes, genes)

    if params.get('prediction', None) is None:
        first_classes = load_priority(params.get('priority', ''), genes,
                                      encodes)

        if params.get('clust', None) is None:
            params['genes'] = writeGenes('{0}.genes'.format(params['prefix']),
                                         genes, first_classes)
            del genes
            params['clust'], params['uc'] = getClust(
                params['prefix'], params['genes'],
                dict(identity=params['clust_identity'],
                     coverage=params['clust_match_prop'],
                     n_thread=params['n_thread']))
        genes = readFasta(params['clust'])

        if params.get('self_bsn', None) is None:
            params['self_bsn'] = params['prefix'] + '.self_bsn.npy'
            orthoGroup = get_similar_pairs(params['prefix'], params['clust'],
                                           first_classes, params)
            np.save(params['self_bsn'], orthoGroup)
        else:
            orthoGroup = np.load(params['self_bsn'])
        # Symmetric pair lookup: 1 for similar pairs, 0 for a gene with itself.
        orthoGroup = dict([[tuple(g), 1] for g in orthoGroup] +
                          [[(g[1], g[0]), 1] for g in orthoGroup] +
                          [[(g, g), 0] for g in genes])

        if params.get('map_bsn', None) is None or params.get(
                'conflicts', None) is None:
            blastab, conflicts = get_map_bsn(params['prefix'], params['clust'],
                                             genomes, orthoGroup)
            # Split the table into one block per distinct first-column value.
            blastab = np.split(
                blastab,
                np.cumsum(np.unique(blastab.T[0],
                                    return_counts=True)[1])[:-1])
            params['map_bsn'], params['conflicts'] = params[
                'prefix'] + '.map_bsn.npz', params['prefix'] + '.conflicts.npz'
            np.savez_compressed(params['map_bsn'],
                                **{str(b[0, 0]): b for b in blastab})
            np.savez_compressed(params['conflicts'], conflicts=conflicts)
            del blastab, conflicts

        if params.get('global', None) is None:
            params['global'] = params['prefix'] + '.global.npy'
            global_differences = global_difference(params['map_bsn'],
                                                   orthoGroup, 3000)
            np.save(params['global'], global_differences)
            del global_differences

        blastab = precluster(params['map_bsn'], params['global'])
        #np.savez_compressed(params['map_bsn'], **blastab)
        # NOTE(review): if `conflicts` is stored with object dtype this load
        # needs allow_pickle=True as well — confirm.
        params['prediction'] = filt_genes(
            params['prefix'], blastab, params['global'],
            np.load(params['conflicts'])['conflicts'], first_classes)
    else:
        genes = {n: s[-1] for n, s in genes.items()}

    pool.close()
    pool.join()

    if 'old_prediction' in params:
        # The archive holds object-dtype arrays (written above); NumPy
        # >= 1.16.3 refuses to load those unless allow_pickle=True. The file
        # is produced by this program, not taken from untrusted input.
        with np.load(params['old_prediction'], allow_pickle=True) as npz:
            old_predictions = dict(npz)
    else:
        old_predictions = {}
    write_output(params['prefix'], params['prediction'], genomes, genes,
                 old_predictions)
def get_quality(self, reference, reads):
    """Map ``reads`` to ``reference`` and write a quality-annotated FASTQ.

    Steps:

    1. Map with the mapper named in ``parameters['mapper']`` (``minimap2``,
       ``bwa``, anything else -> bowtie).
    2. Sum ``samtools depth`` per position and contig, compute a per-contig
       depth and a length-weighted average depth over the largest contigs
       that together cover at least half of the total length.
    3. Run pilon (with a fallback ``--fix`` list if no VCF appears).
    4. For every VCF line on a contig whose relative depth lies in
       ``[cont_depth[0], cont_depth[1])``: well-supported reference calls get
       their quality character stored; everything else is copied to
       ``<prefix>.mapping.difference``. Homozygous indels reset the
       qualities of the surrounding window to ``'!'``.
    5. Positions around entries of ``self.snps`` get at least quality 40.

    Uses the module-level ``parameters`` and ``prefix``.

    Returns:
        The path ``'<prefix>.result.fastq'``.
    """
    if parameters['mapper'] == 'minimap2':
        bams = self.__run_minimap(prefix, reference, reads, )
    elif parameters['mapper'] != 'bwa':
        bams = self.__run_bowtie(prefix, reference, reads, )
    else:
        bams = self.__run_bwa(prefix, reference, reads, )

    # sequence[name] = [bases, list of quality characters ('!' == 0)]
    sequence = readFasta(reference)
    for n, s in sequence.items():
        q = ['!'] * len(s)
        sequence[n] = [s, q]

    sites = {n: np.array([0 for ss in s[1]]) for n, s in sequence.items()}
    for bam in bams:
        if bam is not None:
            proc = Popen('{samtools} depth -q 0 -Q 0 {bam}'.format(
                bam=bam, **parameters).split(),
                         stdout=PIPE,
                         universal_newlines=True)
            for line in proc.stdout:
                part = line.strip().split()
                if len(part) > 2 and float(part[2]) > 0:
                    sites[part[0]][int(part[1]) - 1] += float(part[2])
            proc.wait()  # reap the child once its output is consumed

    # Per contig: [length, max(median depth, geometric-style mean), rel. depth]
    sites = {
        n: [
            s.size,
            np.max([np.median(s),
                    np.exp(np.mean(np.log(s + 0.5))) - 0.5]), 0.
        ]
        for n, s in sites.items()
    }
    depth = np.array(list(sites.values()))
    depth = depth[np.argsort(-depth.T[0])]
    size = np.sum(depth.T[0])
    acc = [0, 0]
    for d in depth:
        acc[0], acc[1] = acc[0] + d[0], acc[1] + d[0] * d[1]
        if acc[0] * 2 >= size:
            break
    ave_depth = acc[1] / acc[0]
    exp_mut_depth = max(ave_depth * 0.2, 2.)
    for n, s in sites.items():
        s[2] = s[1] / ave_depth
    logger('Average read depth: {0}'.format(ave_depth))

    sequence = {n: s for n, s in sequence.items() if sites[n][1] > 0.}
    with open('{0}.mapping.reference.fasta'.format(prefix), 'w') as fout:
        for n, s in sorted(sequence.items()):
            fout.write('>{0}\n{1}\n'.format(
                n, '\n'.join([s[0][site:(site + 100)]
                              for site in xrange(0, len(s[0]), 100)])))

    bam_opt = ' '.join(['--bam {0}'.format(b) for b in bams if b is not None])
    pilon_cmd = '{pilon} --fix all,breaks --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}'.format(
        bam_opt=bam_opt, **parameters)
    Popen(pilon_cmd.split(), stdout=PIPE,
          universal_newlines=True).communicate()
    if not os.path.isfile('{0}.mapping.vcf'.format(prefix)):
        pilon_cmd = '{pilon} --fix snps,indels,gaps,breaks --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}'.format(
            bam_opt=bam_opt, **parameters)
        Popen(pilon_cmd.split(),
              stdout=PIPE,
              stderr=PIPE,
              universal_newlines=True).communicate()

    cont_depth = [float(d) for d in parameters['cont_depth'].split(',')]
    logger(
        'Contigs with less than {0} depth will be removed from the assembly'.
        format(cont_depth[0] * ave_depth))
    logger('Contigs with more than {0} depth will be treated as duplicates'.
           format(cont_depth[1] * ave_depth))

    indels = []
    with open('{0}.mapping.vcf'.format(prefix)) as fin, open(
            '{0}.mapping.difference'.format(prefix), 'w') as fout:
        for line in fin:
            if line.startswith('#'):
                continue
            part = line.strip().split('\t')
            if sites[part[0]][2] < cont_depth[0] or sites[
                    part[0]][2] >= cont_depth[1]:
                continue
            # 0-based position of this record. The indel branch used to read
            # `site` before it was assigned for the current line (NameError
            # on the first indel, or a stale value from an earlier record).
            site = int(part[1]) - 1
            if part[-1] == '1/1':
                if len(part[3]) > 1:
                    indels.append([
                        part[0],
                        max(0, site - 1), site - 1 + len(part[3]) + 2
                    ])
                elif len(part[4]) > 1 and part[4] != '<DUP>':
                    indels.append([
                        part[0],
                        max(0, site - 2), site - 1 + len(part[3]) + 2
                    ])
            try:
                if part[-1] == '0/0' and len(part[3]) == 1 and len(
                        part[4]) == 1:
                    pp = part[7].split(';')
                    dp = float(pp[0][3:])
                    # 100 minus the largest of the comma-separated values in
                    # the 7th INFO entry, i.e. the share not supporting the
                    # major call (presumably percentages — confirm).
                    af = 100 - sorted(
                        [float(x) for x in pp[6][3:].split(',')])[-1]
                    if af <= 20 and dp >= 2 and dp * af / 100. <= exp_mut_depth and (
                            part[6] == 'PASS' or
                        (part[6] == 'LowCov' and parameters['metagenome'])):
                        qual = chr(int(pp[4][3:]) + 33)
                        sequence[part[0]][1][site] = qual
                    else:
                        fout.write(line)
                else:
                    fout.write(line)
            except Exception:
                # Best effort: any line that cannot be parsed is reported in
                # the difference file instead of aborting the run.
                fout.write(line)

    for n, s, e in indels:
        sequence[n][1][s:e] = ['!'] * len(sequence[n][1][s:e])

    if self.snps is not None:
        high_q = chr(40 + 33)
        for n, snvs in self.snps.items():
            if n not in sequence:
                # Contig was dropped above (no coverage): nothing to mark.
                continue
            quals = sequence[n][1]
            for site, snv in snvs:
                if snv.find('N') >= 0:
                    continue
                if snv.startswith('+'):
                    s, e = site - 4, site + 3 + len(snv)
                else:
                    s, e = site - 4, site + 4
                # Clamp the window: negative starts used to wrap around to
                # the contig end and overlong ends raised IndexError.
                for k in xrange(max(s, 0), min(e, len(quals))):
                    quals[k] = max(high_q, quals[k])

    with open('{0}.result.fastq'.format(prefix), 'w') as fout:
        p = prefix.rsplit('/', 1)[-1]
        for n, (s, q) in sequence.items():
            if sites[n][2] >= cont_depth[0]:
                fout.write('@{0} {3} {4} {5}\n{1}\n+\n{2}\n'.format(
                    p + '_' + n, s, ''.join(q), *sites[n]))
    os.unlink('{0}.mapping.vcf'.format(prefix))
    logger('Final result is written into {0}'.format(
        '{0}.result.fastq'.format(prefix)))
    return '{0}.result.fastq'.format(prefix)
def do_polish(self, reference, reads, reassemble=False, onlySNP=False):
    """Polish ``reference`` with pilon and write ``<prefix>.fasta``.

    If ``parameters['SNP']`` is set, the work is delegated to
    ``do_polish_with_SNPs`` and its result is returned. Otherwise reads are
    mapped with the configured mapper, contigs without any covered position
    are dropped, pilon is run and the accepted changes from its VCF are
    applied to the sequences.

    Args:
        reference: FASTA file to polish.
        reads: read set passed to the mapper.
        reassemble: run pilon with ``--fix all,breaks`` instead of
            ``--fix all``.
        onlySNP: apply only single-base substitutions.

    A VCF record is accepted when its last column is not ``0/0``, its
    alternative allele consists of ``ACGTN`` only, and either the filter
    column is ``PASS`` or the last four characters of the INFO column parse
    to a number >= 0.75. Accepted records are also copied to
    ``<prefix>.mapping.changes``.

    Uses the module-level ``parameters`` and ``prefix``.

    Returns:
        The path ``'<prefix>.fasta'``.
    """
    if parameters.get('SNP', None) is not None:
        return self.do_polish_with_SNPs(reference, parameters['SNP'])

    if parameters['mapper'] == 'minimap2':
        bams = self.__run_minimap(prefix, reference, reads)
    elif parameters['mapper'] != 'bwa':
        bams = self.__run_bowtie(prefix, reference, reads)
    else:
        bams = self.__run_bwa(prefix, reference, reads)

    # Names of contigs that have at least one covered position.
    sites = {}
    for bam in bams:
        if bam is not None:
            depth = Popen('{samtools} depth -q 0 -Q 0 {bam}'.format(
                bam=bam, **parameters).split(),
                          stdout=PIPE,
                          universal_newlines=True)
            for line in depth.stdout:
                part = line.strip().split()
                if len(part) > 2 and float(part[2]) > 0:
                    sites[part[0]] = 1
            depth.wait()  # reap the child once its output is consumed

    sequence = readFasta(reference)
    sequence = {n: s for n, s in sequence.items() if n in sites}
    with open('{0}.mapping.reference.fasta'.format(prefix), 'w') as fout:
        for n, s in sorted(sequence.items()):
            fout.write('>{0}\n{1}\n'.format(
                n, '\n'.join([s[site:(site + 100)]
                              for site in xrange(0, len(s), 100)])))

    bam_opt = ' '.join(['--bam {0}'.format(b) for b in bams if b is not None])
    if reassemble:
        pilon_cmd = '{pilon} --fix all,breaks --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}'.format(
            bam_opt=bam_opt, **parameters)
    else:
        pilon_cmd = '{pilon} --fix all --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}'.format(
            bam_opt=bam_opt, **parameters)
    Popen(pilon_cmd.split(), stdout=PIPE, stderr=PIPE,
          universal_newlines=True).communicate()
    if not os.path.isfile('{0}.mapping.vcf'.format(prefix)):
        # Retry with a reduced set of fixes when pilon produced no VCF.
        pilon_cmd = '{pilon} --fix snps,indels,gaps --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}'.format(
            bam_opt=bam_opt, **parameters)
        Popen(pilon_cmd.split(),
              stdout=PIPE,
              stderr=PIPE,
              universal_newlines=True).communicate()

    snps = []
    with open('{0}.mapping.vcf'.format(prefix)) as fin, open(
            '{0}.mapping.changes'.format(prefix), 'w') as fout:
        for line in fin:
            if line.startswith('#'):
                continue
            part = line.strip().split('\t')
            if part[-1] == '0/0':
                continue
            try:
                accepted = (part[6] == 'PASS' or
                            float(part[7][-4:]) >= 0.75) and re.match(
                                r'^[ACGTN]+$', part[4])
                if accepted and ((not onlySNP) or
                                 (len(part[3]) == 1 and len(part[4]) == 1)):
                    change = [part[0], int(part[1]) - 1, part[3], part[4]]
            except (IndexError, ValueError):
                # Malformed or non-numeric record: skip it. Only parse
                # errors are ignored; I/O failures now propagate.
                continue
            else:
                if not (accepted and
                        ((not onlySNP) or
                         (len(part[3]) == 1 and len(part[4]) == 1))):
                    continue
            snps.append(change)
            fout.write(line)
    os.unlink('{0}.mapping.vcf'.format(prefix))

    for n in list(sequence):
        sequence[n] = list(sequence[n])
    # Apply from the end so earlier coordinates stay valid after indels.
    for n, site, ori, alt in reversed(snps):
        s = sequence[n]
        end = site + len(ori)
        s[site:end] = alt
    logger('Observed and corrected {0} changes using PILON'.format(len(snps)))

    with open('{0}.fasta'.format(prefix), 'w') as fout:
        for n, s in sorted(sequence.items()):
            s = ''.join(s)
            fout.write('>{0}\n{1}\n'.format(
                n, '\n'.join([s[site:(site + 100)]
                              for site in xrange(0, len(s), 100)])))
    return '{0}.fasta'.format(prefix)
def loadBam(prefix, reference, bams, sequences, snps):
    """Call a consensus with qualities from a pilon VCF of ``bams``.

    Contigs of ``reference`` that appear in the ``samtools depth`` output of
    any BAM are written to ``<prefix>.mapping.reference.fasta`` and analysed
    with pilon (``--fix snps,indels,gaps``).

    Pass 1 over the VCF collects, for single-base records at positions where
    ``sequences[contig][pos] >= 0`` with depth >= 3 that are heterozygous
    (``0/1``) or have a quality below 10, the pair
    ``[largest base count, sum of the others]``. Their pooled ratio ``p``
    gives a per-read log-odds score.

    Pass 2 assigns a base and a quality to every single-base record:
    uncertain records use the most frequent base and a quality derived from
    the base counts (1-40); other records keep the pilon quality and take
    the alternative base when the genotype is ``1/1``.

    The result is written to ``<prefix>.metaCaller.fastq``; the intermediate
    pilon files are removed. ``snps`` is accepted but not used here.

    Returns:
        The path ``'<prefix>.metaCaller.fastq'``.
    """
    sequence = readFasta(reference)
    sequence = {n: [s, [0] * len(s)] for n, s in sequence.items()}

    sites = {}
    for bam in bams:
        if bam is not None:
            depth = subprocess.Popen(
                '{samtools} depth -q 0 -Q 0 {bam}'.format(
                    bam=bam, **externals).split(),
                stdout=subprocess.PIPE,
                universal_newlines=True)
            try:
                # samtools depth has no header row: without header=None the
                # first record was consumed as column names. dtype=str keeps
                # numeric-looking contig names comparable to FASTA names.
                names = pd.read_csv(depth.stdout,
                                    sep='\t',
                                    header=None,
                                    usecols=[0],
                                    dtype=str).iloc[:, 0]
                sites.update({cName: 1 for cName in names.unique()})
            except ValueError:
                # Empty or unparsable depth output: this BAM covers nothing.
                pass
            finally:
                depth.wait()

    sequence = {n: s for n, s in sequence.items() if n in sites}
    with open('{0}.mapping.reference.fasta'.format(prefix), 'w') as fout:
        for n, s in sorted(sequence.items()):
            fout.write('>{0}\n{1}\n'.format(
                n, '\n'.join([
                    s[0][site:(site + 100)]
                    for site in xrange(0, len(s[0]), 100)
                ])))

    bam_opt = ' '.join(['--bam {0}'.format(b) for b in bams if b is not None])
    pilon_cmd = '{pilon} --fix snps,indels,gaps --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}'.format(
        prefix=prefix, bam_opt=bam_opt, **externals)
    subprocess.Popen(pilon_cmd.split(),
                     stdout=subprocess.PIPE,
                     universal_newlines=True).communicate()

    # ---- pass 1: estimate the major-base share among uncertain sites ----
    uncertains = []
    with open('{0}.mapping.vcf'.format(prefix)) as fin:
        for line in fin:
            if line.startswith('#'):
                continue
            part = line.strip().split('\t')
            if sequences[part[0]][int(part[1]) - 1] >= 0:
                if len(part[3]) == 1 and len(part[4]) == 1:
                    pp = part[7].split(';')
                    dp = float(pp[0][3:])
                    if dp >= 3:
                        qd = int(pp[4][3:])
                        if part[-1] == '0/1' or qd < 10:
                            bcs = sorted(
                                [float(bc) for bc in pp[5][3:].split(',')])
                            uncertains.append([bcs[-1], np.sum(bcs[:-1])])
    # NOTE(review): with no uncertain site the next lines fail (empty
    # array); the intended result for that case is not visible here.
    uncertains = np.array(uncertains)
    p = np.sum(uncertains.T[0]) / np.sum(uncertains)
    qPerRead = 10 * (np.log10(p) - np.log10(1 - p))

    for n in sequence:
        sequence[n][0] = list(sequence[n][0])

    # ---- pass 2: base and quality per single-base record ----
    highQ, lowQ, lowC = 0, 0, 0
    with open('{0}.mapping.vcf'.format(prefix)) as fin:
        for line in fin:
            if line.startswith('#'):
                continue
            part = line.strip().split('\t')
            if len(part[3]) == 1 and len(part[4]) == 1:
                s = int(part[1]) - 1
                pp = part[7].split(';')
                dp = float(pp[0][3:])
                qd = int(pp[4][3:])
                if part[-1] == '0/1' or qd < 10:
                    bcs = np.array([int(bc) for bc in pp[5][3:].split(',')])
                    if np.sum(bcs) > 0:
                        # Counts are indexed in A, C, G, T order.
                        sequence[part[0]][0][s] = ['A', 'C', 'G',
                                                   'T'][np.argmax(bcs)]
                    else:
                        sequence[part[0]][0][s] = part[3]
                    if dp < 3:
                        lowC += 1
                    else:
                        bcs.sort()
                        bcs = [bcs[-1], np.sum(bcs[:-1])]
                        q1 = binom.cdf(bcs[0], bcs[0] + bcs[1], p)
                        q2 = qPerRead * (bcs[0] -
                                         bcs[1]) if q1 >= 0.05 else 1
                        if q2 >= 10:
                            highQ += 1
                        else:
                            lowQ += 1
                        sequence[part[0]][1][s] = min(40, max(1, int(q2)))
                else:
                    if dp < 3:
                        lowC += 1
                    else:
                        if qd >= 10:
                            highQ += 1
                        else:
                            lowQ += 1
                        sequence[part[0]][1][s] = qd
                    if part[-1] == '1/1':
                        sequence[part[0]][0][s] = part[4]

    logger(
        '{0}: Expected mix-up: {1} {2} ; Got highQ {3} ; lowQ {4} ; lowC {5}'.
        format(prefix, uncertains.shape[0], p, highQ, lowQ, lowC))

    with open('{0}.metaCaller.fastq'.format(prefix), 'w') as fout:
        tag = prefix.rsplit('/', 1)[-1]
        for n, (s, q) in sequence.items():
            fout.write('@{0}\n{1}\n+\n{2}\n'.format(
                tag + '_' + n, ''.join(s),
                ''.join([chr(qq + 33) for qq in q])))
    os.unlink('{0}.mapping.vcf'.format(prefix))
    os.unlink('{0}.mapping.fasta'.format(prefix))
    os.unlink('{0}.mapping.reference.fasta'.format(prefix))
    return '{0}.metaCaller.fastq'.format(prefix)
def get_quality(self, reference, reads):
    """Map ``reads`` to ``reference`` and write a quality-annotated FASTQ.

    Steps:

    1. Map with bwa when ``parameters['mapper'] == 'bwa'``, otherwise with
       bowtie.
    2. Sum ``samtools depth`` per position and contig, take the mean depth
       per contig and a length-weighted average over the largest contigs
       that together cover at least half of the total length.
    3. Run pilon with ``--fix all,breaks``.
    4. For every VCF line on a contig whose relative depth lies in
       ``[cont_depth[0], cont_depth[1])``: well-supported reference calls get
       their quality character stored; every other line is copied to
       ``<prefix>.mapping.difference``.
    5. Positions around entries of ``self.snps`` get at least quality 40.

    Uses the module-level ``parameters`` and ``prefix``.

    Returns:
        The path ``'<prefix>.result.fastq'``.
    """
    if parameters['mapper'] != 'bwa':
        bams = self.__run_bowtie(reference, reads)
    else:
        bams = self.__run_bwa(reference, reads)

    # sequence[name] = [bases, qualities]; qualities become a mutable list.
    sequence = readFasta(filename=reference, qual=0)
    # .items() instead of the Python 2 only .iteritems() throughout.
    for n, s in sequence.items():
        s[1] = list(s[1])

    sites = {n: np.array([0 for ss in s[1]]) for n, s in sequence.items()}
    for bam in bams:
        if bam is not None:
            # universal_newlines=True yields text on Python 2 and 3, so the
            # output can be split on '\n'.
            depth = Popen('{samtools} depth -q 0 -Q 0 {bam}'.format(
                bam=bam, **parameters).split(),
                          stdout=PIPE,
                          universal_newlines=True).communicate()[0]
            for line in depth.split('\n'):
                part = line.strip().split()
                if len(part) > 2 and float(part[2]) > 0:
                    sites[part[0]][int(part[1]) - 1] += float(part[2])

    # Per contig: [length, mean depth, relative depth (filled in below)]
    sites = {n: [s.size, np.mean(s), 0.] for n, s in sites.items()}
    # list(...) is required on Python 3, where values() is a view.
    depth = np.array(list(sites.values()))
    depth = depth[np.argsort(-depth.T[0])]
    size = np.sum(depth.T[0])
    acc = [0, 0]
    for d in depth:
        acc[0], acc[1] = acc[0] + d[0], acc[1] + d[0] * d[1]
        if acc[0] * 2 >= size:
            break
    ave_depth = acc[1] / acc[0]
    exp_mut_depth = max(ave_depth * 0.2, 1.)
    for n, s in sites.items():
        s[2] = s[1] / ave_depth
    logger('Average read depth: {0}'.format(ave_depth))
    logger('Sites with over {0} or 15% unsupported reads is not called'.
           format(exp_mut_depth))

    sequence = {n: s for n, s in sequence.items() if sites[n][1] > 0.}
    with open('{0}.mapping.reference.fasta'.format(prefix), 'w') as fout:
        for n, s in sorted(sequence.items()):
            fout.write('>{0}\n{1}\n'.format(
                n, '\n'.join([
                    s[0][site:(site + 100)]
                    for site in range(0, len(s[0]), 100)
                ])))

    bam_opt = ' '.join(
        ['--bam {0}'.format(b) for b in bams if b is not None])
    pilon_cmd = '{pilon} --fix all,breaks --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}'.format(
        bam_opt=bam_opt, **parameters)
    Popen(pilon_cmd.split(), stdout=PIPE).communicate()

    cont_depth = [float(d) for d in parameters['cont_depth'].split(',')]
    logger(
        'Contigs with less than {0} depth will be removed from the assembly'
        .format(cont_depth[0] * ave_depth))
    logger(
        'Contigs with more than {0} depth will be treated as duplicates'.
        format(cont_depth[1] * ave_depth))

    with open('{0}.mapping.vcf'.format(prefix)) as fin, open(
            '{0}.mapping.difference'.format(prefix), 'w') as fout:
        for line in fin:
            if line.startswith('#'):
                continue
            part = line.strip().split('\t')
            if sites[part[0]][2] < cont_depth[0] or sites[
                    part[0]][2] >= cont_depth[1]:
                continue
            try:
                confirmed = False
                if part[-1] == '0/0' and len(part[3]) == 1 and len(
                        part[4]) == 1:
                    # Depth from the first INFO entry; the last four
                    # characters of INFO are parsed as the fraction of
                    # unsupported reads.
                    dp, af = float(part[7].split(';', 1)[0][3:]), float(
                        part[7][-4:])
                    if af < 0.15 and dp >= 3 and dp * af <= exp_mut_depth:
                        if part[6] == 'PASS' or (part[6] == 'LowCov' and
                                                 parameters['metagenome']):
                            site = int(part[1]) - 1
                            qual = chr(int(part[7].split(';')[4][3:]) + 33)
                            sequence[part[0]][1][site] = qual
                            confirmed = True
                # Every record that is not confirmed is reported, whichever
                # test it failed (same rule as the merged condition in the
                # newer get_quality).
                if not confirmed:
                    fout.write(line)
            except Exception:
                # Best effort: unparsable lines go to the difference file.
                fout.write(line)

    if self.snps is not None:
        high_q = chr(40 + 33)
        for n, snvs in self.snps.items():
            if n not in sequence:
                # Contig was dropped above (no coverage): nothing to mark.
                continue
            quals = sequence[n][1]
            for site, snv in snvs:
                if snv.find('N') >= 0:
                    continue
                if snv.startswith('+'):
                    s, e = site - 4, site + 3 + len(snv)
                else:
                    s, e = site - 4, site + 4
                # Clamp the window: negative starts used to wrap around to
                # the contig end and overlong ends raised IndexError.
                for k in range(max(s, 0), min(e, len(quals))):
                    quals[k] = max(high_q, quals[k])

    with open('{0}.result.fastq'.format(prefix), 'w') as fout:
        for n, (s, q) in sequence.items():
            if sites[n][2] >= cont_depth[0]:
                fout.write('@{0} {3} {4} {5}\n{1}\n+\n{2}\n'.format(
                    n, s, ''.join(q), *sites[n]))
    os.unlink('{0}.mapping.vcf'.format(prefix))
    logger('Final result is written into {0}'.format(
        '{0}.result.fastq'.format(prefix)))
    return '{0}.result.fastq'.format(prefix)