def assemble_amplicons(contigs_fa=None, ref_fa=None, ref_gtf=None, outdir='.',
                       sample_id='sampleXX', padding=50, min_contig_len=200,
                       keep_tmp=False, quiet=False, logfile=None, debug=False):
    """ Pipeline step to assemble contigs using reference and amplicon regions

    Args:
        contigs_fa (str): Path to fasta file with assembled contigs
        ref_fa (str): Path to reference fasta file
        ref_gtf (str): Path to reference GTF file with amplicons
        outdir (str): Path to output directory
        sample_id (str): Name to append to scaffold sequence
        padding (int): Bases to include outside reference annotation
        min_contig_len (int): Minimum contig length for tiling path
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run

    Returns:
        out_assembly (str): Path to assembled amplicons (FASTA)
        out_summary (str): Path to assembly summary
        out_padded (str): Path to padded output file
    """
    # Check dependencies
    sysutils.check_dependency('nucmer')
    sysutils.check_dependency('delta-filter')
    sysutils.check_dependency('show-tiling')

    # Outputs
    out_assembly = os.path.join(outdir, 'amplicon_assembly.fna')
    out_summary = os.path.join(outdir, 'amplicon_summary.txt')
    out_padded = os.path.join(outdir, 'amplicon_padded.out')
    if os.path.exists(out_padded):
        os.unlink(out_padded)

    # Temporary directory
    tempdir = sysutils.create_tempdir('assemble_amplicons', None, quiet, logfile)

    # Create fasta file with sequence IDs only (remove description)
    tmp_contigs_fa = sequtils.clean_seqnames_file(contigs_fa, tempdir)

    # Load reference sequence(s)
    refseqs = {s.id: s for s in SeqIO.parse(ref_fa, 'fasta')}

    # For each amplicon, extract the sequence from the reference and scaffold
    # using nucmer
    amplicon_alignments = []
    amps = [gl for gl in gtfparse.gtf_parser(ref_gtf)
            if gl.feature == 'amplicon']

    for gl in amps:
        msg = 'Amplicon ref|%s|reg|%s\n' % (gl.chrom, gl.attrs['name'])
        sysutils.log_message(msg, quiet, logfile)

        # Extract reference amplicon
        amp_s = max(0, (gl.start - 1) - padding)
        amp_e = min(len(refseqs[gl.chrom]), gl.end + padding)
        ampseq = refseqs[gl.chrom].seq[amp_s:amp_e]
        amplicon_fa = os.path.join(tempdir, 'subject.fa')
        with open(amplicon_fa, 'w') as outh:
            print('>ref|%s|reg|%s' % (gl.chrom, gl.attrs['name']), file=outh)
            print(sequtils.wrap(str(ampseq)), file=outh)

        # Align with nucmer
        fil, til = alignutils.align_nucmer(
            tmp_contigs_fa, amplicon_fa, tempdir,
            min_contig_len=min_contig_len,
            quiet=quiet, logfile=logfile, debug=debug)

        # Skip everything else if debugging
        if debug:
            continue

        # Parse tiling and show alignments
        trows = [alignutils.TilingRow(l) for l in open(til, 'r')]
        if not trows:
            amplicon_alignments.append((gl.chrom, gl.attrs['name'], None))
        else:
            # Initialize alignment
            amp_seq = SeqIO.read(amplicon_fa, 'fasta')
            combined = alignutils.EmptyReferenceAlignment(
                str(amp_seq.seq).lower())
            for tr in trows:
                out = alignutils.show_aligns(tr.ref, tr.qry, fil)
                for nucaln in alignutils.parse_show_aligns(out):
                    combined = combined.merge_alignments(nucaln)
                    with open(out_padded, 'a') as outh:
                        print('%s\n%s\n%s' % (tr, combined.raln(),
                                              combined.qaln()), file=outh)
            amplicon_alignments.append((gl.chrom, gl.attrs['name'], combined))

        # Cleanup
        for f in [fil, til, amplicon_fa]:
            if os.path.isfile(f):
                os.unlink(f)

    # Write to output files
    with open(out_assembly, 'w') as outseq, open(out_summary, 'w') as outsum:
        for ref_id, reg, combined in amplicon_alignments:
            amp_id = sequtils.make_seq_id(sid=sample_id, ref=ref_id, reg=reg)
            if combined is None:
                msg1 = '%s\tFAIL\t%d' % (amp_id, 0)
                msg2 = u'%s\tFAIL\t%d\t%s\n' % (amp_id, 0, u"👎🏼")
                if logfile is not None:
                    print(u'%s\tFAIL\t%d\t%s' % (amp_id, 0, u"👎🏼"),
                          file=logfile)
            else:
                scaf, s, e = combined.scaffold2()
                msg1 = '%s\tPASS\t%d' % (amp_id, len(scaf))
                msg2 = u'%s\tPASS\t%d\t%s\n' % (amp_id, len(scaf), u"👍🏼")
                print('>%s' % (amp_id), file=outseq)
                print('%s' % sequtils.wrap(scaf), file=outseq)
            print(msg1, file=outsum)
            sysutils.log_message(msg2, quiet, logfile)

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'assemble_amplicons', quiet, logfile)

    return out_assembly, out_summary, out_padded
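
# Example invocation of assemble_amplicons (a sketch; the file names below are
# hypothetical and assume the MUMmer tools are on the PATH):
#
#     out_fa, out_sum, out_pad = assemble_amplicons(
#         contigs_fa='denovo_contigs.fna',
#         ref_fa='reference.fasta',
#         ref_gtf='reference_amplicons.gtf',
#         outdir='sample01',
#         sample_id='sample01',
#     )
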
def align_reads(
        fq1=None, fq2=None, fqU=None, ref_fa=None, outdir='.',
        bt2_preset='sensitive-local', sample_id='sampleXX',
        no_realign=False, remove_duplicates=False, encoding=None,
        ncpu=1, xmx=sysutils.get_java_heap_size(),
        keep_tmp=False, quiet=False, logfile=None, debug=False,
):
    """ Pipeline step to align reads

    Args:
        fq1 (str): Path to fastq file with read 1
        fq2 (str): Path to fastq file with read 2
        fqU (str): Path to fastq file with unpaired reads
        ref_fa (str): Path to reference fasta file
        outdir (str): Path to output directory
        bt2_preset (str): Bowtie2 preset to use for alignment
        sample_id (str): Read group ID
        no_realign (bool): Do not realign indels
        remove_duplicates (bool): Remove duplicates from final alignment
        encoding (str): Quality score encoding
        ncpu (int): Number of CPUs to use
        xmx (int): Maximum heap size for JVM in GB
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run

    Returns:
        out_aligned (str): Path to aligned BAM file
        out_bt2 (str): Path to bowtie2 report
    """
    # Check inputs
    if fq1 is not None and fq2 is not None and fqU is None:
        input_reads = "paired"  # Paired end
    elif fq1 is None and fq2 is None and fqU is not None:
        input_reads = "single"  # Single end
    elif fq1 is not None and fq2 is not None and fqU is not None:
        input_reads = "both"
    else:
        msg = "incorrect input reads; requires either "
        msg += "(--fq1 AND --fq2) OR (--fqU) OR (--fq1 AND --fq2 AND --fqU)"
        raise MissingRequiredArgument(msg)

    if encoding is None:
        if input_reads == 'single':
            encoding = helpers.guess_encoding(fqU)
        else:
            encoding = helpers.guess_encoding(fq1)

    # Check dependencies
    sysutils.check_dependency('bowtie2')
    sysutils.check_dependency('samtools')
    sysutils.check_dependency('picard')

    # Identify correct command for GATK
    GATK_BIN = sysutils.determine_dependency_path(['gatk', 'gatk3'])

    # Set JVM heap argument (for GATK)
    JAVA_HEAP = '_JAVA_OPTIONS="-Xmx%dg"' % xmx

    # Outputs
    out_aligned = os.path.join(outdir, 'aligned.bam')
    out_bt2 = os.path.join(outdir, 'aligned.bt2.out')

    # Temporary directory
    tempdir = sysutils.create_tempdir('align_reads', None, quiet, logfile)

    # Copy and index initial reference
    curref = os.path.join(tempdir, 'initial.fasta')
    cmd1 = ['cp', ref_fa, curref]
    cmd2 = ['samtools', 'faidx', curref]
    cmd3 = ['picard', 'CreateSequenceDictionary',
            'R=%s' % curref,
            'O=%s' % os.path.join(tempdir, 'initial.dict')]
    cmd4 = ['bowtie2-build', curref, os.path.join(tempdir, 'initial')]
    sysutils.command_runner([cmd1, cmd2, cmd3, cmd4], 'align_reads:index',
                            quiet, logfile, debug)

    # Align with bowtie2
    cmd5 = [
        'bowtie2',
        '-p', '%d' % ncpu,
        '--phred33' if encoding == "Phred+33" else '--phred64',
        '--no-unal',
        '--rg-id', sample_id,
        '--rg', 'SM:%s' % sample_id,
        '--rg', 'LB:1',
        '--rg', 'PU:1',
        '--rg', 'PL:illumina',
        '--%s' % bt2_preset,
        '-x', '%s' % os.path.join(tempdir, 'initial'),
    ]
    # Add read arguments; with "both", paired and unpaired reads are provided
    if input_reads in ['paired', 'both', ]:
        cmd5 += ['-1', fq1, '-2', fq2, ]
    if input_reads in ['single', 'both', ]:
        cmd5 += ['-U', fqU, ]
    cmd5 += ['-S', os.path.join(tempdir, 'aligned.bt2.sam'), ]
    cmd5 += ['2>', out_bt2, ]

    try:
        sysutils.command_runner([cmd5, ], 'align_reads:bowtie2',
                                quiet, logfile, debug)
    except PipelineStepError as e:
        if os.path.exists(out_bt2):
            with open(out_bt2, 'r') as fh:
                print('[--- bowtie2 stderr ---]\n%s' % fh.read(),
                      file=sys.stderr)
        raise

    cmd6 = [
        'samtools', 'view', '-u', os.path.join(tempdir, 'aligned.bt2.sam'),
        '|',
        'samtools', 'sort', '>', os.path.join(tempdir, 'sorted.bam'),
    ]
    cmd7 = ['samtools', 'index', os.path.join(tempdir, 'sorted.bam'), ]
    sysutils.command_runner([cmd6, cmd7, ], 'align_reads:samsort',
                            quiet, logfile, debug)

    cur_bam = os.path.join(tempdir, 'sorted.bam')

    if remove_duplicates:
        sysutils.log_message('[--- Removing duplicates ---]', quiet, logfile)
    else:
        sysutils.log_message('[--- Marking duplicates ---]', quiet, logfile)

    # MarkDuplicates
    cmd8 = [
        'picard', 'MarkDuplicates',
        'CREATE_INDEX=true',
        'USE_JDK_DEFLATER=true',
        'USE_JDK_INFLATER=true',
        'M=%s' % os.path.join(tempdir, 'rmdup.metrics.txt'),
        'I=%s' % cur_bam,
        'O=%s' % os.path.join(tempdir, 'rmdup.bam'),
    ]
    if remove_duplicates:
        cmd8 += ['REMOVE_DUPLICATES=true', ]
    sysutils.command_runner([cmd8, ], 'align_reads:markdups',
                            quiet, logfile, debug)
    cur_bam = os.path.join(tempdir, 'rmdup.bam')

    if no_realign:
        print('[--- Skipping realignment ---]', file=sys.stderr)
    else:
        # RealignerTargetCreator
        cmd9 = [
            JAVA_HEAP, GATK_BIN, '-T', 'RealignerTargetCreator',
            '-I', cur_bam,
            '-R', curref,
            '-o', os.path.join(tempdir, 'tmp.intervals'),
        ]
        # IndelRealigner
        cmd10 = [
            JAVA_HEAP, GATK_BIN, '-T', 'IndelRealigner',
            '--use_jdk_deflater', '--use_jdk_inflater',
            '-maxReads', '1000000',
            '-dt', 'NONE',
            '-I', cur_bam,
            '-R', curref,
            '-targetIntervals', os.path.join(tempdir, 'tmp.intervals'),
            '-o', os.path.join(tempdir, 'realign.bam'),
        ]
        sysutils.command_runner([cmd9, cmd10, ], 'align_reads:realign',
                                quiet, logfile, debug)
        cur_bam = os.path.join(tempdir, 'realign.bam')

    # Check that cur_bam was created
    if not os.path.exists(cur_bam):
        msg = "BAM does not exist: %s" % cur_bam
        raise sysutils.PipelineStepError(msg)

    cmd11a = ['rm', '-f', out_aligned, ]
    cmd11b = ['mv', cur_bam, out_aligned, ]
    cmd11c = ['samtools', 'index', out_aligned, ]
    sysutils.command_runner([cmd11a, cmd11b, cmd11c, ], 'align_reads:copy',
                            quiet, logfile, debug)

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'align_reads', quiet, logfile)

    return out_aligned, out_bt2
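
# Example invocation of align_reads (a sketch; fastq and reference paths are
# hypothetical):
#
#     bam, bt2_report = align_reads(
#         fq1='sample01_1.fastq', fq2='sample01_2.fastq',
#         ref_fa='amplicon_assembly.fna', outdir='sample01',
#         bt2_preset='sensitive-local', sample_id='sample01', ncpu=4,
#     )
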
def vcf_to_consensus(
        vcf=None, outdir='.', sampidx=0, min_dp=5, major=0.5, minor=0.2,
        keep_tmp=False, quiet=False, logfile=None,
):
    """ Pipeline step to create consensus sequence from VCF

    Args:
        vcf (str): Path to variant calls (VCF)
        outdir (str): Path to output directory
        sampidx (int): Index for sample if multi-sample VCF
        min_dp (int): Minimum depth to call site
        major (float): Allele fraction to make unambiguous call
        minor (float): Allele fraction to make ambiguous call
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file

    Returns:
        out_fasta (str): Path to consensus sequence (FASTA)
    """
    # Check inputs
    if vcf is None:
        raise sysutils.PipelineStepError('VCF file is required')

    # Outputs
    out_fasta = os.path.join(outdir, 'consensus.fna')

    sysutils.log_message('[--- vcf_to_consensus ---]\n', quiet, logfile)
    sysutils.log_message('VCF: %s\n' % vcf, quiet, logfile)

    # Parse VCF
    chroms = []
    samples = []
    if os.path.splitext(vcf)[1] == '.gz':
        lines = (l.decode('utf-8').strip('\n') for l in gzip.open(vcf, 'rb'))
    else:
        lines = (l.strip('\n') for l in open(vcf, 'r'))

    # Parse headers
    for l in lines:
        if l.startswith('##'):
            m = re.match(r'##contig=<ID=(\S+),length=(\d+)>', l)
            if m:
                chroms.append((m.group(1), int(m.group(2))))
        else:
            assert l.startswith('#')
            cols = l.strip('#').split('\t')
            samples = cols[9:]
            break

    if len(samples) <= sampidx:
        msg = 'Sample index %d does not exist. Samples: %s' % (sampidx,
                                                               str(samples))
        raise sysutils.PipelineStepError(msg)

    chrom_ordered = [_[0] for _ in chroms]
    chroms = dict(chroms)
    newseqs = dict((c, ['.'] * chroms[c]) for c in list(chroms.keys()))
    imputed = dict((c, [''] * chroms[c]) for c in list(chroms.keys()))

    for l in lines:
        chrom, start, stop, RA, AA, info, svals = parse_vcf_sample(l, sampidx)
        gt = call_gt(RA, AA, svals, min_dp, major, minor)
        if gt is None:
            imputed[chrom][start - 1] = RA[0].lower()
        else:
            if len(gt) == 1:
                newseqs[chrom][start - 1] = gt[0]
                imputed[chrom][start - 1] = gt[0]
            else:
                if all(len(_) == 1 for _ in gt):
                    newseqs[chrom][start - 1] = sequtils.get_ambig(gt)
                    imputed[chrom][start - 1] = sequtils.get_ambig(gt)
                else:
                    newseqs[chrom][start - 1] = ''.join(gt[0])
                    imputed[chrom][start - 1] = ''.join(gt[0])

    # newseqs = imputed
    sysutils.log_message('Output FASTA: %s\n' % out_fasta, quiet, logfile)

    with open(out_fasta, 'w') as outh:
        for chrom in chrom_ordered:
            new_seqid = sequtils.update_seq_id(chrom, samples[sampidx])
            new_seq = ''.join(newseqs[chrom]).replace('.', 'n')
            m = re.match(r'^(?P<pre>n*)(?P<seq>[^n].+[^n])?(?P<suf>n*)$',
                         new_seq)
            if m.group('seq') is None:
                msg = u'%s\tFAIL\t%d\t%s\n' % (new_seqid, 0, u"👎🏼")
                # Don't output sequence if not present
            else:
                msg = u'%s\tPASS\t%d\t%s\n' % (new_seqid,
                                               len(m.group('seq')), u"👍🏼")
                print('>%s SM:%s' % (new_seqid, samples[sampidx]), file=outh)
                print(sequtils.wrap(new_seq), file=outh)
            sysutils.log_message(msg, quiet, logfile)

    return out_fasta
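
# Example invocation of vcf_to_consensus (a sketch; the VCF path is
# hypothetical):
#
#     consensus_fa = vcf_to_consensus(
#         vcf='variants.vcf.gz', outdir='sample01',
#         sampidx=0, min_dp=5, major=0.5, minor=0.2,
#     )
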
def ph_parser(
        haplotypes_fa=None, outdir='.', prefix=None, keep_gaps=False,
        quiet=False, logfile=None,
):
    """ Parse fasta-ish haplotype output from PredictHaplo

    Args:
        haplotypes_fa (str): Path to haplotype file from PredictHaplo (fasta-ish)
        outdir (str): Path to output directory
        prefix (str): Prefix to add to sequence names
        keep_gaps (bool): Do not remove gaps from alignment
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file

    Returns:
    """
    summary_txt = open(os.path.join(outdir, 'ph_summary.txt'), 'w')
    newseq_fa = open(os.path.join(outdir, 'ph_haplotypes.fna'), 'w')

    num_hap = 0
    freq = []
    fasta = []
    newseq = None
    ph = os.path.basename(haplotypes_fa).split(".")[0]

    for l in open(haplotypes_fa, 'r'):
        l = l.strip('\n')
        if l.startswith('>'):
            num_hap += 1
            if newseq is not None:
                fasta.append(newseq)
            newseq = [ph, l.strip(">"), None, ""]
        elif l.startswith(';'):
            parts = l.strip(';').split(':')
            if parts[0] == 'Freq':
                freq.append(float(parts[1]))
                newseq[2] = float(parts[1])
            else:
                pass
        else:
            newseq[3] += l.strip('\n')
    fasta.append(newseq)

    if len(fasta) == num_hap:
        sysutils.log_message("Number of haplotypes is correct.\n",
                             quiet, logfile)

    freq_sqrd = [x ** 2 for x in freq]
    freq_sqrd_sum = sum(freq_sqrd)
    hap_div = ((old_div(7000, (7000 - 1))) * (1 - freq_sqrd_sum))
    print("PH_num_hap %s" % num_hap, file=summary_txt)
    print("PH_hap_diversity %s" % hap_div, file=summary_txt)

    seqlen = len(fasta[0][-1])
    equal_len = True
    for seq in fasta:
        sl = len(seq[-1])
        if sl != seqlen:
            sysutils.log_message(
                "Sequence length is different for each haplotype.\n",
                quiet, logfile)
            equal_len = False
        else:
            pass
    if equal_len:
        print("PH_seq_len %s" % seqlen, file=summary_txt)

    for sub_list in fasta:
        if prefix is None:
            print('>sid|%s_%s|reg|%s| Freq=%s' %
                  (sub_list[0], sub_list[1], sub_list[0].split("_")[-1],
                   sub_list[2]), file=newseq_fa)
        else:
            print('>sid|%s_%s_%s|reg|%s| Freq=%s' %
                  (prefix, sub_list[0], sub_list[1],
                   sub_list[0].split("_")[-1], sub_list[2]), file=newseq_fa)
        if keep_gaps:
            print("%s" % (sub_list[-1]), file=newseq_fa)
        else:
            print("%s" % (sub_list[-1].replace('-', "")), file=newseq_fa)

    sysutils.log_message(
        "Summary and FASTA file completed for %s.\n" % haplotypes_fa,
        quiet, logfile)

    summary_txt.close()
    newseq_fa.close()
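
# Example invocation of ph_parser (a sketch; the input file name is
# hypothetical). Writes ph_summary.txt and ph_haplotypes.fna into outdir:
#
#     ph_parser(
#         haplotypes_fa='PH01.best_1_864.fas', outdir='sample01/PH01',
#         prefix='sample01', keep_gaps=False,
#     )
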
def sample_reads(
        fq1=None, fq2=None, fqU=None, outdir='.', nreads=None, frac=None,
        seed=None, quiet=False, logfile=None, debug=False,
):
    """ Pipeline step to subsample reads with seqtk

    Args:
        fq1 (str): Path to fastq file with read 1
        fq2 (str): Path to fastq file with read 2
        fqU (str): Path to fastq file with unpaired reads
        outdir (str): Path to output directory
        nreads (int): Number of reads to sample
        frac (float): Fraction of reads to sample
        seed (int): Seed for random number generator
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run

    Returns:
        out1 (str): Path to sampled fastq file with read 1
        out2 (str): Path to sampled fastq file with read 2
        outU (str): Path to sampled fastq file with unpaired reads
    """
    # Check inputs
    if fq1 is not None and fq2 is not None and fqU is None:
        input_reads = "paired"  # Paired end
    elif fq1 is None and fq2 is None and fqU is not None:
        input_reads = "single"  # Single end
    elif fq1 is not None and fq2 is not None and fqU is not None:
        input_reads = "both"
    else:
        msg = "incorrect input reads; requires either "
        msg += "(--fq1 AND --fq2) OR (--fqU) OR (--fq1 AND --fq2 AND --fqU)"
        raise MissingRequiredArgument(msg)

    # Check dependencies
    sysutils.check_dependency('seqtk')

    # Set seed
    seed = seed if seed is not None else random.randrange(1, 1000)
    sysutils.log_message('[--- sample_reads ---] Random seed = %d\n' % seed,
                         quiet, logfile)

    # Set nreads/frac
    if frac is not None:
        if frac <= 0 or frac > 1:
            raise sysutils.PipelineStepError('--frac must be > 0 and <= 1.')
        frac_arg = '%f' % frac
    else:
        frac_arg = '%d' % nreads

    cmds = None
    if input_reads == 'single':
        out1 = out2 = None
        outU = os.path.join(outdir, 'sample_U.fastq')
        cmds = [
            ['seqtk', 'sample', '-s%d' % seed, fqU, frac_arg, '>', outU, ],
        ]
    elif input_reads == 'paired':
        out1 = os.path.join(outdir, 'sample_1.fastq')
        out2 = os.path.join(outdir, 'sample_2.fastq')
        outU = None
        cmds = [
            ['seqtk', 'sample', '-s%d' % seed, fq1, frac_arg, '>', out1, ],
            ['seqtk', 'sample', '-s%d' % seed, fq2, frac_arg, '>', out2, ],
        ]
    elif input_reads == 'both':
        out1 = os.path.join(outdir, 'sample_1.fastq')
        out2 = os.path.join(outdir, 'sample_2.fastq')
        outU = os.path.join(outdir, 'sample_U.fastq')
        cmds = [
            ['seqtk', 'sample', '-s%d' % seed, fq1, frac_arg, '>', out1, ],
            ['seqtk', 'sample', '-s%d' % seed, fq2, frac_arg, '>', out2, ],
            ['seqtk', 'sample', '-s%d' % seed, fqU, frac_arg, '>', outU, ],
        ]

    sysutils.command_runner(cmds, 'sample_reads', quiet, logfile, debug)
    return out1, out2, outU
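
# Example invocation of sample_reads (a sketch; fastq paths are hypothetical).
# Note that frac takes precedence over nreads when both are given:
#
#     s1, s2, sU = sample_reads(
#         fq1='sample01_1.fastq', fq2='sample01_2.fastq',
#         outdir='sample01', nreads=100000, seed=1234,
#     )
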
def assemble_to_ref(qry_fa, ref_fa, outdir, pad_fh=None, quiet=False,
                    logfile=None, debug=False):
    """ Scaffold query sequences against reference(s) using nucmer

    Args:
        qry_fa (str): Path to fasta file with query sequences
        ref_fa (str): Path to reference fasta file
        outdir (str): Path to output directory
        pad_fh (file): File handle for padded alignment output
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run

    Returns:
        scaffolds (dict): Scaffold alignments keyed by reference ID
    """
    # Align query to reference
    fil, til = align_nucmer(qry_fa, ref_fa, outdir,
                            quiet=quiet, logfile=logfile, debug=debug)
    if debug:
        return None

    # Parse tiling rows
    tr_byref = defaultdict(list)
    for l in open(til, 'r'):
        tr = TilingRow(l)
        tr_byref[tr.ref].append(tr)

    # Load reference(s)
    refs = sorted(tr_byref.keys())
    ref_dict = {s.id: s for s in SeqIO.parse(ref_fa, 'fasta')}
    sysutils.log_message('\nReferences: %s\n' % ', '.join(refs),
                         quiet, logfile)

    scaffolds = {}
    for ref in refs:
        if pad_fh is not None:
            empty = EmptyReferenceAlignment(str(ref_dict[ref].seq).lower())
            print('%s%s' % (ref.ljust(40), empty.rseq().upper()), file=pad_fh)
        scaffolds[ref] = EmptyReferenceAlignment(
            str(ref_dict[ref].seq).lower())

        # Rank hits so that worst hit is in index 0 (best at the end)
        ranked = sorted(tr_byref[ref], key=lambda x: x.pid)
        ranked.sort(key=lambda x: x.qry_alen)

        for tr in ranked:
            out = show_aligns(tr.ref, tr.qry, fil)
            out = out.decode()
            # May be multiple alignments
            flag = False
            aln_reports = []
            for l in out.strip('\n').split('\n'):
                if re.match(r'^--\s+BEGIN', l):
                    aln_reports.append(list())
                    flag = True
                if flag:
                    aln_reports[-1].append(l)
                if re.match(r'^--\s+END', l):
                    flag = False

            for aln_report in aln_reports:
                nucaln = NucmerReferenceAlignment(aln_report)
                if pad_fh is not None:
                    pad = empty.merge_alignments(nucaln)
                    print('%s%s' % (tr.qry.ljust(40), pad.padded()),
                          file=pad_fh)
                scaffolds[ref] = scaffolds[ref].merge_alignments(nucaln)

    return scaffolds
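
# Example invocation of assemble_to_ref (a sketch; the paths are hypothetical
# and pad_fh, when given, is an open file handle for padded alignment output):
#
#     scaffolds = assemble_to_ref(
#         qry_fa='denovo_contigs.fna', ref_fa='reference.fasta',
#         outdir='tmp_scaffold',
#     )
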
def cliquesnv(fq1=None, fq2=None, fqU=None, ref_fa=None, outdir='.',
              jardir='.', O22min=None, O22minfreq=None, printlog=None,
              single=False, merging=None, fasta_format='extended4',
              outputstart=None, outputend=None, keep_tmp=False, quiet=False,
              logfile=None, debug=False, ncpu=1):
    """ Pipeline step to reconstruct haplotypes with CliqueSNV

    Requires clique-snv.jar in jardir; bwa and samtools must be on the path.
    """
    # Check if paired vs. single
    if fq1 is None and fq2 is None and fqU is not None:
        single = True

    # Check dependencies and required arguments
    if fq1 is None and fq2 is None and fqU is None:
        raise MissingRequiredArgument("No fastq files given.")
    if not single and (fq1 is None or fq2 is None):
        raise MissingRequiredArgument("Either fq1 or fq2 missing.")
    if ref_fa is None:
        raise MissingRequiredArgument("Reference FASTA missing.")

    sysutils.check_dependency('samtools')
    sysutils.check_dependency('bwa')

    if os.path.isfile(os.path.join(jardir, "clique-snv.jar")):
        print("CliqueSNV JAR file found.")
    else:
        raise MissingRequiredArgument("No JAR file found.")

    # Temporary directory
    tempdir = sysutils.create_tempdir('clique_snv', None, quiet, logfile)

    # Load reference fasta
    refs = {s.id: s for s in SeqIO.parse(ref_fa, 'fasta')}

    # Identify reconstruction regions
    regions = []
    for rname, s in refs.items():
        regions.append(('cs%02d' % (len(regions) + 1), rname, 1, len(s)))

    sysutils.log_message('[--- Haplotype Reconstruction Regions ---]\n',
                         quiet, logfile)
    for iv in regions:
        sysutils.log_message('%s -- %s:%d-%d\n' % iv, quiet, logfile)

    if not single:  # paired end
        # Remove .1 and .2 from read names
        fq1_c = os.path.join(tempdir, "fq1_corrected.fastq")
        fq2_c = os.path.join(tempdir, "fq2_corrected.fastq")
        cmd01 = [r"cat %s | sed 's/\.1 / /' > %s" % (fq1, fq1_c)]
        cmd02 = [r"cat %s | sed 's/\.2 / /' > %s" % (fq2, fq2_c)]
        sysutils.command_runner([cmd01, cmd02], 'clique_snv:setup',
                                quiet, logfile, debug)

        # Create alignment for each REFERENCE in the reconstruction regions
        alnmap = {}
        for cs, rname, spos, epos in regions:
            if rname not in alnmap:
                # Create alignment
                tmp_ref_fa = os.path.join(tempdir, 'ref.%d.fa' % len(alnmap))
                tmp_sam = os.path.join(tempdir, 'aligned.%d.sam' % len(alnmap))
                SeqIO.write(refs[rname], tmp_ref_fa, 'fasta')
                cmd1 = ['bwa', 'index', tmp_ref_fa, ]
                cmd2 = ['bwa', 'mem', tmp_ref_fa, fq1_c, fq2_c, '|',
                        'samtools', 'view', '-h', '-F', '12', '>', tmp_sam, ]
                cmd3 = ['rm', '-f', '%s.*' % tmp_ref_fa]
                sysutils.command_runner([cmd1, cmd2, cmd3], 'clique_snv:setup',
                                        quiet, logfile, debug)
                alnmap[rname] = (tmp_ref_fa, tmp_sam)
    else:  # single read
        # Create alignment for each REFERENCE in the reconstruction regions
        alnmap = {}
        for cs, rname, spos, epos in regions:
            if rname not in alnmap:
                # Create alignment
                tmp_ref_fa = os.path.join(tempdir, 'ref.%d.fa' % len(alnmap))
                tmp_sam = os.path.join(tempdir, 'aligned.%d.sam' % len(alnmap))
                SeqIO.write(refs[rname], tmp_ref_fa, 'fasta')
                cmd1 = ['bwa', 'index', tmp_ref_fa, ]
                cmd2 = ['bwa', 'mem', tmp_ref_fa, fqU, '|',
                        'samtools', 'view', '-h', '-F', '12', '>', tmp_sam, ]
                cmd3 = ['rm', '-f', '%s.*' % tmp_ref_fa]
                sysutils.command_runner([cmd1, cmd2, cmd3], 'clique_snv:setup',
                                        quiet, logfile, debug)
                alnmap[rname] = (tmp_ref_fa, tmp_sam)

    # Run CliqueSNV for each region
    cmd4 = ['mkdir -p %s' % os.path.join(outdir, 'clique_snv')]
    sysutils.command_runner([cmd4, ], stage='cliquesnv', quiet=quiet,
                            logfile=logfile, debug=debug)

    i = 0  # index for filenames
    for cs, rname, spos, epos in regions:
        msg = "Reconstruction region %s:" % cs
        msg += " %s:%d-%d\n" % (rname, spos, epos)
        sysutils.log_message(msg, quiet, logfile)

        # Rename the cliquesnv number (cs##) to include region (now: cs##_reg)
        cs = '%s_%s' % (cs, rname.split('|')[-2])
        samfile = os.path.join(tempdir, 'aligned.%d.sam' % i)
        method = 'snv-illumina'
        cmd5 = [
            'java -jar %s -m %s -in %s -threads %d -outDir %s -fdf %s' %
            (os.path.join(jardir, 'clique-snv.jar'), method, samfile, ncpu,
             tempdir, fasta_format)
        ]
        if O22min is not None:
            cmd5 += ['-t %f' % O22min]
        if O22minfreq is not None:
            cmd5 += ['-tf %f' % O22minfreq]
        if printlog is not None:
            cmd5 += ['-log']
        if merging is not None:
            cmd5 += ['-cm %s' % merging]
        if outputstart is not None:
            cmd5 += ['-os %d' % outputstart]
        if outputend is not None:
            cmd5 += ['-oe %d' % outputend]
        sysutils.command_runner([cmd5, ], stage='clique_snv', quiet=quiet,
                                logfile=logfile, debug=debug)

        # Copy output file and delete tempdir
        outname1 = 'aligned.%d.txt' % i
        outname2 = 'aligned.%d.fasta' % i
        os.makedirs(os.path.join(outdir, 'clique_snv/%s' % cs), exist_ok=True)
        if os.path.exists(os.path.join(tempdir, '%s' % outname1)):
            shutil.copy(
                os.path.join(tempdir, '%s' % outname1),
                os.path.join(outdir, 'clique_snv/%s/%s.txt' % (cs, cs)))
        if os.path.exists(os.path.join(tempdir, '%s' % outname2)):
            shutil.copy(
                os.path.join(tempdir, '%s' % outname2),
                os.path.join(outdir, 'clique_snv/%s/%s.fasta' % (cs, cs)))

        # Parse output file
        with open(os.path.join(outdir,
                               'clique_snv/%s/%s_summary.txt' % (cs, cs)),
                  'w') as sumfile, \
             open(os.path.join(outdir, 'clique_snv/%s/%s.txt' % (cs, cs)),
                  'r') as infile:
            l = infile.readlines()
            freqs = []
            haps = []
            tempnum = ''
            for line in l:
                if "SNV got" in line:
                    tempnum = line.split(' ')[2]
                if "frequency" in line:
                    freqs += [float(line.split(' ')[2][:-2])]
                if "haplotype=" in line:
                    haps += [line.split('=')[1][1:-2]]
            sumfile.write('CliqueSNV_num_hap\t%s\n' % tempnum)
            freq_sqrd = [x ** 2 for x in freqs]
            freq_sqrd_sum = sum(freq_sqrd)
            hap_div = ((old_div(7000, (7000 - 1))) * (1 - freq_sqrd_sum))
            sumfile.write('CliqueSNV_hap_diversity\t%s\n' % hap_div)
            sumfile.write('CliqueSNV_seq_len\t%s\n' % len(haps[0]))

        with open(os.path.join(outdir, 'clique_snv/%s/%s.fasta' % (cs, cs)),
                  'r') as fastafile:
            fastadata = fastafile.read().replace('aligned.%d' % i, rname)
        with open(os.path.join(outdir, 'clique_snv/%s/%s.fasta' % (cs, cs)),
                  'w') as newfastafile:
            newfastafile.write(fastadata)

        i += 1

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'clique_snv', quiet, logfile)

    return
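
# Example invocation of cliquesnv (a sketch; paths are hypothetical, and jardir
# must contain clique-snv.jar):
#
#     cliquesnv(
#         fq1='sample01_1.fastq', fq2='sample01_2.fastq',
#         ref_fa='final.fna', outdir='sample01',
#         jardir='/opt/cliquesnv', ncpu=4,
#     )
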
def progressive_refine_assembly(
        fq1=None, fq2=None, fqU=None, ref_fa=None, outdir='.',
        max_step=None, subsample=None, seed=None, sample_id='sampleXX',
        ncpu=1, xmx=sysutils.get_java_heap_size(),
        keep_tmp=False, quiet=False, logfile=None, debug=False,
):
    """ Pipeline step to iteratively refine an assembly

    Repeats refine_assembly_step (up to max_step iterations) until no
    differences from the previous assembly are found or, when not
    subsampling, until the overall alignment rate stops improving.
    """
    # Outputs
    out_refined = os.path.join(outdir, 'refined.fna')
    out_bt2 = os.path.join(outdir, 'refined_bt2.out')
    out_summary = os.path.join(outdir, 'refined_summary.out')

    # --- Initialize
    cur_asm = ref_fa
    cur_alnrate = None
    assemblies = [OrderedDict(), ]
    for s in SeqIO.parse(cur_asm, 'fasta'):
        assemblies[-1][s.id] = s

    # Message log for summary
    summary = [
        ['iteration', 'alnrate', 'diffs'] +
        ['diff:%s' % s for s in assemblies[0].keys()]
    ]

    # Seed random number generator
    random.seed(seed)

    for i in range(1, max_step + 1):
        # Generate a refined assembly
        tmp_refined, tmp_bt2 = refine_assembly_step(
            fq1=fq1, fq2=fq2, fqU=fqU, ref_fa=cur_asm, outdir=outdir,
            iteration=i, subsample=subsample, sample_id=sample_id,
            ncpu=ncpu, xmx=xmx, keep_tmp=keep_tmp, quiet=True,
            logfile=logfile, debug=debug)

        # Check whether alignments are different
        diffs = OrderedDict()
        new_seqs = OrderedDict((s.id, s)
                               for s in SeqIO.parse(tmp_refined, 'fasta'))
        for id1, seq1 in new_seqs.items():
            poss0 = [k for k in assemblies[-1].keys()
                     if sequtils.seqid_match(id1, k)]
            if len(poss0) == 1:
                seq0 = assemblies[-1][poss0[0]]
            else:
                raise PipelineStepError("Could not match sequence %s" % id1)
            alns = pairwise2.align.globalms(seq1.seq, seq0.seq, 2, -1, -3, -1)
            d = min(sum(nc != cc for nc, cc in zip(t[0], t[1])) for t in alns)
            diffs[id1] = d

        total_diffs = sum(diffs.values())

        # Check new alignment rate
        with open(tmp_bt2, 'r') as fh:
            bt2str = fh.read()
            m = re.search(r'(\d+\.\d+)\% overall alignment rate', bt2str)
            if m is None:
                msg = "Alignment rate not found in bowtie2 output."
                msg += "Output file contents:\n%s\n" % bt2str
                msg += "Aborting."
                raise PipelineStepError(msg)
            else:
                new_alnrate = float(m.group(1))

        # Create messages for log
        row = [str(i), '%.02f' % new_alnrate, '%d' % total_diffs, ]
        for k0 in assemblies[0].keys():
            poss1 = [k for k in diffs.keys() if sequtils.seqid_match(k, k0)]
            if len(poss1) == 0:
                row.append('FAIL')
            elif len(poss1) == 1:
                row.append(str(diffs[poss1[0]]))
            else:
                raise PipelineStepError("Multiple matches for %s" % k0)
        summary.append(row)

        # Create messages for console
        sysutils.log_message('\nRefinement result:\n', quiet, logfile)
        sysutils.log_message('\tDifferences:\n', quiet, logfile)
        for s, d in diffs.items():
            sysutils.log_message('\t\t%s\t%d\n' % (s, d), quiet, logfile)
        if total_diffs > 0:
            msg = '\t%d differences found with previous\n' % total_diffs
        else:
            msg = '\tNo differences with previous\n'
        sysutils.log_message(msg, quiet, logfile)

        if cur_alnrate is None:
            msg = '\tAlignment rate: %0.2f\n' % new_alnrate
        elif new_alnrate > cur_alnrate:
            msg = '\tAlignment rate has improved: '
            msg += '%.02f > %.02f\n' % (new_alnrate, cur_alnrate)
        else:
            msg = '\tAlignment rate has not improved: '
            msg += '%.02f <= %.02f\n' % (new_alnrate, cur_alnrate)
        sysutils.log_message(msg, quiet, logfile)

        # Decide whether to keep going
        keep_going = True
        if total_diffs == 0:
            keep_going = False
            sysutils.log_message('Stopping: no differences found\n',
                                 quiet, logfile)

        # We should also quit if the alignment rate does not improve.
        # However, subsampling reads can lead to changes in alignment rate
        # that can be ignored. When subsampling is implemented, the first
        # boolean value should check whether subsampling is enabled.
        if subsample is None:  # not subsampling
            if cur_alnrate is not None and new_alnrate <= cur_alnrate:
                keep_going = False
                msg = 'Stopping: alignment rate did not improve\n'
                sysutils.log_message(msg, quiet, logfile)

        cur_asm = tmp_refined
        cur_alnrate = new_alnrate
        assemblies.append(new_seqs)

        if not keep_going:
            break

    # Final outputs
    shutil.copy(cur_asm, out_refined)
    shutil.copy(tmp_bt2, out_bt2)

    with open(out_summary, 'w') as outh:
        print('\n'.join('\t'.join(r) for r in summary), file=outh)

    return out_refined, out_bt2, out_summary
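
# Example invocation of progressive_refine_assembly (a sketch; paths are
# hypothetical):
#
#     refined, bt2_out, summary = progressive_refine_assembly(
#         fq1='sample01_1.fastq', fq2='sample01_2.fastq',
#         ref_fa='amplicon_assembly.fna', outdir='sample01',
#         max_step=5, sample_id='sample01', ncpu=4,
#     )
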
def predict_haplo(
        fq1=None, fq2=None, ref_fa=None, region_txt=None, outdir='.',
        min_readlength=36, keep_tmp=False, quiet=False, logfile=None,
        debug=False,
):
    """ Pipeline step to assemble haplotypes

    Args:
        fq1 (str): Path to fastq file with read 1
        fq2 (str): Path to fastq file with read 2
        ref_fa (str): Path to reference fasta file
        region_txt (str): Path to region file
        outdir (str): Path to output directory
        min_readlength (int): Minimum readlength passed to PredictHaplo
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run

    Returns:
        best_fa (list): (region, path) tuples for best haplotype files (FASTA)
    """
    # Check dependencies
    sysutils.check_dependency('PredictHaplo-Paired')
    sysutils.check_dependency('bwa')

    # Temporary directory
    tempdir = sysutils.create_tempdir('predict_haplo', None, quiet, logfile)

    # Load reference fasta
    refs = {s.id: s for s in SeqIO.parse(ref_fa, 'fasta')}

    # Identify reconstruction regions
    regions = []
    if region_txt:
        sysutils.log_message('Found regions file.\n', quiet, logfile)
        for l in open(region_txt, 'r'):
            rname, spos, epos = sequtils.region_to_tuple(l.strip())
            if rname not in refs:
                raise PipelineStepError("ERROR: reference %s not valid" % rname)
            spos = 1 if spos is None else spos
            epos = len(refs[rname]) if epos is None else epos
            regions.append(('PH%02d' % (len(regions) + 1), rname, spos, epos))
    else:
        for rname, s in refs.items():
            regions.append(('PH%02d' % (len(regions) + 1), rname, 1, len(s)))

    sysutils.log_message('[--- Haplotype Reconstruction Regions ---]\n',
                         quiet, logfile)
    for iv in regions:
        sysutils.log_message('%s -- %s:%d-%d\n' % iv, quiet, logfile)

    # Create alignment for each REFERENCE in the reconstruction regions
    alnmap = {}
    for ph, rname, spos, epos in regions:
        if rname not in alnmap:
            # Create alignment
            tmp_ref_fa = os.path.join(tempdir, 'ref.%d.fa' % len(alnmap))
            tmp_sam = os.path.join(tempdir, 'aligned.%d.sam' % len(alnmap))
            SeqIO.write(refs[rname], tmp_ref_fa, 'fasta')
            cmd1 = ['bwa', 'index', tmp_ref_fa, ]
            cmd2 = ['bwa', 'mem', tmp_ref_fa, fq1, fq2, '|',
                    'samtools', 'view', '-h', '-F', '12', '>', tmp_sam, ]
            cmd3 = ['rm', '-f', '%s.*' % tmp_ref_fa]
            sysutils.command_runner([cmd1, cmd2, cmd3], 'predict_haplo:setup',
                                    quiet, logfile, debug)
            alnmap[rname] = (tmp_ref_fa, tmp_sam)

    best_fa = []
    # Run PredictHaplo for each REGION
    for ph, rname, spos, epos in regions:
        msg = "Reconstruction region %s:" % ph
        msg += " %s:%d-%d\n" % (rname, spos, epos)
        sysutils.log_message(msg, quiet, logfile)

        # Construct params specific for region
        reg_params = dict(DEFAULTS)
        reg_params['min_readlength'] = min_readlength
        reg_params['reconstruction_start'] = spos
        reg_params['reconstruction_stop'] = epos
        reg_params['prefix'] = '%s_out.' % ph

        # Lookup reference and alignment filename
        reg_params['ref_fasta'] = os.path.basename(alnmap[rname][0])
        reg_params['alignment'] = os.path.basename(alnmap[rname][1])

        # Create config file for region
        config_file = '%s.config' % ph
        with open(os.path.join(tempdir, config_file), 'w') as outh:
            tmpconfig = config_template % reg_params
            print(tmpconfig.replace('###', '%'), file=outh)

        try:
            # Run PredictHaplo
            cmd1 = ['cd', tempdir, ]
            cmd2 = ['PredictHaplo-Paired', config_file,
                    '&>', '%s.log' % config_file]
            sysutils.command_runner([cmd1, cmd2, ], 'predict_haplo:%s' % ph,
                                    quiet, logfile, debug)

            # Copy files
            dest = os.path.join(outdir, ph)
            if not os.path.exists(dest):
                os.makedirs(dest)
            shutil.copy(os.path.join(tempdir, '%s.config.log' % ph), dest)
            for f in glob(os.path.join(tempdir, '%s_out*global*.fas' % ph)):
                shutil.copy(f, dest)
            for f in glob(os.path.join(tempdir, '%s_out*global*.html' % ph)):
                shutil.copy(f, dest)
            bf, bh = rename_best(dest, ph)
            best_fa.append((ph, bf))
        except PipelineStepError as e:
            print(e, file=sys.stderr)
            if e.returncode == 139:
                print("PredictHaplo segfaulted", file=sys.stderr)
            best_fa.append((ph, None))

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'predict_haplo', quiet, logfile)

    return best_fa
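
# Example invocation of predict_haplo (a sketch; paths are hypothetical).
# Returns a list of (region, path) tuples, with None for regions where
# PredictHaplo failed:
#
#     best = predict_haplo(
#         fq1='sample01_1.fastq', fq2='sample01_2.fastq',
#         ref_fa='final.fna', outdir='sample01',
#     )
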