def build_tree_NG(seqs=None, in_type='FASTA', output_name='hp_tree',
                  outdir='.', treedir='hp_tree', model='GTR', bs_trees=None,
                  outgroup=None, branch_length=None, consense=None,
                  rand_tree=None, pars_tree=None, user_tree=None, search=None,
                  search_1random=None, all=None, constraint_tree=None,
                  bsconverge=None, bs_msa=None, bs_tree_cutoff=None,
                  bs_metric=None, bootstrap=None, check=None, log=None,
                  loglh=None, redo=None, terrace=None, seed=12345,
                  version=None, quiet=False, logfile=None, debug=False,
                  ncpu=1, keep_tmp=False):
    """ Pipeline step to build a tree with raxml-ng

    The alignment is first passed through ``check_name_compatibility`` (which
    writes ``seqs_fixednames.fasta``/``.phy`` into the output directory),
    raxml-ng is run inside a temporary directory, and any recognised raxml-ng
    output files are copied into ``<outdir>/<treedir>``.

    Args:
        seqs (str): Path to input alignment
        in_type (str): Alignment format, 'FASTA' or 'PHYLIP'. Any other value
            skips the name fix and no ``--msa`` argument is passed.
        output_name (str): Prefix for raxml-ng output files
        outdir (str): Path to output directory
        treedir (str): Name of subdirectory (of outdir) for tree output
        model (str): Value for ``--model``
        bs_trees: Value for ``--bs-trees``
        outgroup (str): Value for ``--outgroup``
        branch_length (str): Value for ``--brlen``
        consense (str): Value for ``--consense``
        rand_tree (int): Number of random starting trees (``--tree rand{N}``)
        pars_tree (int): Number of parsimony starting trees
            (``--tree pars{N}``)
        user_tree (str): Path to user starting tree (``--tree``)
        search (bool): Add ``--search``
        search_1random (bool): Add ``--search1``
        all (bool): Add ``--all``
        constraint_tree (str): Path to tree for ``--tree-constraint``
        bsconverge (bool): Add ``--bsconverge``
        bs_msa (bool): Add ``--bsmsa``
        bs_tree_cutoff (float): Value for ``--bs-cutoff``
        bs_metric (str): Value for ``--bs-metric``
        bootstrap (bool): Add ``--bootstrap``
        check (bool): Add ``--check``
        log (str): Value for ``--log``
        loglh (bool): Add ``--loglh``
        redo (bool): Add ``--redo``
        terrace (bool): Add ``--terrace``
        seed (int): Value for ``--seed``
        version (bool): Print raxml-ng version and return
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run
        ncpu (int): Value for ``--threads``
        keep_tmp (bool): Do not delete temporary directory

    Returns:
        None

    Raises:
        sysutils.PipelineStepError: If no alignment is provided.
    """
    # Local import: only needed for collecting bootstrap MSA replicates.
    import glob

    sysutils.check_dependency('raxml-ng')

    if version is True:
        cmd2 = ['raxml-ng', '-v']
        sysutils.command_runner([cmd2], 'build_tree_NG', quiet, logfile, debug)
        return

    if seqs is None:
        msg = 'No alignment provided.'
        raise sysutils.PipelineStepError(msg)

    # Set Output Directory
    output_dir = os.path.join(outdir, treedir)
    cmd0 = ['mkdir -p %s' % output_dir]
    sysutils.command_runner([cmd0], 'build_tree_NG', quiet, logfile, debug)

    # fix seq names; the renamed alignment is what raxml-ng actually reads
    fixed_msa = None
    if in_type == 'FASTA':
        fixed_msa = os.path.join(output_dir, 'seqs_fixednames.fasta')
    elif in_type == 'PHYLIP':
        fixed_msa = os.path.join(output_dir, 'seqs_fixednames.phy')
    if fixed_msa is not None:
        check_name_compatibility(seqs, fixed_msa, in_type)

    # Create temporary directory
    tempdir = sysutils.create_tempdir('build_tree_NG', None, quiet, logfile)

    # start raxml command
    cmd1 = [
        'raxml-ng',
        '--prefix %s/%s' % (os.path.abspath(tempdir), output_name),
        '--threads %d' % ncpu,
        '--seed %d' % seed,
        '--model %s' % model,
    ]
    if fixed_msa is not None:
        cmd1 += ['--msa', '%s' % fixed_msa]
    if branch_length is not None:
        cmd1 += ['--brlen', '%s' % branch_length]
    if consense is not None:
        cmd1 += ['--consense', '%s' % consense]

    # Starting trees: parsimony, random, or a combination of both
    if pars_tree is not None and rand_tree is None:
        cmd1 += ['--tree pars{%d}' % pars_tree]
    if pars_tree is None and rand_tree is not None:
        cmd1 += ['--tree rand{%d}' % rand_tree]
    if pars_tree is not None and rand_tree is not None:
        cmd1 += ['--tree pars{%d},rand{%d}' % (pars_tree, rand_tree)]
    if user_tree is not None:
        cmd1 += ['--tree', '%s' % os.path.abspath(user_tree)]

    if search is True:
        cmd1 += ['--search']
    if search_1random is True:
        cmd1 += ['--search1']
    if all is True:
        cmd1 += ['--all']
    if constraint_tree is not None:
        cmd1 += ['--tree-constraint', '%s' % os.path.abspath(constraint_tree)]
    if outgroup is not None:
        cmd1 += ['--outgroup', '%s' % outgroup]
    if bsconverge is True:
        cmd1 += ['--bsconverge']
    if bs_msa is True:
        cmd1 += ['--bsmsa']
    if bs_trees is not None:
        cmd1 += ['--bs-trees %s' % bs_trees]
    if bs_tree_cutoff is not None:
        cmd1 += ['--bs-cutoff', '%f' % bs_tree_cutoff]
    if bs_metric is not None:
        cmd1 += ['--bs-metric', '%s' % bs_metric]
    if bootstrap is True:
        cmd1 += ['--bootstrap']
    if check is True:
        cmd1 += ['--check']
    if log is not None:
        cmd1 += ['--log', '%s' % log]
    if loglh is True:
        cmd1 += ['--loglh']
    if terrace is True:
        cmd1 += ['--terrace']
    # Treated like the other switches: redo=False must not enable --redo.
    if redo is True:
        cmd1 += ['--redo']

    sysutils.command_runner([cmd1, ], 'build_tree_NG', quiet, logfile, debug)

    # copy files from tmpdir to output directory (only those that exist)
    dest_dir = os.path.abspath(output_dir)
    output_suffixes = [
        'bestTree', 'bestPartitionTrees', 'bestModel', 'bootstraps', 'ckp',
        'consensusTree', 'log', 'mlTrees', 'startTree', 'support', 'terrace',
        'terraceNewick',
    ]
    for suffix in output_suffixes:
        produced = os.path.join(
            tempdir, '%s.raxml.%s' % (output_name, suffix)
        )
        if os.path.exists(produced):
            shutil.copy(produced, dest_dir)

    # Bootstrap MSA files are numbered per replicate, so match them by
    # pattern instead of looking for a file literally named "<REP>".
    bsmsa_pattern = os.path.join(
        tempdir, '%s.raxml.bootstrapMSA.*.phy' % output_name
    )
    for produced in sorted(glob.glob(bsmsa_pattern)):
        shutil.copy(produced, dest_dir)

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'build_tree_NG', quiet, logfile)

    cmd6 = [
        'echo',
        'Stage completed. Output files are located here: %s\n' % dest_dir
    ]
    sysutils.command_runner([cmd6, ], 'build_tree_NG', quiet, logfile, debug)
def call_variants(
        aln_bam=None, ref_fa=None, outdir='.',
        emit_all=False, min_base_qual=15,
        ncpu=1, xmx=sysutils.get_java_heap_size(),
        keep_tmp=False, quiet=False, logfile=None, debug=False,
    ):
    """ Pipeline step to call variants with GATK UnifiedGenotyper

    The reference is copied into a temporary directory and indexed there
    (samtools faidx + picard CreateSequenceDictionary) before genotyping, and
    the calls are written to ``variants.vcf.gz`` in the output directory.

    Args:
        aln_bam (str): Path to alignment file (BAM)
        ref_fa (str): Path to reference fasta file
        outdir (str): Path to output directory
        emit_all (bool): Output calls for all sites
        min_base_qual (int): Minimum base quality for calling
        ncpu (int): Number of CPUs to use
        xmx (int): Maximum heap size for JVM in GB
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run

    Returns:
        out_vcf (str): Path to output VCF

    """
    stage = 'call_variants:GATK'

    # Required tools
    for tool in ('samtools', 'picard'):
        sysutils.check_dependency(tool)

    # GATK may be installed under either name
    gatk_exe = sysutils.determine_dependency_path(['gatk', 'gatk3'])

    # Heap limit is handed to the JVM through the environment
    heap_env = '_JAVA_OPTIONS="-Xmx%dg"' % xmx

    out_vcf = os.path.join(outdir, 'variants.vcf.gz')

    tempdir = sysutils.create_tempdir('call_variants', None, quiet, logfile)

    # Working copy of the reference plus its index and sequence dictionary
    curref = os.path.join(tempdir, 'initial.fasta')
    refdict = os.path.join(tempdir, 'initial.dict')
    copy_ref = ['cp', ref_fa, curref]
    index_ref = ['samtools', 'faidx', curref]
    make_dict = [
        'picard', 'CreateSequenceDictionary',
        'R=%s' % curref,
        'O=%s' % refdict,
    ]

    # UnifiedGenotyper
    genotype = [heap_env, gatk_exe, '-T', 'UnifiedGenotyper']
    genotype.extend(['--use_jdk_deflater', '--use_jdk_inflater'])
    genotype.extend(['--num_threads', '%d' % ncpu])
    genotype.extend(['-gt_mode', 'DISCOVERY'])
    genotype.extend(['-glm', 'BOTH'])
    genotype.extend(['--baq', 'OFF'])
    genotype.append('--useOriginalQualities')
    genotype.extend(['-dt', 'NONE'])
    genotype.extend(['--min_base_quality_score', '%d' % min_base_qual])
    genotype.extend(['-ploidy', '4'])
    genotype.extend(['-I', aln_bam])
    genotype.extend(['-R', curref])
    genotype.extend(['-o', out_vcf])
    if emit_all:
        genotype.extend(['-out_mode', 'EMIT_ALL_SITES'])

    pipeline = [copy_ref, index_ref, make_dict, genotype]
    sysutils.command_runner(pipeline, stage, quiet, logfile, debug)

    if keep_tmp:
        return out_vcf

    sysutils.remove_tempdir(tempdir, stage, quiet, logfile)
    return out_vcf
def ec_reads(
        fq1=None, fq2=None, fqU=None, outdir='.',
        ncpu=1, keep_tmp=False,
        quiet=False, logfile=None, debug=False,
    ):
    """ Pipeline step to error-correct reads using spades

    spades is run with ``--only-error-correction`` in a temporary directory;
    the corrected (gzipped) read files listed in its ``corrected.yaml`` are
    then decompressed and concatenated into the output fastq files.

    Args:
        fq1 (str): Path to fastq file with read 1
        fq2 (str): Path to fastq file with read 2
        fqU (str): Path to fastq file with unpaired reads
        outdir (str): Path to output directory
        ncpu (int): Number of CPUs to use
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run

    Returns:
        out1 (str): Path to corrected fastq file with read 1
        out2 (str): Path to corrected fastq file with read 2
        outU (str): Path to corrected fastq file with unpaired reads

    Raises:
        MissingRequiredArgument: If the combination of input reads is not
            paired, single, or both.
        sysutils.PipelineStepError: If spades did not produce the expected
            ``corrected/corrected.yaml`` file.

    """
    # Check inputs
    if fq1 is not None and fq2 is not None and fqU is None:
        input_reads = "paired"  # Paired end
    elif fq1 is None and fq2 is None and fqU is not None:
        input_reads = "single"  # Single end
    elif fq1 is not None and fq2 is not None and fqU is not None:
        input_reads = "both"
    else:
        msg = "incorrect input reads; requires either "
        msg += "(--fq1 AND --fq2) OR (--fqU) OR (--fq1 AND --fq2 AND --fqU)"
        raise MissingRequiredArgument(msg)

    # Check dependencies
    sysutils.check_dependency('spades.py')

    # Outputs
    out1 = os.path.join(outdir, 'corrected_1.fastq')
    out2 = os.path.join(outdir, 'corrected_2.fastq')
    outU = os.path.join(outdir, 'corrected_U.fastq')

    # Temporary directory
    tempdir = sysutils.create_tempdir('ec_reads', None, quiet, logfile)

    # spades command
    cmd1 = [
        'spades.py',
        '-o', tempdir,
        '-t', '%d' % ncpu,
        '--only-error-correction',
    ]
    if input_reads in ['paired', 'both', ]:
        cmd1 += [
            '-1', os.path.abspath(fq1),
            '-2', os.path.abspath(fq2),
        ]
    if input_reads in ['single', 'both', ]:
        cmd1 += [
            '-s', os.path.abspath(fqU),
        ]

    sysutils.command_runner([cmd1, ], 'ec_reads', quiet, logfile, debug)

    # Copy files
    yaml_file = os.path.join(tempdir, 'corrected/corrected.yaml')
    if not os.path.exists(yaml_file):
        # The exception must actually be raised; otherwise the failure only
        # surfaces as an unrelated error from open() below.
        raise sysutils.PipelineStepError(
            "YAML file %s not found" % yaml_file
        )

    # Plain 'r': the 'U' mode flag was removed in Python 3.11.
    with open(yaml_file, 'r') as fh:
        d = yaml.load(fh, Loader=yaml.FullLoader)[0]

    # One gunzip per read category; the trailing '>' redirect relies on the
    # command runner passing the joined command through a shell.
    cmds = []
    if 'left reads' in d:
        cmds.append(
            ['gunzip', '-c', ] + sorted(d['left reads']) + ['>', out1]
        )
    if 'right reads' in d:
        cmds.append(
            ['gunzip', '-c', ] + sorted(d['right reads']) + ['>', out2]
        )
    if 'single reads' in d:
        cmds.append(
            ['gunzip', '-c', ] + sorted(d['single reads']) + ['>', outU]
        )

    sysutils.command_runner(cmds, 'ec_reads', quiet, logfile, debug)

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'ec_reads', quiet, logfile)

    return out1, out2, outU
def assemble_denovo_trinity(fq1=None,
                            fq2=None,
                            fqU=None,
                            outdir='.',
                            min_contig_length=200,
                            subsample=None,
                            seed=None,
                            ncpu=1,
                            keep_tmp=False,
                            quiet=False,
                            logfile=None,
                            debug=False,
                            **kwargs):
    """ Pipeline step to assemble reads using Trinity (denovo)

    Args:
        fq1 (str): Path to fastq file with read 1
        fq2 (str): Path to fastq file with read 2
        fqU (str): Path to fastq file with unpaired reads
        outdir (str): Path to output directory
        min_contig_length (int): minimum assembled contig length to report
        subsample (int): use a subsample of reads for assembly
            (accepted but not used by this function)
        seed (int): Seed for random number generator
            (accepted but not used by this function)
        ncpu (int): Number of CPUs to use
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run
        **kwargs: Not used.

    Returns:
        out1 (str): Path to assembled contigs file (fasta format)

    Raises:
        MissingRequiredArgument: If the combination of input reads is not
            paired, single, or both.

    """
    # Check inputs
    if fq1 is not None and fq2 is not None and fqU is None:
        input_reads = "paired"  # Paired end
    elif fq1 is None and fq2 is None and fqU is not None:
        input_reads = "single"  # Single end
    elif fq1 is not None and fq2 is not None and fqU is not None:
        input_reads = "both"
    else:
        msg = "incorrect input reads; requires either "
        msg += "(--fq1 AND --fq2) OR (--fqU) OR (--fq1 AND --fq2 AND --fqU)"
        raise MissingRequiredArgument(msg)

    # Check dependencies
    sysutils.check_dependency('Trinity')

    # Outputs
    out1 = os.path.join(outdir, 'contigs.fa')

    # Temporary directory
    tempdir = sysutils.create_tempdir('assemble_trinity', None, quiet, logfile)

    # Trinity command
    cmd1 = [
        'Trinity',
        '--min_contig_length', '%d' % min_contig_length,
        '--CPU', '%d' % ncpu,
        '--seqType', 'fq',
        '--output', tempdir,
    ]
    # NOTE: because of the elif, "both" input only passes the paired reads;
    # fqU is not given to Trinity in that case.
    if input_reads in ['paired', 'both', ]:
        cmd1 += [
            '--left', os.path.abspath(fq1),
            '--right', os.path.abspath(fq2),
        ]
    elif input_reads in ['single', 'both', ]:
        cmd1 += [
            '--single', os.path.abspath(fqU),
        ]

    # Copy command
    cmd2 = [
        'cp',
        os.path.join(tempdir, 'Trinity.fasta'),
        out1,
    ]

    sysutils.command_runner(
        [cmd1, cmd2, ], 'assemble_trinity', quiet, logfile, debug
    )

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'assemble_trinity', quiet, logfile)

    # Summarize the assembly. Both handles are context-managed so the input
    # is closed, and plain 'r' is used because 'U' was removed in Python 3.11.
    if os.path.isfile(out1):
        summary_path = os.path.join(outdir, 'assembly_summary.txt')
        with open(out1, 'r') as inh, open(summary_path, 'w') as outh:
            sequtils.assembly_stats(inh, outh)

    return out1
def assemble_scaffold(
        contigs_fa=None, ref_fa=None, outdir='.', seqname='sample01',
        keep_tmp=False, quiet=False, logfile=None, debug=False
    ):
    """ Pipeline step to assemble contigs to reference scaffold

    Args:
        contigs_fa (str): Path to fasta file with assembled contigs
        ref_fa (str): Path to reference fasta file
        outdir (str): Path to output directory
        seqname (str): Name to append to scaffold sequence
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run

    Returns:
        out_scaffold (str): Path to scaffold FASTA. Reference positions that
                            were not covered have 'n'
        out_imputed (str):  Path to imputed FASTA. Reference positions that
                            were not covered have reference base.
        out_aln (str):      Path to FASTA alignment between scaffold and
                            reference.
        out_padded (str):   Path to output with all contigs aligned to
                            reference.
    """
    stage = 'assemble_scaffold'

    # Required MUMmer tools
    for tool in ('nucmer', 'delta-filter', 'show-tiling'):
        sysutils.check_dependency(tool)

    # Output paths
    out_scaffold = os.path.join(outdir, 'scaffold_assembly.fa')
    out_imputed = os.path.join(outdir, 'scaffold_imputed.fa')
    out_aln = os.path.join(outdir, 'scaffold_aligned.fa')
    out_padded = os.path.join(outdir, 'scaffold_padded.out')

    tempdir = sysutils.create_tempdir(stage, None, quiet, logfile)

    # Contigs with sequence IDs only (descriptions removed)
    cleaned_contigs = sequtils.clean_seqnames_file(contigs_fa, tempdir)

    with open(out_padded, 'w') as pad_fh:
        scaffolds = alignutils.assemble_to_ref(
            cleaned_contigs, ref_fa, tempdir,
            pad_fh=pad_fh, quiet=quiet, logfile=logfile, debug=debug
        )

    def _record_name(ref_id):
        # First dot-delimited field of the reference ID, tagged with seqname
        return '%s.%s' % (ref_id.split('.')[0], seqname)

    # Scaffold sequences
    with open(out_scaffold, 'w') as scaffold_fh:
        for ref_id in sorted(scaffolds.keys()):
            header = _record_name(ref_id)
            sequence = scaffolds[ref_id].scaffold()
            print('>%s\n%s' % (header, sequtils.wrap(sequence)),
                  file=scaffold_fh)

    # Imputed sequences
    with open(out_imputed, 'w') as imputed_fh:
        for ref_id in sorted(scaffolds.keys()):
            header = _record_name(ref_id)
            sequence = scaffolds[ref_id].imputed()
            print('>%s\n%s' % (header, sequtils.wrap(sequence)),
                  file=imputed_fh)

    # Reference/scaffold alignment pairs, consumed by other pipeline stages
    with open(out_aln, 'w') as aln_fh:
        for ref_id in sorted(scaffolds.keys()):
            header = _record_name(ref_id)
            entry = scaffolds[ref_id]
            print('>REF|%s\n%s' % (header, entry.raln()), file=aln_fh)
            print('>%s\n%s' % (header, entry.qaln()), file=aln_fh)

    if keep_tmp is False or not keep_tmp:
        sysutils.remove_tempdir(tempdir, stage, quiet, logfile)

    return out_scaffold, out_imputed, out_aln, out_padded
def trim_reads(
        fq1=None, fq2=None, fqU=None, outdir=".",
        adapter_file=None, trimmers=TRIMMERS, encoding=None,
        ncpu=1, quiet=False, logfile=None, debug=False,
    ):
    """ Pipeline step to trim reads

    Args:
        fq1 (str): Path to fastq file with read 1
        fq2 (str): Path to fastq file with read 2
        fqU (str): Path to fastq file with unpaired reads
        outdir (str): Path to output directory
        adapter_file (str): Path to adapter file (fasta)
        trimmers (`list` of `str`): Trim commands for trimmomatic
        encoding (str): Quality score encoding
        ncpu (int): Number of CPUs to use
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run

    Returns:
        out1 (str): Path to trimmed fastq file with read 1
        out2 (str): Path to trimmed fastq file with read 2
        outU (str): Path to trimmed fastq file with unpaired reads
        out_summary (str): Path to summary file

        NOTE: the summary path is only part of the returned tuple for
        paired-end input. Single-end input returns the 3-tuple
        ``(None, None, outU)``.

    Raises:
        MissingRequiredArgument: If the input is neither (fq1 and fq2) nor
            fqU alone.
        PipelineStepError: If trimmomatic is not found and the
            ``Trimmomatic`` environment variable is not set.

    """
    # Check inputs
    if fq1 is not None and fq2 is not None and fqU is None:
        input_reads = "paired"  # Paired end
    elif fq1 is None and fq2 is None and fqU is not None:
        input_reads = "single"  # Single end
    else:
        msg = "incorrect input reads; requires either "
        msg += "(--fq1 and --fq2) OR (--fqU)"
        raise MissingRequiredArgument(msg)

    # There are two different ways to call Trimmomatic. If using modules on
    # C1, the path to the jar file is stored in the "$Trimmomatic"
    # environment variable. Otherwise, if using conda, the "trimmomatic"
    # script is in PATH.

    # Check dependencies
    try:
        sysutils.check_dependency('trimmomatic')
        cmd1 = ['trimmomatic']
    except PipelineStepError:
        if 'Trimmomatic' in os.environ:
            cmd1 = ['java', '-jar', '$Trimmomatic']
        else:
            raise

    # Get encoding
    if encoding is None:
        if input_reads == 'single':
            encoding = helpers.guess_encoding(fqU)
        else:
            encoding = helpers.guess_encoding(fq1)

    phred_flag = '-phred33' if encoding == "Phred+33" else '-phred64'

    # Outputs for both single and paired
    out_summary = os.path.join(outdir, 'trimmomatic_summary.out')
    outU = os.path.join(outdir, 'trimmed_U.fastq')

    # String equality, not identity: `is` on str literals only works by
    # accident of interning.
    if input_reads == 'single':
        # Outputs
        out1 = out2 = None

        # Trimmomatic command
        cmd1 += [
            'SE',
            '-threads', '%d' % ncpu,
            phred_flag,
            '-summary', out_summary,
            fqU,
            outU,
        ]
        # Specify trimming steps
        if adapter_file is not None:
            # Switch a paired-end adapter file name to its SE counterpart
            adapter_file = adapter_file.replace('PE', 'SE')
            cmd1.append("ILLUMINACLIP:%s:2:30:10" % adapter_file)
        cmd1 += trimmers

        # Run command
        sysutils.command_runner([cmd1, ], 'trim_reads', quiet, logfile, debug)
        return out1, out2, outU

    elif input_reads == 'paired':
        # Outputs
        out1 = os.path.join(outdir, 'trimmed_1.fastq')
        out2 = os.path.join(outdir, 'trimmed_2.fastq')
        tmp1U = os.path.join(outdir, 'tmp1U.fq')
        tmp2U = os.path.join(outdir, 'tmp2U.fq')

        # Trimmomatic command
        cmd1 += [
            'PE',
            '-threads', '%d' % ncpu,
            phred_flag,
            '-summary', out_summary,
            fq1, fq2,
            out1, tmp1U,
            out2, tmp2U,
        ]
        # Specify trimming steps
        if adapter_file is not None:
            cmd1.append("ILLUMINACLIP:%s:2:30:10" % adapter_file)
        cmd1 += trimmers

        # Concat files command: orphaned mates from both sides are appended
        # to the unpaired output, then the intermediates are removed.
        cmd2 = ['cat', tmp1U, tmp2U, '>>', outU, ]
        cmd3 = ['rm', '-f', tmp1U, tmp2U, ]

        # Run commands
        sysutils.command_runner(
            [cmd1, cmd2, cmd3, ], 'trim_reads', quiet, logfile, debug
        )
        return out1, out2, outU, out_summary
def build_tree(seqs=None,
               data_type='NUC',
               run_full_analysis=None,
               output_name='build_tree.tre',
               outdir='.',
               treedir='hp_build_tree',
               model='GTRGAMMAIX',
               outgroup=None,
               parsimony_seed=1234,
               wgtFile=None,
               secsub=None,
               bootstrap=None,
               bootstrap_threshold=None,
               numCat=None,
               rand_starting_tree=None,
               convergence_criterion=None,
               likelihoodEpsilon=None,
               excludeFileName=None,
               algo_option=None,
               cat_model=None,
               groupingFile=None,
               placementThreshold=None,
               disable_pattern_compression=None,
               InitialRearrangement=None,
               posteriori=None,
               print_intermediate_trees=None,
               majorityrule=None,
               print_branch_length=None,
               ICTCmetrics=None,
               partition_branch_length=None,
               disable_check=None,
               AAmodel=None,
               multiplemodelFile=None,
               binarytree=None,
               BinaryParameterFile=None,
               SecondaryStructure=None,
               UserStartingTree=None,
               median_GAMMA=None,
               version_info=None,
               rate_heterogeneity=None,
               window=None,
               RapidBootstrapNumSeed=None,
               random_addition=None,
               starting_tree=None,
               quartetGroupingFileName=None,
               multipleTreeFile=None,
               NumberofRuns=None,
               mesquite=None,
               silent=None,
               noseqcheck=None,
               nobfgs=None,
               epaPlaceNum=None,
               epaProbThreshold=None,
               epaLikelihood=None,
               HKY85=None,
               BootstrapPerm=None,
               quiet=False,
               logfile=None,
               debug=False,
               keep_tmp=False,
               option_help=None):
    """ Pipeline step to build a tree with RAxML (raxmlHPC)

    RAxML is run in a temporary directory and any recognised ``RAxML_*``
    output files are copied into ``<outdir>/<treedir>``. Most arguments map
    one-to-one onto a raxmlHPC command line option (given in parentheses);
    file arguments are passed relative to the current directory.

    Args:
        seqs (str): Path to input alignment (-s)
        data_type (str): 'NUC', 'AA', 'MULTI' or 'BIN'; checked against model
        run_full_analysis (bool): Run ``-f a`` with 100 rapid bootstraps and
            randomly generated seeds; all other RAxML options except model,
            seqs and output_name are then ignored
        output_name (str): Run name used in output file names (-n)
        outdir (str): Path to output directory
        treedir (str): Name of subdirectory (of outdir) for tree output
        model (str): Substitution model (-m)
        outgroup (str): Outgroup name(s) (-o)
        parsimony_seed (int): Random seed for parsimony inferences (-p)
        wgtFile (str): Column weight file (-a)
        secsub (str): Secondary structure substitution model (-A);
            requires SecondaryStructure
        bootstrap (int): Bootstrap random seed (-b)
        bootstrap_threshold (float): Value for -B
        numCat (int): Value for -c
        rand_starting_tree (bool): Add -d
        convergence_criterion (bool): Add -D
        likelihoodEpsilon (float): Value for -e
        excludeFileName (str): Exclude file (-E)
        algo_option (str): Algorithm selector (-f); silently ignored unless
            it is one of the recognised single-letter codes
        cat_model (bool): Add -F
        groupingFile (str): Constraint tree file (-g)
        placementThreshold (float): Value for -G
        disable_pattern_compression (bool): Add -H
        InitialRearrangement (int): Value for -i
        posteriori (str): Value for -I
        print_intermediate_trees (bool): Add -j
        majorityrule (str): Value for -J; requires multipleTreeFile
        print_branch_length (bool): Add -k
        ICTCmetrics (str): Value for -L
        partition_branch_length (bool): Add -M
        disable_check (bool): Add -O
        AAmodel (str): User-defined AA model file (-P)
        multiplemodelFile (str): Partition model file (-q)
        binarytree (str): Binary constraint tree file (-r)
        BinaryParameterFile (str): Binary model parameter file (-R)
        SecondaryStructure (str): Secondary structure file (-S)
        UserStartingTree (str): Starting tree file (-t)
        median_GAMMA (bool): Add -u
        version_info (bool): Print RAxML version before continuing
        rate_heterogeneity (bool): Add -V
        window (int): Value for -W
        RapidBootstrapNumSeed (int): Rapid bootstrap seed (-x)
        random_addition (bool): Add -X
        starting_tree (bool): Add -y
        quartetGroupingFileName (str): Quartet grouping file (-Y)
        multipleTreeFile (str): File with multiple trees (-z)
        NumberofRuns (int): Value for -N
        mesquite (bool): Add --mesquite
        silent (bool): Add --silent
        noseqcheck (bool): Add --no-seq-check
        nobfgs (bool): Add --no-bfgs
        epaPlaceNum (int): Value for --epa-keep-placements
        epaProbThreshold (float): Value for --epa-prob-threshold
        epaLikelihood (float): Value for --epa-accumulated-threshold
        HKY85 (bool): Add --HKY85
        BootstrapPerm: Value for --bootstop-perms
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run
        keep_tmp (bool): Do not delete temporary directory
        option_help (bool): Print RAxML help and return

    Returns:
        None

    Raises:
        sysutils.PipelineStepError: If no alignment is given, the model does
            not match the data type, or an option is missing the file it
            depends on.
    """
    # Check dependencies
    sysutils.check_dependency('raxmlHPC')

    # Informational modes
    if option_help is True:
        cmd4 = ['raxmlHPC', '-h']
        sysutils.command_runner([cmd4], 'build_tree', quiet, logfile, debug)
        return
    if version_info is True:
        cmd5 = ['raxmlHPC', '-v']
        sysutils.command_runner([cmd5], 'build_tree', quiet, logfile, debug)

    # check for required input (help mode has already returned above)
    if seqs is None:
        msg = 'No alignment provided'
        raise sysutils.PipelineStepError(msg)

    # check model compatibility (string equality, not identity)
    if data_type != 'AA' and 'PROT' in model:
        msg = 'Protein model given for non-amino acid data'
        raise sysutils.PipelineStepError(msg)
    if data_type != 'MULTI' and 'MULTI' in model:
        msg = 'Multi-state model given for non-multi-state data'
        raise sysutils.PipelineStepError(msg)
    if data_type != 'BIN' and 'BIN' in model:
        msg = 'Binary model given for non-binary data'
        raise sysutils.PipelineStepError(msg)
    if data_type != 'NUC':
        # Protein models are identified by 'PROT' (see the first check), so
        # AA data must be matched on that token rather than on 'AA'.
        required_token = 'PROT' if data_type == 'AA' else data_type
        if required_token not in model:
            msg = 'model and data type not compatible'
            raise sysutils.PipelineStepError(msg)

    # Set Output Directory
    output_dir = os.path.join(outdir, treedir)
    cmd0 = ['mkdir -p %s' % output_dir]
    sysutils.command_runner([cmd0], 'build_tree', quiet, logfile, debug)

    # Temporary directory
    tempdir = sysutils.create_tempdir('build_tree', None, quiet, logfile)

    if run_full_analysis is True:
        # generate seeds
        seed1 = random.randint(10000, 99999)
        seed2 = random.randint(10000, 99999)
        cmd1 = [
            'echo',
            'Using parsimony seed %s and bootstrap seed %s' % (seed1, seed2)
        ]
        sysutils.command_runner([cmd1], 'build_tree', quiet, logfile, debug)

        # run raxml
        cmd2 = [
            'raxmlHPC',
            '-w %s' % os.path.abspath(tempdir),
            '-f a',
            '-p %d' % seed1,
            '-x %d' % seed2,
            '-# 100',
            '-m %s' % model,
            '-s %s' % os.path.abspath(seqs),
            '-n %s' % output_name
        ]
        sysutils.command_runner([cmd2], 'build_tree', quiet, logfile, debug)

    else:
        # start raxml command
        cmd1 = [
            'raxmlHPC',
            '-w %s' % os.path.abspath(tempdir),
            '-p %d' % parsimony_seed,
            '-m %s' % model
        ]

        if outgroup is not None:
            cmd1 += ['-o', '%s' % outgroup]
        if wgtFile is not None:
            cmd1 += ['-a', '%s' % os.path.join('.', wgtFile)]
        if secsub is not None:
            if SecondaryStructure is None:
                msg = 'Need to specify a file defining the secondary structure via the S option'
                raise sysutils.PipelineStepError(msg)
            # The matching -S argument is added once, further below.
            cmd1 += ['-A', '%s' % secsub]
        if bootstrap is not None:
            cmd1 += ['-b', '%d' % bootstrap]
        if bootstrap_threshold is not None:
            cmd1 += ['-B', '%f' % bootstrap_threshold]
        if numCat is not None:
            cmd1 += ['-c', '%d' % numCat]
        if rand_starting_tree is True:
            cmd1 += ['-d']
        if convergence_criterion is True:
            cmd1 += ['-D']
        if likelihoodEpsilon is not None:
            cmd1 += ['-e', '%f' % likelihoodEpsilon]
        if excludeFileName is not None:
            cmd1 += ['-E', '%s' % os.path.join('.', excludeFileName)]
        if algo_option is not None and algo_option in [
                'a', 'A', 'b', 'B', 'c', 'C', 'd', 'D', 'e', 'E', 'F', 'g',
                'G', 'h', 'H', 'i', 'I', 'j', 'J', 'k', 'm', 'n', 'N', 'o',
                'p', 'q', 'r', 'R', 's', 'S', 't', 'T', 'U', 'v', 'V', 'w',
                'W', 'x', 'y'
        ]:
            cmd1 += ['-f', '%s' % algo_option]
        if cat_model is True:
            cmd1 += ['-F']
        if groupingFile is not None:
            cmd1 += ['-g', '%s' % os.path.join('.', groupingFile)]
        if placementThreshold is not None:
            cmd1 += ['-G', '%f' % placementThreshold]
        if disable_pattern_compression is True:
            cmd1 += ['-H']
        if InitialRearrangement is not None:
            cmd1 += ['-i', '%d' % InitialRearrangement]
        if posteriori is not None:
            cmd1 += ['-I', '%s' % posteriori]
        if print_intermediate_trees is True:
            cmd1 += ['-j']
        if majorityrule is not None:
            if multipleTreeFile is None:
                msg = 'Need to provide a tree file containing several UNROOTED trees via the z option'
                raise sysutils.PipelineStepError(msg)
            # The matching -z argument is added once, further below.
            cmd1 += ['-J', '%s' % majorityrule]
        if print_branch_length is True:
            cmd1 += ['-k']
        if ICTCmetrics is not None:
            cmd1 += ['-L', '%s' % ICTCmetrics]
        if partition_branch_length is True:
            cmd1 += ['-M']
        if disable_check is True:
            cmd1 += ['-O']
        if AAmodel is not None:
            cmd1 += ['-P', '%s' % os.path.join('.', AAmodel)]
        if multiplemodelFile is not None:
            cmd1 += ['-q', '%s' % os.path.join('.', multiplemodelFile)]
        if binarytree is not None:
            cmd1 += ['-r', '%s' % os.path.join('.', binarytree)]
        if BinaryParameterFile is not None:
            cmd1 += ['-R', '%s' % os.path.join('.', BinaryParameterFile)]
        if SecondaryStructure is not None:
            cmd1 += ['-S', '%s' % os.path.join('.', SecondaryStructure)]
        if UserStartingTree is not None:
            cmd1 += ['-t', '%s' % os.path.join('.', UserStartingTree)]
        if median_GAMMA is True:
            cmd1 += ['-u']
        if rate_heterogeneity is True:
            cmd1 += ['-V']
        if window is not None:
            cmd1 += ['-W', '%d' % window]
        if RapidBootstrapNumSeed is not None:
            cmd1 += ['-x', '%d' % RapidBootstrapNumSeed]
        if random_addition is True:
            cmd1 += ['-X']
        if starting_tree is True:
            cmd1 += ['-y']
        if quartetGroupingFileName is not None:
            cmd1 += [
                '-Y', '%s' % os.path.join('.', quartetGroupingFileName)
            ]
        if multipleTreeFile is not None:
            cmd1 += ['-z', '%s' % os.path.join('.', multipleTreeFile)]
        if NumberofRuns is not None:
            cmd1 += ['-N', '%d' % NumberofRuns]
        if mesquite is True:
            cmd1 += ['--mesquite']
        if silent is True:
            cmd1 += ['--silent']
        if noseqcheck is True:
            cmd1 += ['--no-seq-check']
        if nobfgs is True:
            cmd1 += ['--no-bfgs']
        # Long options need the leading dashes and hyphenated spelling to be
        # recognised as options rather than stray positional text.
        if epaPlaceNum is not None:
            cmd1 += ['--epa-keep-placements=%d' % epaPlaceNum]
        if epaProbThreshold is not None:
            cmd1 += ['--epa-prob-threshold=%f' % epaProbThreshold]
        if epaLikelihood is not None:
            cmd1 += ['--epa-accumulated-threshold=%f' % epaLikelihood]
        if HKY85 is True:
            cmd1 += ['--HKY85']
        if BootstrapPerm is not None:
            cmd1 += ['--bootstop-perms=%s' % BootstrapPerm]

        cmd1 += ['-s', '%s' % os.path.abspath(seqs), '-n', '%s' % output_name]

        sysutils.command_runner(
            [cmd1, ], 'build_tree', quiet, logfile, debug
        )

    # copy files from tmpdir to output directory (only those that exist)
    dest_dir = os.path.abspath(output_dir)
    output_prefixes = [
        'RAxML_bestTree', 'RAxML_info', 'RAxML_perSiteLLs',
        'RAxML_bipartitionFrequencies', 'RAxML_bipartitionsBranchLabels',
        'RAxML_bipartitions', 'RAxML_bootstrap', 'RAxML_checkpoint',
        'RAxML_randomTree', 'RAxML_parsimonyTree', 'RAxML_result',
        'RAxML_log',
    ]
    for prefix in output_prefixes:
        produced = os.path.join(tempdir, '%s.%s' % (prefix, output_name))
        if os.path.exists(produced):
            shutil.copy(produced, dest_dir)

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'build_tree', quiet, logfile)

    cmd3 = [
        'echo',
        'Stage completed. Output files are located here: %s\n' % dest_dir
    ]
    sysutils.command_runner([cmd3, ], 'build_tree', quiet, logfile, debug)
def summary_stats(dir_list=None, ph_list=None, quiet=False, logfile=None,
                  debug=False, amplicons=False, outdir='.'):
    """ Pipeline step to summarize per-sample (and haplotype) statistics

    For every sample directory listed in ``dir_list`` this reads the
    trimmomatic and bowtie2 summaries, indexes ``final.bam`` and runs
    ``samtools idxstats`` on it, and (for amplicon assemblies) reports
    length, read count, coverage, SNP count and theta per amplicon. Results
    are written to ``summary_stats.txt`` and ``summary_stats.tsv`` in
    ``outdir``; haplotype summaries from ``ph_list`` go to the same text
    file and to ``PH_summary_stats.tsv``.

    NOTE: ``final.vcf.gz`` in each sample directory is gunzipped in place.

    Args:
        dir_list (str): Path to file listing sample directories, one per line
        ph_list (str): Path to file listing directories that contain
            ``ph_summary.txt``, one per line
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run
        amplicons (bool): Samples are amplicon assemblies; report per-amplicon
            statistics
        outdir (str): Path to output directory

    Returns:
        None

    Raises:
        MissingRequiredArgument: If ``dir_list`` is not given.
    """
    # check for samtools
    sysutils.check_dependency('samtools')

    # check for dir_list (required)
    if dir_list is None:
        msg = 'no directory list given'
        raise MissingRequiredArgument(msg)

    # Blank lines are dropped up front so that every remaining entry is a
    # sample, wherever the blank lines were in the file.
    with open(dir_list, 'r') as fh:
        filenames = [entry for entry in fh.read().splitlines() if entry]

    phnames = []
    if ph_list is not None:
        with open(ph_list, 'r') as fh:
            phnames = [entry for entry in fh.read().splitlines() if entry]

    tsv_header = []
    tsv_samps = []
    # Defined up front so the later writers work with empty input lists.
    all_amplicons = []
    ph_tsv_header = []
    ph_tsv_samps = []

    with open(os.path.join(outdir, 'summary_stats.txt'), 'w') as outfile:
        for i, sampname in enumerate(filenames):  # for each sample
            # set file names
            bowtiefile = os.path.join(sampname, 'final_bt2.out')
            trimfile = os.path.join(sampname, 'trimmomatic_summary.out')
            bamfile = os.path.join(sampname, 'final.bam')
            outidxstat = os.path.join(sampname, 'final.idxstat.txt')
            finalfina = os.path.join(sampname, 'final.fna')
            vcfzipped = os.path.join(sampname, 'final.vcf.gz')
            vcfunzipped = os.path.join(sampname, 'final.vcf')

            # One tsv column per path component of the sample directory
            num_cols = sampname.count('/') + 1
            if i == 0:  # if the first iteration, create tsv_header
                tsv_header += ['dir_%s' % str(x) for x in range(num_cols)]
                tsv_header += ['RAW', 'CLEAN', 'ALN_RATE']

            # output block 1
            outfile.write("SAMPLE " + "%s:\n" % sampname)
            outfile.write(
                "\t Directory: %s\n" % str(os.path.abspath(sampname))
            )
            raw = search_file(trimfile, "Input Read Pairs").split(' ')[3]
            outfile.write("\t Number of raw read pairs: %s\n" % raw)
            cleaned = search_file(bowtiefile, "reads;").split(' ')[0]
            outfile.write("\t Number of cleaned read pairs: %s\n" % cleaned)
            aln_rate = search_file(
                bowtiefile, "overall alignment rate"
            ).split(' ')[0]
            outfile.write("\t Overall alignment rate: %s\n" % aln_rate)

            # create tsv line
            tsv_samp_temp = sampname.split('/')
            tsv_samp_temp += [str(raw), str(cleaned), str(aln_rate)]

            # index bam file with samtools
            cmd0 = ["samtools index %s" % bamfile]
            sysutils.command_runner(
                [cmd0, ], 'summary_stats', quiet, logfile, debug
            )

            # run idxstats with samtools
            cmd1 = ["samtools idxstats %s > %s" % (bamfile, outidxstat)]
            sysutils.command_runner(
                [cmd1, ], 'summary_stats', quiet, logfile, debug
            )

            # unzip vcf file
            if os.path.isfile(vcfzipped):
                cmd2 = ["gunzip %s" % vcfzipped]
                sysutils.command_runner(
                    [cmd2, ], 'summary_stats', quiet, logfile, debug
                )

            # if amplicon assembly
            if amplicons is True:
                # Rebuilt per sample; the tsv header uses the list from the
                # last sample processed.
                all_amplicons = []
                for record in SeqIO.parse(finalfina, 'fasta'):
                    # Region name is the sixth '|'-separated field of the ID
                    reg_short = record.name.split('|')[5]
                    all_amplicons.append(str(reg_short))

                    # parse outidxstat and output
                    outfile.write("\t\t Amplicon %s:\n" % reg_short)
                    leng = search_file(outidxstat, reg_short).split('\t')[1]
                    outfile.write("\t\t\t Amplicon length: %s\n" % leng)
                    count = search_file(outidxstat, reg_short).split('\t')[2]
                    outfile.write("\t\t\t Amplicon read count: %s\n" % count)

                    # run depth with samtools for coverage
                    dep = os.path.join(
                        sampname, 'final.depth.%s.txt' % reg_short
                    )
                    cmd3 = [
                        "samtools depth -r '%s' %s > %s" % (
                            str(record.name), bamfile, dep
                        )
                    ]
                    sysutils.command_runner(
                        [cmd3, ], 'summary_stats', quiet, logfile, debug
                    )

                    # parse dep file from samtools: one line per covered
                    # position
                    with open(dep) as depfile:
                        lines = sum(1 for line in depfile if len(line) > 0)
                    perc = (lines / int(leng)) * 100

                    # output coverage
                    outfile.write(
                        "\t\t\t Amplicon coverage: %s (%s percent)\n" % (
                            lines, perc
                        )
                    )
                    snps = parse_vcf_file(vcfunzipped, reg_short)
                    outfile.write("\t\t\t Number of SNPS: %s\n" % snps)
                    theta = float(snps) / float(leng)
                    outfile.write("\t\t\t Theta: %1.5f\n\n" % theta)

                    # add to tsv line
                    tsv_samp_temp += [
                        str(leng), str(count), str(lines), str(perc),
                        str(snps), str(theta)
                    ]

            # add line to list for tsv
            tsv_samps += [tsv_samp_temp]

        # HAPLOTYPE FILES
        if phnames:
            outfile.write("\n\nHAPLPOTYPE SUMMARY STATISTICS\n\n")
            for i, phdir in enumerate(phnames):  # for each PH directory
                phfile = os.path.join(phdir, 'ph_summary.txt')

                num_cols = phdir.count('/') + 1
                if i == 0:  # if the first iteration, create ph_tsv_header
                    ph_tsv_header += [
                        'dir_%s' % str(x) for x in range(num_cols)
                    ]
                    ph_tsv_header += [
                        'PH_NUM_HAP', 'PH_HAP_DIVERSITY', 'PH_SEQ_LEN'
                    ]

                # output from parsing phfile
                outfile.write("PH OUTPUT FILE %s:\n" % phfile)
                num_hap = search_file(phfile, "PH_num_hap").split(' ')[1]
                outfile.write("\t Number of haplotypes: %s\n" % num_hap)
                div = search_file(phfile, "PH_hap_diversity").split(' ')[1]
                outfile.write("\t Haplotype diversity: %s\n" % div)
                seq_len = search_file(phfile, "PH_seq_len").split(' ')[1]
                outfile.write("\t Sequence length: %s\n" % seq_len)

                # create tsv line
                ph_tsv_samp_temp = phdir.split('/')
                ph_tsv_samp_temp += [str(num_hap), str(div), str(seq_len)]

                # add line to ph_tsv_samps
                ph_tsv_samps += [ph_tsv_samp_temp]

    # make summary_stats.tsv file
    with open(os.path.join(outdir, 'summary_stats.tsv'), 'w') as outfile:
        if amplicons is True:
            for amp in all_amplicons:
                tsv_header += [
                    '%s_LEN' % amp, '%s_RC' % amp, '%s_COV_NUM' % amp,
                    '%s_COV_PERC' % amp, '%s_SNPS' % amp, '%s_THETA' % amp
                ]
        outfile.write(('\t').join(tsv_header) + '\n')
        for samp in tsv_samps:
            outfile.write(('\t').join(samp) + '\n')

    # make PH_summary_stats.tsv file
    if ph_list is not None:
        ph_tsv_path = os.path.join(outdir, 'PH_summary_stats.tsv')
        with open(ph_tsv_path, 'w') as outfile:
            outfile.write(('\t').join(ph_tsv_header) + '\n')
            for samp in ph_tsv_samps:
                outfile.write(('\t').join(samp) + '\n')

    # ending summary message; report the file where it was actually written
    summary_path = os.path.abspath(os.path.join(outdir, 'summary_stats.txt'))
    cmd3 = [
        'echo',
        'Stage completed. Summary stats are located here: %s\n' % summary_path
    ]
    if amplicons is True:
        cmd3 += ['echo', 'Amplicons: %s\n' % (', ').join(all_amplicons)]
    sysutils.command_runner([cmd3, ], 'summary_stats', quiet, logfile, debug)
def run_mafft(inputseqs=None, out_align="alignment.fasta", auto=None,
              algo=None, sixmerpair=None, globalpair=None, localpair=None,
              genafpair=None, fastapair=None, weighti=None, retree=None,
              maxiterate=None, noscore=None, memsave=None, parttree=None,
              dpparttree=None, fastaparttree=None, partsize=None,
              groupsize=None, lop=None, lep=None, lexp=None, LOP=None,
              LEXP=None, bl=None, jtt=None, tm=None, aamatrix=None,
              fmodel=None, clustalout=None, inputorder=None, reorder=None,
              treeout=None, quiet_mafft=None, nuc=None, amino=None,
              quiet=False, logfile=None, debug=False, ncpu=1, msadir='.',
              phylipout=None):
    """ Build and run a MAFFT command line for a multiple sequence alignment

    Args:
        inputseqs (str): Path to the sequences to align (required)
        out_align (str): Output file name; only its basename is used and the
            file is written inside `msadir`
        algo (str): Optional MAFFT wrapper to call instead of `mafft`
            (linsi, ginsi, einsi, fftnsi, fftns, nwns, nwnsi). When given,
            `--thread` is not added to the command.
        auto, sixmerpair, globalpair, localpair, genafpair, fastapair,
        noscore, memsave, parttree, dpparttree, fastaparttree, fmodel,
        clustalout, inputorder, reorder, treeout, quiet_mafft, nuc, amino
            (bool): When True, the matching MAFFT switch is appended
        weighti, lop, lep, lexp, LOP, LEXP (float): Passed with `%f`
        retree, maxiterate, partsize, groupsize, bl, jtt, tm (int): Passed
            with `%d`
        aamatrix (str): Passed to `--aamatrix`
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run
        ncpu (int): Value for `--thread` (only when `algo` is None)
        msadir (str): Directory receiving the alignment
        phylipout (bool): Also convert the FASTA output to relaxed PHYLIP

    Returns:
        None

    Raises:
        sysutils.PipelineStepError: if no input is given or `algo` is not one
            of the recognised wrapper names
    """
    sysutils.check_dependency('mafft')

    # Without this check the literal string "None" ends up on the command line
    if inputseqs is None:
        raise sysutils.PipelineStepError('No input sequences provided.')

    ## create MAFFT command using input options
    if algo is None:
        cmd1 = ['mafft', '--thread', '%d' % ncpu]
    elif algo in ('linsi', 'ginsi', 'einsi', 'fftnsi', 'fftns', 'nwns',
                  'nwnsi'):
        cmd1 = ['%s' % algo]
    else:
        raise sysutils.PipelineStepError('Algorithm not in MAFFT')

    # (value, switch, format). A format of None marks a boolean switch that is
    # emitted only when the value is exactly True; otherwise the switch and
    # the formatted value are emitted whenever the value is not None.
    # The order below is the order in which options appear on the command line.
    options = (
        (clustalout, '--clustalout', None),
        (inputorder, '--inputorder', None),  # was misspelled '--inputourder'
        (reorder, '--reorder', None),
        (treeout, '--treeout', None),
        (quiet_mafft, '--quiet', None),
        (nuc, '--nuc', None),
        (amino, '--amino', None),
        ### algorithm options
        (auto, '--auto', None),
        (sixmerpair, '--6merpair', None),
        (globalpair, '--globalpair', None),
        (localpair, '--localpair', None),
        (genafpair, '--genafpair', None),
        (fastapair, '--fastapair', None),
        (weighti, '--weighti', '%f'),
        (retree, '--retree', '%d'),
        (maxiterate, '--maxiterate', '%d'),
        (noscore, '--noscore', None),
        (memsave, '--memsave', None),
        (parttree, '--parttree', None),
        (dpparttree, '--dpparttree', None),
        (fastaparttree, '--fastaparttree', None),
        (partsize, '--partsize', '%d'),
        (groupsize, '--groupsize', '%d'),
        ### parameters
        (lop, '--lop', '%f'),
        (lep, '--lep', '%f'),
        (lexp, '--lexp', '%f'),
        (LOP, '--LOP', '%f'),
        (LEXP, '--LEXP', '%f'),
        (bl, '--bl', '%d'),
        (jtt, '--jtt', '%d'),
        (tm, '--tm', '%d'),
        (aamatrix, '--aamatrix', '%s'),
        (fmodel, '--fmodel', None),
    )
    for value, switch, fmt in options:
        if fmt is None:
            if value is True:
                cmd1 += [switch]
        elif value is not None:
            cmd1 += [switch, fmt % value]

    # Outputs
    outName = os.path.join(msadir, '%s' % os.path.basename(out_align))
    # Strip whatever extension the user chose instead of assuming ".fasta"
    out_root = os.path.splitext(outName)[0]

    ## create command
    # NOTE(review): the '>' token only redirects if command_runner executes
    # the joined command through a shell - confirm against sysutils.
    cmd1 += ['%s' % inputseqs, '>', '%s' % outName]

    ## run MAFFT command
    sysutils.command_runner([cmd1, ], 'multiple_align', quiet, logfile, debug)

    if phylipout is True:
        phyout = out_root + '.phy'
        # relaxed allows for long sequence names
        SeqIO.convert(outName, 'fasta', phyout, 'phylip-relaxed')
        cmd2 = ['echo', 'Output converted to PHYLIP format from FASTA format.']
        sysutils.command_runner([cmd2, ], 'multiple_align', quiet, logfile,
                                debug)

    if clustalout is True:
        # MAFFT wrote CLUSTAL text into outName; give it a matching extension
        clustout = out_root + '.aln'
        cmd3 = ['mv', outName, clustout]
        sysutils.command_runner([cmd3, ], 'multiple_align', quiet, logfile,
                                debug)
        cmd4 = ['echo', 'Alignment output is in CLUSTAL format.']
        sysutils.command_runner([cmd4, ], 'multiple_align', quiet, logfile,
                                debug)

    return
def pairwise_align(
        amplicons_fa=None, ref_fa=None, ref_gtf=None, outdir='.',
        keep_tmp=False, quiet=False, logfile=None, debug=False,
):
    """ Pipeline step to align amplicons to reference

    Args:
        amplicons_fa (str): Path to fasta file with amplicon sequences
        ref_fa (str): Path to reference fasta file
        ref_gtf (str): Path to reference GTF file with amplicons
        outdir (str): Path to output directory
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run (not used by this step)

    Returns:
        out_aln (str): Path to alignment in JSON format

    Raises:
        sysutils.PipelineStepError: if an amplicon's region name is not
            present in the GTF file on any reference

    """
    # Check dependencies
    sysutils.check_dependency('blastx')

    # Outputs
    out_aln = os.path.join(outdir, 'alignments.json')

    # Temporary directory
    tempdir = sysutils.create_tempdir('pairwise_align', None, quiet, logfile)

    # Load reference sequence(s)
    refseqs = {s.id: s for s in SeqIO.parse(ref_fa, 'fasta')}

    # Load amplicons from GTF file, keyed by (reference, region name)
    amps = [
        gl for gl in gtfparse.gtf_parser(ref_gtf) if gl.feature == 'amplicon'
    ]
    ampdict = {(gl.chrom, gl.attrs['name']): gl for gl in amps}

    out_json = {
        'aa_alignments': {},
        'nuc_alignments': {},
        'padded_alignments': {},
        'padded_gtf': [],
    }
    # {(sid, ref): [(reg, list(alignment)), ...], ...}
    all_nuc_aln = defaultdict(list)

    for amprec in SeqIO.parse(amplicons_fa, 'fasta'):
        # Get amplicon reference and region from sequence ID
        aid = sequtils.parse_seq_id(amprec.id)

        # Find the GTF line used to orient this amplicon
        try:
            gl = ampdict[(aid['ref'], aid['reg'])]
        except KeyError:
            # Exact (ref, reg) not annotated: fall back to the first amplicon
            # with the same region name on any reference.
            poss_gl = [t for t in ampdict if t[1] == aid['reg']]
            if not poss_gl:
                # Previously surfaced as a bare IndexError
                raise sysutils.PipelineStepError(
                    'Amplicon region "%s" (sequence "%s") was not found in '
                    'GTF file %s' % (aid['reg'], amprec.id, ref_gtf))
            gl = ampdict[poss_gl[0]]

        # Start and stop for primary coding region ("start-end" in the GTF;
        # one is subtracted from the start)
        pri_s = int(gl.attrs['primary_cds'].split('-')[0]) - 1
        pri_e = int(gl.attrs['primary_cds'].split('-')[1])

        # Start and stop for additional coding regions
        altcds = []
        if 'alt_cds' in gl.attrs:
            for x in gl.attrs['alt_cds'].split(','):
                altcds.append(
                    ((int(x.split('-')[0]) - 1), int(x.split('-')[1])))

        # Align using amino acids
        refseq = matching_refseq(refseqs, aid['ref'])
        alnobj, nuc_aln = baln.alignAA(refseq, amprec, (pri_s, pri_e), altcds,
                                       tempdir, quiet)
        # alnobj exposes the amino acid alignment (aa_align); nuc_aln is the
        # nucleotide alignment stored per (sample, reference) below.
        all_nuc_aln[(aid['sid'], aid['ref'])].append((aid['reg'], nuc_aln))
        jid = 'sid|%s|ref|%s|reg|%s|' % (aid['sid'], aid['ref'], aid['reg'])
        out_json['aa_alignments'][jid] = alnobj.aa_align
        out_json['nuc_alignments'][jid] = nuc_aln

    # Full sequence with padding
    for sid, ref in list(all_nuc_aln.keys()):
        _refseq = matching_refseq(refseqs, ref)
        # New name and new alignment
        newname = 'sid|%s|ref|%s|' % (sid, _refseq.id)
        tmp = []
        # Sort all segments by the start position (first field of the first
        # alignment tuple)
        segments = sorted(all_nuc_aln[(sid, ref)], key=lambda x: x[1][0][0])
        # rpos: next reference position to emit; qpos: next query position
        rpos = qpos = 0
        for sname, seg in segments:
            gr = GTFRow()
            gr.chrom, gr.source, gr.feature = (newname, 'haphpipe', 'amplicon')
            gr.score, gr.strand, gr.frame = ('.', '+', '.')
            gr.attrs['name'] = sname

            # Pad up to first position of segment
            if rpos < seg[0][0]:
                for p in range(rpos, seg[0][0]):
                    tmp.append((p, str(_refseq.seq[p]), '*', qpos))
                    qpos += 1
            gr.start = qpos + 1

            for t in seg:
                if t[3] == -1:
                    # Tuples with -1 in the last field are kept unchanged and
                    # do not advance the query position
                    tmp.append(t)
                else:
                    tmp.append((t[0], t[1], t[2], qpos))
                    qpos += 1

            # Add annotation line
            gr.end = qpos
            # Include statistics in attributes
            gr.attrs.update(baln.get_seg_stats(seg))
            # Include called regions
            gr.attrs['call_reg'] = '%d-%d' % (gr.start, gr.end)
            gr.attrs['call_len'] = (gr.end - gr.start + 1)
            # Append to json object
            out_json['padded_gtf'].append(str(gr))
            rpos = seg[-1][0] + 1

        # Add padding for end of sequence
        if rpos < len(_refseq.seq):
            for p in range(rpos, len(_refseq.seq)):
                tmp.append((p, str(_refseq.seq[p]), '*', qpos))
                qpos += 1

        # Validate the alignment; the padded alignment is only stored when
        # validation succeeds
        vseq = ''.join(t[2] for t in tmp if t[3] != -1)
        if baln.validate_alignment(tmp, _refseq.seq, vseq):
            if not quiet:
                print('%s alignment validation passed' % newname,
                      file=sys.stderr)
            out_json['padded_alignments'][newname] = tmp

    for s in out_json['padded_gtf']:
        if not quiet:
            print(s, file=sys.stdout)

    with open(out_aln, 'w') as outh:
        print(json.dumps(out_json), file=outh)

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'pairwise_align', quiet, logfile)

    return out_aln
def join_reads(
        fq1=None, fq2=None, outdir=".",
        min_overlap=None, max_overlap=None, allow_outies=None,
        encoding=None,
        ncpu=1, keep_tmp=False, quiet=False, logfile=None, debug=False,
):
    """ Pipeline step that merges overlapping read pairs with FLASH

    Args:
        fq1 (str): Path to fastq file with read 1
        fq2 (str): Path to fastq file with read 2
        outdir (str): Path to output directory
        min_overlap (int): Value passed to flash `-m`
        max_overlap (int): Value passed to flash `-M`
        allow_outies (bool): Add flash `-O` when True
        encoding (str): Quality score encoding; guessed from fq1 when None
        ncpu (int): Number of CPUs to use
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run

    Returns:
        out1 (str): Path to fastq file with unjoined read 1
        out2 (str): Path to fastq file with unjoined read 2
        outU (str): Path to fastq file with joined reads

    Raises:
        sysutils.PipelineStepError: if either read file is missing

    """
    # Both mates are required
    if fq1 is None or fq2 is None:
        msg = "Incorrect combination of reads: fq1=%s fq2=%s" % (fq1, fq2)
        raise sysutils.PipelineStepError(msg)

    # Check for executable
    sysutils.check_dependency('flash')

    # Fall back to guessing the quality encoding from read 1
    if encoding is None:
        encoding = helpers.guess_encoding(fq1)

    # Final locations of the three FLASH outputs
    outU = os.path.join(outdir, 'joined.fastq')
    out1 = os.path.join(outdir, 'notjoined_1.fastq')
    out2 = os.path.join(outdir, 'notjoined_2.fastq')

    # FLASH writes into a scratch directory first
    tempdir = sysutils.create_tempdir('join_reads', None, quiet, logfile)

    flash_cmd = ['flash', '-t', '%d' % ncpu, '-d', tempdir]
    if encoding != "Phred+33":
        # Anything other than Phred+33 is run with a quality offset of 64
        flash_cmd.extend(['-p', '64'])
    if min_overlap is not None:
        flash_cmd.extend(['-m', '%d' % min_overlap])
    if max_overlap is not None:
        flash_cmd.extend(['-M', '%d' % max_overlap])
    if allow_outies is True:
        flash_cmd.append('-O')
    flash_cmd.extend([fq1, fq2])

    # Move each FLASH product out of the scratch directory
    produced = (
        ('out.extendedFrags.fastq', outU),
        ('out.notCombined_1.fastq', out1),
        ('out.notCombined_2.fastq', out2),
    )
    move_cmds = [
        ['mv', os.path.join(tempdir, fname), dest] for fname, dest in produced
    ]

    sysutils.command_runner([flash_cmd] + move_cmds, 'join_reads', quiet,
                            logfile, debug)

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'join_reads', quiet, logfile)

    return out1, out2, outU
def sample_reads(
        fq1=None, fq2=None, fqU=None, outdir='.',
        nreads=None, frac=None, seed=None,
        quiet=False, logfile=None, debug=False,
):
    """ Pipeline step to subsample reads with seqtk

    Args:
        fq1 (str): Path to fastq file with read 1
        fq2 (str): Path to fastq file with read 2
        fqU (str): Path to fastq file with unpaired reads
        outdir (str): Path to output directory
        nreads (int): Number of reads to sample (used when frac is None)
        frac (float): Fraction of reads to sample; takes precedence over
            nreads
        seed (int): Seed for random number generator; drawn from
            random.randrange(1, 1000) when None
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run

    Returns:
        out1 (str): Path to sampled fastq file with read 1 (None if unused)
        out2 (str): Path to sampled fastq file with read 2 (None if unused)
        outU (str): Path to sampled fastq file with unpaired reads (None if
            unused)

    Raises:
        MissingRequiredArgument: on an unsupported combination of read files,
            or when neither nreads nor frac is given
        sysutils.PipelineStepError: if frac is outside (0, 1]

    """
    # Check inputs
    if fq1 is not None and fq2 is not None and fqU is None:
        input_reads = "paired"  # Paired end
    elif fq1 is None and fq2 is None and fqU is not None:
        input_reads = "single"  # Single end
    elif fq1 is not None and fq2 is not None and fqU is not None:
        input_reads = "both"
    else:
        msg = "incorrect input reads; requires either "
        msg += "(--fq1 AND --fq2) OR (--fqU) OR (--fq1 AND --fq2 AND --fqU)"
        raise MissingRequiredArgument(msg)

    # Check dependencies
    sysutils.check_dependency('seqtk')

    # Set seed
    seed = seed if seed is not None else random.randrange(1, 1000)
    sysutils.log_message('[--- sample_reads ---] Random seed = %d\n' % seed,
                         quiet, logfile)

    # Set nreads/frac
    if frac is not None:
        if frac <= 0 or frac > 1:
            raise sysutils.PipelineStepError('--frac must be > 0 and <= 1.')
        frac_arg = '%f' % frac
    elif nreads is not None:
        frac_arg = '%d' % nreads
    else:
        # Previously this fell through to '%d' % None and raised a TypeError
        raise MissingRequiredArgument(
            'sample size missing; requires either --nreads or --frac')

    # Collect (input, output) pairs; paired files first, then unpaired
    out1 = out2 = outU = None
    jobs = []
    if input_reads in ('paired', 'both'):
        out1 = os.path.join(outdir, 'sample_1.fastq')
        out2 = os.path.join(outdir, 'sample_2.fastq')
        jobs += [(fq1, out1), (fq2, out2)]
    if input_reads in ('single', 'both'):
        outU = os.path.join(outdir, 'sample_U.fastq')
        jobs += [(fqU, outU)]

    # Every file is sampled with the same seed.
    # NOTE(review): presumably this is what keeps read 1 and read 2 in sync -
    # confirm against the seqtk documentation.
    cmds = [
        ['seqtk', 'sample', '-s%d' % seed, src, frac_arg, '>', dest]
        for src, dest in jobs
    ]

    sysutils.command_runner(cmds, 'sample_reads', quiet, logfile, debug)
    return out1, out2, outU
def cliquesnv(fq1=None, fq2=None, fqU=None, ref_fa=None, outdir='.',
              jardir='.', O22min=None, O22minfreq=None, printlog=None,
              single=False, merging=None, fasta_format='extended4',
              outputstart=None, outputend=None, keep_tmp=False, quiet=False,
              logfile=None, debug=False, ncpu=1):
    """ Pipeline step to reconstruct haplotypes with CliqueSNV

    Reads are aligned with bwa to each sequence in `ref_fa`, CliqueSNV is run
    on each alignment, and the results plus a small summary file are written
    to `<outdir>/clique_snv/<region>/`.

    Args:
        fq1 (str): Path to fastq file with read 1
        fq2 (str): Path to fastq file with read 2
        fqU (str): Path to fastq file with unpaired reads
        ref_fa (str): Path to reference fasta file
        outdir (str): Path to output directory
        jardir (str): Directory containing `clique-snv.jar`
        O22min (float): Value for CliqueSNV `-t`
        O22minfreq (float): Value for CliqueSNV `-tf`
        printlog: Adds CliqueSNV `-log` when not None
        single (bool): Treat input as single-end (forced to True when only
            fqU is given)
        merging (str): Value for CliqueSNV `-cm`
        fasta_format (str): Value for CliqueSNV `-fdf`
        outputstart (int): Value for CliqueSNV `-os`
        outputend (int): Value for CliqueSNV `-oe`
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run
        ncpu (int): Value for CliqueSNV `-threads`

    Returns:
        None

    Raises:
        MissingRequiredArgument: if reads, the reference or the JAR file are
            missing
    """
    # check if paired vs. single
    if fq1 is None and fq2 is None and fqU is not None:
        single = True
    # Kept as an equality test so that only an explicit False selects
    # paired-end mode, exactly as before.
    paired = (single == False)  # noqa: E712

    # check dependencies and required arguments
    if fq1 is None and fq2 is None and fqU is None:
        raise MissingRequiredArgument("No fastq files given.")
    if paired and (fq1 is None or fq2 is None):
        raise MissingRequiredArgument("Either fq1 or fq2 missing.")
    if not paired and fqU is None:
        # single=True without fqU used to pass None to bwa
        raise MissingRequiredArgument("fqU missing.")
    if ref_fa is None:
        raise MissingRequiredArgument("Reference FASTA missing.")

    sysutils.check_dependency('samtools')
    sysutils.check_dependency('bwa')

    if os.path.isfile(os.path.join(jardir, "clique-snv.jar")):
        print("CliqueSNV JAR file found.")
    else:
        raise MissingRequiredArgument("No JAR file found.")

    # Temporary directory
    tempdir = sysutils.create_tempdir('clique_snv', None, quiet, logfile)

    # Load reference fasta
    refs = {s.id: s for s in SeqIO.parse(ref_fa, 'fasta')}

    # Identify reconstruction regions: one full-length region per reference
    regions = []
    for rname, s in refs.items():
        regions.append(('cs%02d' % (len(regions) + 1), rname, 1, len(s)))

    sysutils.log_message('[--- Haplotype Reconstruction Regions ---]\n',
                         quiet, logfile)
    for iv in regions:
        sysutils.log_message('%s -- %s:%d-%d\n' % iv, quiet, logfile)

    if paired:
        # remove .1 and .2 from read names
        fq1_c = os.path.join(tempdir, "fq1_corrected.fastq")
        fq2_c = os.path.join(tempdir, "fq2_corrected.fastq")
        # Raw strings: "\." is an invalid escape in a normal string literal
        cmd01 = [r"cat %s | sed 's/\.1 / /' > %s" % (fq1, fq1_c)]
        cmd02 = [r"cat %s | sed 's/\.2 / /' > %s" % (fq2, fq2_c)]
        sysutils.command_runner([cmd01, cmd02], 'clique_snv:setup', quiet,
                                logfile, debug)
        reads = [fq1_c, fq2_c]
    else:
        reads = [fqU]

    # Create alignment for each REFERENCE in the reconstruction regions
    alnmap = {}
    for cs, rname, spos, epos in regions:
        if rname in alnmap:
            continue
        tmp_ref_fa = os.path.join(tempdir, 'ref.%d.fa' % len(alnmap))
        tmp_sam = os.path.join(tempdir, 'aligned.%d.sam' % len(alnmap))
        SeqIO.write(refs[rname], tmp_ref_fa, 'fasta')
        cmd1 = ['bwa', 'index', tmp_ref_fa]
        cmd2 = ['bwa', 'mem', tmp_ref_fa] + reads + [
            '|', 'samtools', 'view', '-h', '-F', '12', '>', tmp_sam,
        ]
        # Drop the bwa index files once the alignment exists
        cmd3 = ['rm', '-f', '%s.*' % tmp_ref_fa]
        sysutils.command_runner([cmd1, cmd2, cmd3], 'clique_snv:setup', quiet,
                                logfile, debug)
        alnmap[rname] = (tmp_ref_fa, tmp_sam)

    # Run CliqueSNV for each region
    cmd4 = ['mkdir -p %s' % os.path.join(outdir, 'clique_snv')]
    sysutils.command_runner([cmd4, ], stage='cliquesnv', quiet=quiet,
                            logfile=logfile, debug=debug)

    i = 0  # index for filenames (matches the aligned.<i>.sam numbering above)
    for cs, rname, spos, epos in regions:
        msg = "Reconstruction region %s:" % cs
        msg += " %s:%d-%d\n" % (rname, spos, epos)
        sysutils.log_message(msg, quiet, logfile)

        # rename the cliquesnv number (cs##) to include region (now: cs##_reg)
        # NOTE(review): takes the second-to-last '|' field of the reference
        # name, so names are assumed to look like '...|<reg>|' - confirm.
        cs = '%s_%s' % (cs, rname.split('|')[-2])

        samfile = os.path.join(tempdir, 'aligned.%d.sam' % i)
        method = 'snv-illumina'
        cmd5 = [
            'java -jar %s -m %s -in %s -threads %d -outDir %s -fdf %s' %
            (os.path.join(jardir, 'clique-snv.jar'), method, samfile, ncpu,
             tempdir, fasta_format)
        ]
        if O22min is not None:
            cmd5 += ['-t %f' % O22min]
        if O22minfreq is not None:
            cmd5 += ['-tf %f' % O22minfreq]
        if printlog is not None:
            cmd5 += ['-log']
        if merging is not None:
            cmd5 += ['-cm %s' % merging]
        if outputstart is not None:
            cmd5 += ['-os %d' % outputstart]
        if outputend is not None:
            cmd5 += ['-oe %d' % outputend]
        sysutils.command_runner([cmd5, ], stage='clique_snv', quiet=quiet,
                                logfile=logfile, debug=debug)

        # copy output files out of the tempdir under the region name
        region_dir = os.path.join(outdir, 'clique_snv/%s' % cs)
        txt_path = os.path.join(outdir, 'clique_snv/%s/%s.txt' % (cs, cs))
        fasta_path = os.path.join(outdir, 'clique_snv/%s/%s.fasta' % (cs, cs))
        summary_path = os.path.join(
            outdir, 'clique_snv/%s/%s_summary.txt' % (cs, cs))
        os.makedirs(region_dir, exist_ok=True)

        tmp_txt = os.path.join(tempdir, 'aligned.%d.txt' % i)
        tmp_fasta = os.path.join(tempdir, 'aligned.%d.fasta' % i)
        if os.path.exists(tmp_txt):
            shutil.copy(tmp_txt, txt_path)
        if os.path.exists(tmp_fasta):
            shutil.copy(tmp_fasta, fasta_path)

        # parse output file
        with open(summary_path, 'w') as sumfile, open(txt_path, 'r') as infile:
            freqs = []
            haps = []
            tempnum = ''
            for line in infile:
                if "SNV got" in line:
                    tempnum = line.split(' ')[2]
                if "frequency" in line:
                    freqs += [float(line.split(' ')[2][:-2])]
                if "haplotype=" in line:
                    haps += [line.split('=')[1][1:-2]]

            sumfile.write('CliqueSNV_num_hap\t%s\n' % tempnum)

            freq_sqrd_sum = sum(x**2 for x in freqs)
            # NOTE(review): old_div on two ints is integer division, so this
            # factor is 1 rather than 7000/6999. Left unchanged so reported
            # values stay the same - confirm the intended correction.
            hap_div = ((old_div(7000, (7000 - 1))) * (1 - freq_sqrd_sum))
            sumfile.write('CliqueSNV_hap_diversity\t%s\n' % hap_div)
            sumfile.write('CliqueSNV_seq_len\t%s\n' % len(haps[0]))

        # Replace the temporary "aligned.<i>" label with the reference name
        with open(fasta_path, 'r') as fastafile:
            fastadata = fastafile.read().replace('aligned.%d' % i, rname)
        with open(fasta_path, 'w') as newfastafile:
            newfastafile.write(fastadata)

        i += 1

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'clique_snv', quiet, logfile)

    return
def predict_haplo(
        fq1=None, fq2=None, ref_fa=None, region_txt=None, outdir='.',
        min_readlength=36,
        keep_tmp=False, quiet=False, logfile=None, debug=False,
):
    """ Pipeline step to assemble haplotypes

    Args:
        fq1 (str): Path to fastq file with read 1
        fq2 (str): Path to fastq file with read 2
        ref_fa (str): Path to reference fasta file
        region_txt (str): Path to region file (one region per line); when
            omitted every reference is reconstructed over its full length
        outdir (str): Path to output directory
        min_readlength (int): Minimum readlength passed to PredictHaplo
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run

    Returns:
        best_fa (list): (region id, path) tuples for the best haplotype files
            (FASTA); the path is None for a region where PredictHaplo failed

    Raises:
        PipelineStepError: if the region file names an unknown reference

    """
    # Check dependencies
    sysutils.check_dependency('PredictHaplo-Paired')
    sysutils.check_dependency('bwa')
    # samtools is used in the alignment pipe below
    sysutils.check_dependency('samtools')

    # Temporary directory
    tempdir = sysutils.create_tempdir('predict_haplo', None, quiet, logfile)

    # Load reference fasta
    refs = {s.id: s for s in SeqIO.parse(ref_fa, 'fasta')}

    # Identify reconstruction regions
    regions = []
    if region_txt:
        sysutils.log_message('Found regions file.\n', quiet, logfile)
        # Context manager so the handle is closed (it used to leak)
        with open(region_txt, 'r') as regh:
            for l in regh:
                l = l.strip()
                if not l:
                    continue  # tolerate blank lines
                rname, spos, epos = sequtils.region_to_tuple(l)
                if rname not in refs:
                    raise PipelineStepError(
                        "ERROR: reference %s not valid" % rname)
                # Missing coordinates default to the whole reference
                spos = 1 if spos is None else spos
                epos = len(refs[rname]) if epos is None else epos
                regions.append(
                    ('PH%02d' % (len(regions) + 1), rname, spos, epos))
    else:
        for rname, s in refs.items():
            regions.append(('PH%02d' % (len(regions) + 1), rname, 1, len(s)))

    sysutils.log_message('[--- Haplotype Reconstruction Regions ---]\n',
                         quiet, logfile)
    for iv in regions:
        sysutils.log_message('%s -- %s:%d-%d\n' % iv, quiet, logfile)

    # Create alignment for each REFERENCE in the reconstruction regions
    alnmap = {}
    for ph, rname, spos, epos in regions:
        if rname not in alnmap:
            # Create alignment
            tmp_ref_fa = os.path.join(tempdir, 'ref.%d.fa' % len(alnmap))
            tmp_sam = os.path.join(tempdir, 'aligned.%d.sam' % len(alnmap))
            SeqIO.write(refs[rname], tmp_ref_fa, 'fasta')
            cmd1 = ['bwa', 'index', tmp_ref_fa]
            cmd2 = [
                'bwa', 'mem', tmp_ref_fa, fq1, fq2, '|',
                'samtools', 'view', '-h', '-F', '12', '>', tmp_sam,
            ]
            # Drop the bwa index files once the alignment exists
            cmd3 = ['rm', '-f', '%s.*' % tmp_ref_fa]
            sysutils.command_runner([cmd1, cmd2, cmd3], 'predict_haplo:setup',
                                    quiet, logfile, debug)
            alnmap[rname] = (tmp_ref_fa, tmp_sam)

    best_fa = []
    # Run PredictHaplo for each REGION
    for ph, rname, spos, epos in regions:
        msg = "Reconstruction region %s:" % ph
        msg += " %s:%d-%d\n" % (rname, spos, epos)
        sysutils.log_message(msg, quiet, logfile)

        # Construct params specific for region
        reg_params = dict(DEFAULTS)
        reg_params['min_readlength'] = min_readlength
        reg_params['reconstruction_start'] = spos
        reg_params['reconstruction_stop'] = epos
        reg_params['prefix'] = '%s_out.' % ph

        # Lookup reference and alignment filename; basenames are used because
        # PredictHaplo is run from inside tempdir
        reg_params['ref_fasta'] = os.path.basename(alnmap[rname][0])
        reg_params['alignment'] = os.path.basename(alnmap[rname][1])

        # Create config file for region
        config_file = '%s.config' % ph
        with open(os.path.join(tempdir, config_file), 'w') as outh:
            tmpconfig = config_template % reg_params
            # '###' in the template stands for a literal '%'
            print(tmpconfig.replace('###', '%'), file=outh)

        try:
            # Run PredictHaplo
            cmd1 = ['cd', tempdir]
            cmd2 = [
                'PredictHaplo-Paired', config_file, '&>',
                '%s.log' % config_file
            ]
            sysutils.command_runner([cmd1, cmd2, ], 'predict_haplo:%s' % ph,
                                    quiet, logfile, debug)

            # Copy files
            dest = os.path.join(outdir, ph)
            if not os.path.exists(dest):
                os.makedirs(dest)
            shutil.copy(os.path.join(tempdir, '%s.config.log' % ph), dest)
            for f in glob(os.path.join(tempdir, '%s_out*global*.fas' % ph)):
                shutil.copy(f, dest)
            for f in glob(os.path.join(tempdir, '%s_out*global*.html' % ph)):
                shutil.copy(f, dest)

            bf, bh = rename_best(dest, ph)
            best_fa.append((ph, bf))
        except PipelineStepError as e:
            # A failed region is reported and recorded; remaining regions
            # are still attempted
            print(e, file=sys.stderr)
            if e.returncode == 139:
                print("PredictHaplo segfaulted", file=sys.stderr)
            best_fa.append((ph, None))

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'predict_haplo', quiet, logfile)

    return best_fa
def model_test(seqs=None, outname='modeltest_results', run_id=None,
               data_type='nt', partitions=None, seed=None, topology='ml',
               utree=None, force=None, asc_bias=None, frequencies=None,
               het=None, models=None, schemes=None, template=None, ncpu=1,
               quiet=False, logfile=None, debug=False, outdir='.',
               keep_tmp=False):
    """ Pipeline step to select a substitution model with ModelTest-NG

    Args:
        seqs (str): Path to the alignment (required)
        outname (str): Base name of the output files
        run_id (str): Optional prefix added to `outname`
        data_type (str): 'nt' or 'aa'
        partitions (str): Value for `-q`
        seed (int): Value for `-r`
        topology (str): Value for `-t` (ml, mp, fixed-ml-jc, fixed-ml-gtr,
            fixed-mp, random, user)
        utree (str): Value for `-u`
        force (bool): Add `--force` when True
        asc_bias (str): Value for `-a` (lewis, felsenstein, stamatakis)
        frequencies (str): Value for `-f` (e, f)
        het (str): Value for `-h` (u, i, g, f)
        models (str): Path to a text file with one model name per line
        schemes (int): Value for `-s` (3, 5, 7, 11, 203)
        template (str): Value for `-T` (raxml, phyml, mrbayes, paup)
        ncpu (int): Value for `-p`
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run
        outdir (str): Path to output directory
        keep_tmp (bool): Do not delete temporary directory

    Returns:
        None

    Raises:
        sysutils.MissingRequiredArgument: if no alignment is given
        sysutils.PipelineStepError: on an invalid option value, or when
            ModelTest-NG fails with a return code other than -6
    """
    # check dependency
    sysutils.check_dependency('modeltest-ng')

    # check required input & input options
    if seqs is None:
        msg = "No alignment given"
        raise sysutils.MissingRequiredArgument(msg)
    if data_type not in ['nt', 'aa']:
        raise sysutils.PipelineStepError("Data type not valid")
    if topology not in [
            'ml', 'mp', 'fixed-ml-jc', 'fixed-ml-gtr', 'fixed-mp', 'random',
            'user'
    ]:
        raise sysutils.PipelineStepError("Topology not valid")

    # make tempdir
    tempdir = sysutils.create_tempdir('model_test', None, quiet, logfile)

    # add prefix
    if run_id is not None:
        outname = run_id + '_' + outname

    # build command
    cmd1 = [
        'modeltest-ng -i %s' % seqs,
        '-t %s' % topology,
        '-o %s' % os.path.join(tempdir, outname),
        '-p %d' % ncpu,
        '-d %s' % data_type
    ]

    if partitions is not None:
        cmd1 += ['-q %s' % partitions]

    if seed is not None:
        cmd1 += ['-r %d' % seed]

    if utree is not None:
        cmd1 += ['-u %s' % utree]

    if force is True:
        cmd1 += ['--force']

    if asc_bias is not None:
        if asc_bias not in ['lewis', 'felsenstein', 'stamatakis']:
            raise sysutils.PipelineStepError("ASC bias correction not valid")
        cmd1 += ['-a %s' % asc_bias]

    if frequencies is not None:
        if frequencies not in ['e', 'f']:
            raise sysutils.PipelineStepError("Frequencies not valid")
        cmd1 += ['-f %s' % frequencies]

    if het is not None:
        if het not in ['u', 'i', 'g', 'f']:
            raise sysutils.PipelineStepError("Rate heterogeneity not valid")
        cmd1 += ['-h %s' % het]

    if models is not None:
        valid_models = {
            'nt': [
                'JC', 'HKY', 'TrN', 'TPM1', 'TPM2', 'TPM3', 'TIM1', 'TIM2',
                'TIM3', 'TVM', 'GTR'
            ],
            'aa': [
                'DAYHOFF', 'LG', 'DCMUT', 'JTT', 'MTREV', 'WAG', 'RTREV',
                'CPREV', 'VT', 'BLOSUM62', 'MTMAM', 'MTART', 'MTZOA', 'PMB',
                'HIVB', 'HIVW', 'JTTDCMUT', 'FLU', 'SMTREV'
            ],
        }
        # One model per line; surrounding whitespace and blank lines ignored
        with open(models, 'r') as f:
            model_list = [m.strip() for m in f if m.strip()]
        for m in model_list:
            if m not in valid_models[data_type]:
                raise sysutils.PipelineStepError(
                    "At least one model is not valid")
        # Comma-separated, no quotes or spaces. The old str(list)[1:-1] form
        # produced "'JC', 'HKY'", which splits into several arguments.
        cmd1 += ['-m %s' % ','.join(model_list)]

    if schemes is not None:
        if schemes not in [3, 5, 7, 11, 203]:
            raise sysutils.PipelineStepError("Schemes not valid")
        cmd1 += ['-s %d' % schemes]

    if template is not None:
        if template not in ['raxml', 'phyml', 'mrbayes', 'paup']:
            raise sysutils.PipelineStepError("Template not valid")
        cmd1 += ['-T %s' % template]

    # run command
    try:
        sysutils.command_runner([cmd1, ], 'model_test', quiet, logfile, debug)
    except sysutils.PipelineStepError as p:
        # Return code -6 is deliberately tolerated; anything else is fatal
        if p.returncode == -6:
            print("Warning: ignoring returncode -6")
        else:
            raise sysutils.PipelineStepError("Error in ModelTest-NG")

    # copy output file and delete tempdir
    tmp_out = os.path.join(tempdir, '%s.out' % outname)
    if os.path.exists(tmp_out):
        shutil.copy(tmp_out, os.path.abspath(outdir))
    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'model_test', quiet, logfile)

    # Parse .out file and write TSV summary file
    out_path = os.path.join(outdir, '%s.out' % outname)
    tsv_path = os.path.join(outdir, '%s_summary.tsv' % outname)
    criteria = []
    bestmods = []
    with open(out_path) as f1:
        for line in f1.read().splitlines():
            # The last space-separated token of each matching line is kept
            if "Best model according to" in line:
                criteria += line.split(' ')[-1:]
            if "Model: " in line:
                bestmods += line.split(' ')[-1:]
    with open(tsv_path, 'w') as f2:
        f2.write('File\tCriteria\tBest Model\n')
        for crit, best in zip(criteria, bestmods):
            f2.write('%s\t%s\t%s\n' % (seqs, crit, best))

    # completion message
    cmd2 = [
        'echo',
        'Stage completed. Output file is located here: %s\n' %
        os.path.abspath(out_path),
        'echo',
        'Summary TSV file is located here: %s\n' % os.path.abspath(tsv_path)
    ]
    sysutils.command_runner([cmd2, ], 'model_test', quiet, logfile, debug)
    return
def assemble_denovo_spades(fq1=None, fq2=None, fqU=None, outdir='.',
                           no_error_correction=False, subsample=None,
                           seed=None, ncpu=1, keep_tmp=False, quiet=False,
                           logfile=None, debug=False, **kwargs):
    """ Pipeline step to assemble reads using spades (denovo)

    Args:
        fq1 (str): Path to fastq file with read 1
        fq2 (str): Path to fastq file with read 2
        fqU (str): Path to fastq file with unpaired reads
        outdir (str): Path to output directory
        no_error_correction (bool): do not perform error correction
        subsample (int): use a subsample of reads for assembly
        seed (int): Seed for random number generator
        ncpu (int): Number of CPUs to use
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run
        **kwargs: Not used.

    Returns:
        out_fa (str): Path to assembled contigs file (fasta format)
        out_summary (str): Path to assembly summary

    Raises:
        MissingRequiredArgument: on an unsupported combination of read files

    """
    # Check inputs
    if fq1 is not None and fq2 is not None and fqU is None:
        input_reads = "paired"  # Paired end
    elif fq1 is None and fq2 is None and fqU is not None:
        input_reads = "single"  # Single end
    elif fq1 is not None and fq2 is not None and fqU is not None:
        input_reads = "both"
    else:
        msg = "incorrect input reads; requires either "
        msg += "(--fq1 AND --fq2) OR (--fqU) OR (--fq1 AND --fq2 AND --fqU)"
        raise MissingRequiredArgument(msg)

    # Check dependencies
    sysutils.check_dependency('spades.py')

    # Outputs
    out_fa = os.path.join(outdir, 'denovo_contigs.fna')
    out_summary = os.path.join(outdir, 'denovo_summary.txt')

    # Temporary directory
    tempdir = sysutils.create_tempdir('assemble_spades', None, quiet, logfile)

    # Subsample: the sampled files are written to tempdir and replace the
    # inputs for the assembly
    if subsample is not None:
        full1, full2, fullU = fq1, fq2, fqU
        fq1, fq2, fqU = sample_reads.sample_reads(
            fq1=full1, fq2=full2, fqU=fullU, outdir=tempdir,
            nreads=subsample, seed=seed,
            quiet=quiet, logfile=logfile, debug=debug)

    # spades command
    cmd1 = ['spades.py', '-o', tempdir, '-t', '%d' % ncpu]
    if input_reads in ['paired', 'both']:
        cmd1 += ['-1', os.path.abspath(fq1), '-2', os.path.abspath(fq2)]
    if input_reads in ['single', 'both']:
        cmd1 += ['-s', os.path.abspath(fqU)]
    if no_error_correction:
        cmd1 += ['--only-assembler']

    sysutils.command_runner([cmd1, ], 'assemble_spades', quiet, logfile, debug)
    shutil.copy(os.path.join(tempdir, 'contigs.fasta'), out_fa)

    if os.path.isfile(out_fa):
        # Plain 'r': the 'U' mode flag was removed in Python 3.11. Both
        # handles are now closed by the context manager.
        with open(out_fa, 'r') as inh, open(out_summary, 'w') as outh:
            sequtils.assembly_stats(inh, outh)

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'assemble_spades', quiet, logfile)

    return out_fa, out_summary
def assemble_amplicons(contigs_fa=None, ref_fa=None, ref_gtf=None, outdir='.',
                       sample_id='sampleXX', padding=50, min_contig_len=200,
                       keep_tmp=False, quiet=False, logfile=None,
                       debug=False):
    """ Pipeline step to assemble contigs using reference and amplicon regions

    Args:
        contigs_fa (str): Path to fasta file with assembled contigs
        ref_fa (str): Path to reference fasta file
        ref_gtf (str): Path to reference GTF file with amplicons
        outdir (str): Path to output directory
        sample_id (str): Name to append to scaffold sequence
        padding (int): Bases to include outside reference annotation
        min_contig_len (int): Minimum contig length for tiling path
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run

    Returns:
        out_assembly (str): Path to assembled amplicons (FASTA)
        out_summary (str): Path to assembly summary
        out_padded (str): Path to padded output file

    """
    # Check dependencies
    sysutils.check_dependency('nucmer')
    sysutils.check_dependency('delta-filter')
    sysutils.check_dependency('show-tiling')

    # Outputs
    out_assembly = os.path.join(outdir, 'amplicon_assembly.fna')
    out_summary = os.path.join(outdir, 'amplicon_summary.txt')
    out_padded = os.path.join(outdir, 'amplicon_padded.out')
    # out_padded is opened in append mode below, so start from a clean file
    if os.path.exists(out_padded):
        os.unlink(out_padded)

    # Temporary directory
    tempdir = sysutils.create_tempdir('assemble_amplicons', None, quiet,
                                      logfile)

    # Create fasta file with sequence IDs only (remove description)
    tmp_contigs_fa = sequtils.clean_seqnames_file(contigs_fa, tempdir)

    # Load reference sequence(s)
    refseqs = {s.id: s for s in SeqIO.parse(ref_fa, 'fasta')}

    # For each amplicon, extract the sequence from the reference and scaffold
    # using nucmer
    amplicon_alignments = []
    amps = [
        gl for gl in gtfparse.gtf_parser(ref_gtf) if gl.feature == 'amplicon'
    ]
    for gl in amps:
        msg = 'Amplicon ref|%s|reg|%s\n' % (gl.chrom, gl.attrs['name'])
        sysutils.log_message(msg, quiet, logfile)

        # Extract reference amplicon, widened by `padding` on both sides and
        # clipped to the reference bounds
        amp_s = max(0, (gl.start - 1) - padding)
        amp_e = min(len(refseqs[gl.chrom]), gl.end + padding)
        ampseq = refseqs[gl.chrom].seq[amp_s:amp_e]
        amplicon_fa = os.path.join(tempdir, 'subject.fa')
        with open(amplicon_fa, 'w') as outh:
            print('>ref|%s|reg|%s' % (gl.chrom, gl.attrs['name']), file=outh)
            print(sequtils.wrap(str(ampseq)), file=outh)

        # Align with nucmer
        fil, til = alignutils.align_nucmer(
            tmp_contigs_fa, amplicon_fa, tempdir,
            min_contig_len=min_contig_len,
            quiet=quiet, logfile=logfile, debug=debug)

        # Skip everything else if debugging
        if debug:
            continue

        # Parse tiling and show alignments.
        # Plain 'r': the 'U' mode flag was removed in Python 3.11. The handle
        # is now closed by the context manager.
        with open(til, 'r') as tilh:
            trows = [alignutils.TilingRow(l) for l in tilh]

        if not trows:
            # No contig tiles this amplicon; recorded as a failure below
            amplicon_alignments.append((gl.chrom, gl.attrs['name'], None))
        else:
            # Initialize alignment
            amp_seq = SeqIO.read(amplicon_fa, 'fasta')
            combined = alignutils.EmptyReferenceAlignment(
                str(amp_seq.seq).lower())
            for tr in trows:
                out = alignutils.show_aligns(tr.ref, tr.qry, fil)
                for nucaln in alignutils.parse_show_aligns(out):
                    combined = combined.merge_alignments(nucaln)
                    with open(out_padded, 'a') as outh:
                        print('%s\n%s\n%s' %
                              (tr, combined.raln(), combined.qaln()),
                              file=outh)
            amplicon_alignments.append(
                (gl.chrom, gl.attrs['name'], combined))

        # Cleanup per-amplicon intermediates before the next iteration
        for f in [fil, til, amplicon_fa]:
            if os.path.isfile(f):
                os.unlink(f)

    # Write to output files
    with open(out_assembly, 'w') as outseq, open(out_summary, 'w') as outsum:
        for ref_id, reg, combined in amplicon_alignments:
            amp_id = sequtils.make_seq_id(sid=sample_id, ref=ref_id, reg=reg)
            if combined is None:
                msg1 = '%s\tFAIL\t%d' % (amp_id, 0)
                msg2 = u'%s\tFAIL\t%d\t%s\n' % (amp_id, 0, u"👎🏼")
                if logfile is not None:
                    print(u'%s\tFAIL\t%d\t%s' % (amp_id, 0, u"👎🏼"),
                          file=logfile)
            else:
                # Only the scaffold sequence is used here
                scaf, _start, _end = combined.scaffold2()
                msg1 = '%s\tPASS\t%d' % (amp_id, len(scaf))
                msg2 = u'%s\tPASS\t%d\t%s\n' % (amp_id, len(scaf), u"👍🏼")
                print('>%s' % (amp_id), file=outseq)
                print('%s' % sequtils.wrap(scaf), file=outseq)

            print(msg1, file=outsum)
            sysutils.log_message(msg2, quiet, logfile)

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'assemble_amplicons', quiet, logfile)

    return out_assembly, out_summary, out_padded
def stageparser(parser):
    """ Register the command-line options for this stage on `parser`

    Options are organized into three argument groups: 'Input/Output',
    'Assembly options' and 'Settings'. The assembly options depend on which
    assemblers pass `sysutils.check_dependency`: `--assembler` is only
    offered when both Trinity and SPAdes are found; when only one is found
    it is set as the `assembler` default instead.

    Args:
        parser (argparse.ArgumentParser): ArgumentParser object

    Returns:
        None

    """
    def _is_available(program):
        # A failed dependency check is reported as PipelineStepError
        try:
            sysutils.check_dependency(program)
        except sysutils.PipelineStepError:
            return False
        return True

    # --- Input/Output -------------------------------------------------------
    io_group = parser.add_argument_group('Input/Output')
    fastq_options = (
        ('--fq1', 'Fastq file with read 1'),
        ('--fq2', 'Fastq file with read 2'),
        ('--fqU', 'Fastq file with unpaired reads'),
    )
    for flag, description in fastq_options:
        io_group.add_argument(flag, type=sysutils.existing_file,
                              help=description)
    io_group.add_argument('--outdir', type=sysutils.existing_dir,
                          default='.', help='Output directory')

    # --- Assembly options ---------------------------------------------------
    asm_group = parser.add_argument_group('Assembly options')
    has_trinity = _is_available('Trinity')
    has_spades = _is_available('spades.py')

    if has_trinity and has_spades:
        # Both assemblers found: let the user choose
        asm_group.add_argument('--assembler', default='spades',
                               choices=['spades', 'trinity', ],
                               help='''Assembler to use.''')
    elif has_trinity or has_spades:
        # Exactly one assembler found: use it without exposing an option
        only_choice = "trinity" if has_trinity else "spades"
        asm_group.set_defaults(assembler=only_choice)

    if has_spades:
        asm_group.add_argument(
            '--no_error_correction', action='store_true',
            help='Do not perform error correction [spades only]')
    if has_trinity:
        asm_group.add_argument(
            '--min_contig_length', type=int, default=200,
            help='''Minimum assembled contig length to report [Trinity only]''')
    asm_group.add_argument('--subsample', type=int,
                           help='Use a subsample of reads for assembly.')
    asm_group.add_argument(
        '--seed', type=int,
        help='''Seed for random number generator (ignored if not subsampling).''')

    # --- Settings -----------------------------------------------------------
    settings_group = parser.add_argument_group('Settings')
    settings_group.add_argument('--ncpu', type=int, default=1,
                                help='Number of CPU to use')
    settings_group.add_argument('--keep_tmp', action='store_true',
                                help='Keep temporary directory')
    settings_group.add_argument(
        '--quiet', action='store_true',
        help='''Do not write output to console (silence stdout and stderr)''')
    settings_group.add_argument('--logfile', type=argparse.FileType('a'),
                                help='Append console output to this file')
    settings_group.add_argument('--debug', action='store_true',
                                help='Print commands but do not run')

    # Entry point invoked for this stage
    parser.set_defaults(func=assemble_denovo)
def align_reads(
        fq1=None,
        fq2=None,
        fqU=None,
        ref_fa=None,
        outdir='.',
        bt2_preset='sensitive-local',
        sample_id='sampleXX',
        no_realign=False,
        remove_duplicates=False,
        encoding=None,
        ncpu=1,
        xmx=sysutils.get_java_heap_size(),
        keep_tmp=False,
        quiet=False,
        logfile=None,
        debug=False,
):
    """ Pipeline step to align reads

    Reads are aligned to the reference with bowtie2, sorted and indexed with
    samtools, passed through picard MarkDuplicates and, unless `no_realign`
    is set, realigned around indels with GATK (RealignerTargetCreator
    followed by IndelRealigner). The final BAM is moved into `outdir` and
    indexed.

    Args:
        fq1 (str): Path to fastq file with read 1
        fq2 (str): Path to fastq file with read 2
        fqU (str): Path to fastq file with unpaired reads
        ref_fa (str): Path to reference fasta file
        outdir (str): Path to output directory
        bt2_preset (str): Bowtie2 preset to use for alignment
        sample_id (str): Read group ID
        no_realign (bool): Do not realign indels
        remove_duplicates (bool): Remove duplicates from final alignment
        encoding (str): Quality score encoding
        ncpu (int): Number of CPUs to use
        xmx (int): Maximum heap size for JVM in GB
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run

    Returns:
        out_aligned (str): Path to aligned BAM file
        out_bt2 (str): Path to bowtie2 report

    Raises:
        MissingRequiredArgument: If the combination of fq1/fq2/fqU is not
            one of (fq1 AND fq2), (fqU) or (fq1 AND fq2 AND fqU).
        sysutils.PipelineStepError: If the final BAM file was not created.
            A failure of the bowtie2 step is re-raised after the bowtie2
            report has been echoed to stderr.

    """
    # Check inputs
    if fq1 is not None and fq2 is not None and fqU is None:
        input_reads = "paired"  # Paired end
    elif fq1 is None and fq2 is None and fqU is not None:
        input_reads = "single"  # Single end
    elif fq1 is not None and fq2 is not None and fqU is not None:
        input_reads = "both"
    else:
        msg = "incorrect input reads; requires either "
        msg += "(--fq1 AND --fq2) OR (--fqU) OR (--fq1 AND --fq2 AND --fqU)"
        raise MissingRequiredArgument(msg)

    # Guess the quality encoding from whichever read file is available
    if encoding is None:
        if input_reads == 'single':
            encoding = helpers.guess_encoding(fqU)
        else:
            encoding = helpers.guess_encoding(fq1)

    # Check dependencies
    sysutils.check_dependency('bowtie2')
    sysutils.check_dependency('samtools')
    sysutils.check_dependency('picard')

    # Identify correct command for GATK
    GATK_BIN = sysutils.determine_dependency_path(['gatk', 'gatk3'])

    # Set JVM heap argument (for GATK); prepended to the GATK commands
    JAVA_HEAP = '_JAVA_OPTIONS="-Xmx%dg"' % xmx

    # Outputs
    out_aligned = os.path.join(outdir, 'aligned.bam')
    out_bt2 = os.path.join(outdir, 'aligned.bt2.out')

    # Temporary directory
    tempdir = sysutils.create_tempdir('align_reads', None, quiet, logfile)

    # Copy and index initial reference
    curref = os.path.join(tempdir, 'initial.fasta')
    cmd1 = ['cp', ref_fa, curref]
    cmd2 = ['samtools', 'faidx', curref]
    cmd3 = [
        'picard', 'CreateSequenceDictionary',
        'R=%s' % curref,
        'O=%s' % os.path.join(tempdir, 'initial.dict'),
    ]
    cmd4 = ['bowtie2-build', curref, os.path.join(tempdir, 'initial')]
    sysutils.command_runner(
        [cmd1, cmd2, cmd3, cmd4], 'align_reads:index', quiet, logfile, debug
    )

    # Align with bowtie2
    cmd5 = [
        'bowtie2',
        '-p', '%d' % ncpu,
        '--phred33' if encoding == "Phred+33" else '--phred64',
        '--no-unal',
        '--rg-id', sample_id,
        '--rg', 'SM:%s' % sample_id,
        '--rg', 'LB:1',
        '--rg', 'PU:1',
        '--rg', 'PL:illumina',
        '--%s' % bt2_preset,
        '-x', '%s' % os.path.join(tempdir, 'initial'),
    ]
    # These are two independent checks (not if/elif) so that "both" mode
    # passes the paired files AND the unpaired file to bowtie2.
    if input_reads in ['paired', 'both', ]:
        cmd5 += ['-1', fq1, '-2', fq2, ]
    if input_reads in ['single', 'both', ]:
        cmd5 += ['-U', fqU, ]
    cmd5 += ['-S', os.path.join(tempdir, 'aligned.bt2.sam'), ]
    # bowtie2 writes its report to stderr; capture it as the bt2 output file
    cmd5 += ['2>', out_bt2, ]

    try:
        sysutils.command_runner(
            [cmd5, ], 'align_reads:bowtie2', quiet, logfile, debug
        )
    except sysutils.PipelineStepError:
        # stderr was redirected to out_bt2, so echo it before re-raising
        if os.path.exists(out_bt2):
            with open(out_bt2, 'r') as fh:
                print('[--- bowtie2 stderr ---]\n%s' % fh.read(),
                      file=sys.stderr)
        raise

    # Convert to sorted, indexed BAM
    cmd6 = [
        'samtools', 'view', '-u', os.path.join(tempdir, 'aligned.bt2.sam'),
        '|',
        'samtools', 'sort',
        '>', os.path.join(tempdir, 'sorted.bam'),
    ]
    cmd7 = ['samtools', 'index', os.path.join(tempdir, 'sorted.bam'), ]
    sysutils.command_runner(
        [cmd6, cmd7, ], 'align_reads:samsort', quiet, logfile, debug
    )
    cur_bam = os.path.join(tempdir, 'sorted.bam')

    if remove_duplicates:
        sysutils.log_message('[--- Removing duplicates ---]', quiet, logfile)
    else:
        sysutils.log_message('[--- Marking duplicates ---]', quiet, logfile)

    # MarkDuplicates
    cmd8 = [
        'picard', 'MarkDuplicates',
        'CREATE_INDEX=true',
        'USE_JDK_DEFLATER=true',
        'USE_JDK_INFLATER=true',
        'M=%s' % os.path.join(tempdir, 'rmdup.metrics.txt'),
        'I=%s' % cur_bam,
        'O=%s' % os.path.join(tempdir, 'rmdup.bam'),
    ]
    if remove_duplicates:
        cmd8 += ['REMOVE_DUPLICATES=true', ]
    sysutils.command_runner(
        [cmd8, ], 'align_reads:markdups', quiet, logfile, debug
    )
    cur_bam = os.path.join(tempdir, 'rmdup.bam')

    if no_realign:
        print('[--- Skipping realignment ---]', file=sys.stderr)
    else:
        # RealignerTargetCreator
        cmd9 = [
            JAVA_HEAP, GATK_BIN,
            '-T', 'RealignerTargetCreator',
            '-I', cur_bam,
            '-R', curref,
            '-o', os.path.join(tempdir, 'tmp.intervals'),
        ]
        # IndelRealigner
        cmd10 = [
            JAVA_HEAP, GATK_BIN,
            '-T', 'IndelRealigner',
            '--use_jdk_deflater',
            '--use_jdk_inflater',
            '-maxReads', '1000000',
            '-dt', 'NONE',
            '-I', cur_bam,
            '-R', curref,
            '-targetIntervals', os.path.join(tempdir, 'tmp.intervals'),
            '-o', os.path.join(tempdir, 'realign.bam'),
        ]
        sysutils.command_runner(
            [cmd9, cmd10, ], 'align_reads:realign', quiet, logfile, debug
        )
        cur_bam = os.path.join(tempdir, 'realign.bam')

    # Check that cur_bam was created
    if not os.path.exists(cur_bam):
        msg = "BAM does not exist: %s" % cur_bam
        raise sysutils.PipelineStepError(msg)

    # Replace any previous output with the final BAM and index it
    cmd11a = ['rm', '-f', out_aligned, ]
    cmd11b = ['mv', cur_bam, out_aligned, ]
    cmd11c = ['samtools', 'index', out_aligned, ]
    sysutils.command_runner(
        [cmd11a, cmd11b, cmd11c, ], 'align_reads:copy', quiet, logfile, debug
    )

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'align_reads', quiet, logfile)

    return out_aligned, out_bt2
def demo(outdir=".", refonly=False):
    """ Download the demo references and, optionally, run the demo

    The reference archive is downloaded with curl into
    `<outdir>/haphpipe_demo`, extracted there with tar, and the archive is
    then removed. Unless `refonly` is set, the `haphpipe_demo` executable is
    run afterwards (this requires `fastq-dump`).

    Args:
        outdir (str): Directory in which `haphpipe_demo` is created
        refonly (bool): Only fetch the references; do not run the demo

    Returns:
        None

    """
    # This file, demo.py, is located within "stages", so the package root is
    # up one directory
    _base = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    # Demo directory; makedirs also creates `outdir` itself when missing
    hpd = os.path.join(outdir, 'haphpipe_demo')
    if not os.path.exists(hpd):
        os.makedirs(hpd)

    refs = os.path.join(hpd, 'refs.tar.gz')

    # download ref command
    cmd1 = [
        'curl', '-L',
        'https://github.com/gwcbi/haphpipe/blob/master/bin/refs.tar.gz?raw=true',
        '>', refs,
    ]
    sysutils.command_runner([cmd1, ], 'refs')

    # unzip refs: extract the archive that was just downloaded (not a path
    # relative to the current directory) so any `outdir` works
    cmd2 = ['tar', '-xzvf', refs, '-C', hpd]
    cmd3 = ['rm', refs]
    sysutils.command_runner([cmd2, cmd3, ], 'refs')

    # NOTE(review): prints the package root to stderr; this looks like
    # leftover debugging output - confirm before removing.
    print(_base, file=sys.stderr)

    if refonly is False:
        print(
            "Setting up demo directories and references in outdirectory %s. Demo samples will now run."
            % hpd)

        # Check for executable
        sysutils.check_dependency("fastq-dump")

        # Demo command
        # NOTE(review): assumes the argument of the haphpipe_demo script is
        # its output directory, so it is pointed at the directory holding
        # the references - confirm against the script.
        cmd1 = ['haphpipe_demo', hpd]
        sysutils.command_runner([cmd1, ], 'demo')
    else:
        # The archive at `refs` has been removed; report the directory the
        # references were extracted into.
        print(
            "Demo was run with --refonly. References are now in outdirectory: %s."
            % hpd)