def assemble_pear(self, input_f, input_r, outdir, name, processes, pearthreads, extraargstring):
    """Uses PEAR to assemble paired F/R read files in run_parallel.

    :param input_f: File path to forward Fastq Reads file or folder.
    :param input_r: File path to reverse Fastq Reads file or folder.
    :param outdir: File path to the output directory.
    :param name: File prefix for the assembled reads.
    :param processes: The maximum number of processes to use.
    :param pearthreads: The number of threads per process to use.
    :param extraargstring: Advanced program parameter string.
    """
    # "~/programs/pear-0.9.4-bin-64/pear-0.9.4-64 -f %s -r %s -o %s -j %s -m %d"
    inputs = validate_paired_fastq_reads(input_f, input_r)
    pool = init_pool(min(len(inputs), processes))
    printVerbose("\tAssembling reads with pear")
    debugPrintInputInfo(inputs, "assemble")
    run_parallel([ProgramRunner(ProgramRunnerCommands.ASSEMBLE_PEAR,
                                [forwards, reverse, "%s/%s_%s" % (outdir, name, getFileName(forwards)),
                                 pearthreads],
                                {"exists": [forwards, reverse], "positive": [pearthreads]},
                                extraargstring)
                  for forwards, reverse in inputs], pool)
    printVerbose("Done assembling sequences...")

    # Grab all the auxiliary files (everything not containing ".assembled.")
    aux_files = getInputFiles(outdir, "*", "*.assembled.*", ignore_empty_files=False)
    # Make an aux dir for extraneous files and move them there
    bulk_move_to_dir(aux_files, makeAuxDir(outdir))
    cleanup_pool(pool)
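
# --- Illustrative usage sketch (hypothetical instance name, paths, and values; not executed) ---
# Assuming `cmd` is an instance of this command class and the forward/reverse reads live in
# sibling folders, a call might look like:
#
#   cmd.assemble_pear(input_f="data/forward_reads/", input_r="data/reverse_reads/",
#                     outdir="1_assembled", name="run1",
#                     processes=4, pearthreads=2, extraargstring="")
#
# PEAR writes <prefix>.assembled.fastq alongside .unassembled.* and .discarded.* files; everything
# not matching "*.assembled.*" is swept into the aux directory by the code above.
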
def demux_fastx(self, input_f, barcodes, outdir, processes, extraargstring):
    """Demuxes using the FASTX Barcode Splitter.

    :param input_f: File path to input file or folder of input files.
    :param barcodes: File path to input barcodes file.
    :param outdir: Filepath to output directory.
    :param processes: Number of processes to use to demux input fileset.
    :param extraargstring: Advanced program parameter string.
    """
    # Get input files
    files_to_split = getInputFiles(input_f)
    # Assign the files shard numbers
    file_id = range(len(files_to_split))
    file_id_pairs = zip(files_to_split, file_id)
    debugPrintInputInfo(files_to_split, "demux")
    pool = init_pool(min(len(file_id_pairs), processes))
    printVerbose("Demuxing sequences...")
    run_parallel([ProgramRunner(ProgramRunnerCommands.DEMUX_FASTX,
                                [input_, barcodes, "%s/" % outdir, "_%d_demux.fastq" % id_],
                                {"exists": [input_, barcodes]}, extraargstring)
                  for input_, id_ in file_id_pairs], pool)
    printVerbose("Demuxed sequences.")

    # Grab all the auxiliary files
    aux_files = getInputFiles(outdir, "unmatched_*", ignore_empty_files=False)
    # Make an aux dir for extraneous files and move them there
    bulk_move_to_dir(aux_files, makeAuxDir(outdir))
    cleanup_pool(pool)
def demux_by_name(self, input_f, barcodes, outdir, filetype, processes, extraargstring):
    """Demuxes using SeqIO.

    :param input_f: File path to input file or folder of input files.
    :param barcodes: File path to input barcodes file.
    :param outdir: Filepath to output directory.
    :param filetype: Either 'fasta' or 'fastq'.
    :param processes: Number of processes to use to demux input fileset.
    :param extraargstring: Advanced program parameter string.
    """
    aux_dir = makeAuxDir(outdir)
    # Get input files
    files_to_split = getInputFiles(input_f)
    # Assign the files shard numbers
    file_id = range(len(files_to_split))
    file_id_pairs = zip(files_to_split, file_id)
    debugPrintInputInfo(files_to_split, "demux")
    pool = init_pool(min(len(file_id_pairs), processes))
    printVerbose("Demuxing sequences...")
    run_parallel([PythonRunner(split_on_name,
                               [input_, barcodes, outdir, id_, filetype],
                               {"exists": [input_]})
                  for input_, id_ in file_id_pairs], pool)

    # Grab all the auxiliary files
    aux_files = getInputFiles(outdir, "unmatched_*", ignore_empty_files=False)
    # Move extraneous files to the aux dir
    bulk_move_to_dir(aux_files, aux_dir)
    cleanup_pool(pool)
def rename_chewbacca(self, input_f, outdir, filetype, clip, processes):
    """Renames sequences in a fasta/fastq file as <filename>_ID0, <filename>_ID1, <filename>_ID2, etc.,
    where <filename> is the name of the fasta/fastq file without any extensions or chewbacca suffixes.

    :param input_f: Filepath to an input file or folder to rename.
    :param outdir: Filepath to the output directory.
    :param filetype: Either 'fasta' or 'fastq'.
    :param clip: If True, remove dereplication counts from sequence names before renaming.
    :param processes: The maximum number of processes to use.
    """
    # Gather input files
    inputs = getInputFiles(input_f)
    debugPrintInputInfo(inputs, "rename")
    pool = init_pool(min(len(inputs), processes))
    printVerbose("Renaming sequences...")
    # Run serialRename in run_parallel
    run_parallel([PythonRunner(serialRename,
                               [input_,
                                "%s/%s_renamed%s" % (outdir, strip_ixes(input_), os.path.splitext(input_)[1]),
                                filetype, clip],
                               {"exists": [input_]})
                  for input_ in inputs], pool)
    printVerbose("Done renaming sequences...")

    samples_dir = makeDirOrdie("%s_samples" % outdir)
    samples_files = getInputFiles(outdir, "*.samples", ignore_empty_files=False)
    bulk_move_to_dir(samples_files, samples_dir)

    aux_dir = makeAuxDir(outdir)
    aux_files = getInputFiles(outdir, "*.mapping", ignore_empty_files=False)
    bulk_move_to_dir(aux_files, aux_dir)
    cleanup_pool(pool)
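
# Illustrative example of the renaming scheme described above (hypothetical file and read names):
# given an input file "SampleA_debarcoded.fastq", its reads are renamed
#   @M01234:55:000000000  ->  @SampleA_ID0
#   @M01234:56:000000000  ->  @SampleA_ID1
# The *.samples files produced alongside are collected into <outdir>_samples, and the *.mapping
# files (presumably recording old-name -> new-name pairs) are moved to the aux directory.
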
def preclean_bayeshammer(self, input_f, input_r, outdir, processes, bayesthreads, extraargstring):
    """Precleans paired (left and right) fastq files/directories using SPAdes BayesHammer.

    :param input_f: File path to file or folder of left reads to clean.
    :param input_r: File path to file or folder of right reads to clean.
    :param outdir: Filepath to output directory.
    :param processes: The maximum number of processes to use.
    :param bayesthreads: The number of threads per process to use.
    :param extraargstring: Advanced program parameter string.
    """
    # Collect input files, and validate that they match
    inputs = validate_paired_fastq_reads(input_f, input_r)
    pool = init_pool(min(len(inputs), processes))
    printVerbose("\tPrecleaning %s reads with SPAdes BayesHammer..." % len(inputs))
    debugPrintInputInfo(inputs, "preclean/fix.")
    run_parallel([ProgramRunner(ProgramRunnerCommands.PRECLEAN_SPADES,
                                [forwards, reverse, outdir, bayesthreads],
                                {"exists": [forwards, reverse], "positive": [bayesthreads]},
                                extraargstring)
                  for forwards, reverse in inputs], pool)
    printVerbose("Done cleaning reads.")

    # Select output files
    aux_files = getInputFiles(outdir, "*", ignore_empty_files=False)
    corrected_dir = "%s/corrected" % outdir
    bulk_move_to_dir(getInputFiles(corrected_dir, "*"), outdir)
    aux_files += getInputFiles(outdir, "*unpaired*", ignore_empty_files=False)
    aux_files += getInputFiles(outdir, "configs", ignore_empty_files=False)

    # Gather aux files
    aux_dir = makeAuxDir(outdir)
    bulk_move_to_dir(aux_files, aux_dir)

    # Rename output files
    output_files = getInputFiles(outdir, "*", "corrected_*")
    for out_file in output_files:
        move(out_file, "%s/%s_corrected.fastq" % (outdir, strip_ixes(out_file)))

    # Move the stray corrected_corrected.fastq artifact (if any) to the aux dir
    try:
        move("%s/corrected_corrected.fastq" % outdir, "%s/corrected_corrected.fastq" % aux_dir)
    except Exception:
        # The file may not exist; ignore it if missing.
        pass
    cleanup_pool(pool)
def query_fasta_vsearch(self, input_f, referencefasta, taxinfo, outdir, processes, simmilarity, coverage,
                        extraargstring):
    """Compare reference sequences to the fasta-formatted query sequences, using global pairwise alignment.

    :param input_f: Filepath to a file or folder of files to identify.
    :param referencefasta: Filepath to a file or folder of files to use as a reference.
    :param taxinfo: Filepath to a file containing taxonomic info correlated with the referencefasta.
    :param outdir: Filepath to the output directory.
    :param processes: The number of processes to use in the identification process.
    :param simmilarity: The % similarity between a query and reference sequence required for positive
                        identification.
    :param coverage: The % coverage of matching regions between a query and reference sequence required for
                     positive identification.
    :param extraargstring: Advanced program parameter string.
    """
    # vsearch --usearch_global %s seeds.pick.fasta --db ../data/BiocodePASSED_SAP.txt --id 0.9 \
    #   --userfields query+target+id+alnlen+qcov --userout %sout --alnout %s alnout.txt

    # Expecting a fasta to annotate
    query_fastas = getInputFiles(input_f)
    debugPrintInputInfo(query_fastas, "queried for identification.")
    ref_fastas = getInputFiles(referencefasta)
    debugPrintInputInfo(ref_fastas, "referenced for sequence identification.")
    tax_info_files = getInputFiles(taxinfo)
    debugPrintInputInfo(tax_info_files, "referenced for taxonomic names.")

    # Make sure the number of reference fasta files is the same as the number of tax_info files
    if len(tax_info_files) != len(ref_fastas):
        print("Error: The number of reference fastas and taxonomic mapping files is not the same. There must be "
              "one taxonomic mapping file for each reference fasta.")
        return
    ref_data_pairs = zip(ref_fastas, tax_info_files)
    inputs = [x for x in product(query_fastas, ref_fastas)]
    aln_user_string = ""
    pool = init_pool(min(len(inputs), processes))

    # VSEARCH ALIGNMENT
    query_vsearch(inputs, outdir, simmilarity, processes, aln_user_string, extraargstring, pool)

    printVerbose("Parsing output...")
    # Parse the alignment results and put those that pass the criteria (e.g. 97% similarity, 85% coverage) in
    # parsed_BIOCODE.out.  Parameters can be changed and this command can be rerun as many times as necessary.
    #
    # parseVSearchOutputAgainstFasta(vsearch_outfile, taxInfo, output_file, min_simmilarity, min_coverage)
    inputs = [x for x in product(query_fastas, ref_data_pairs)]
    debugPrintInputInfo(inputs, "queried against paired references.")
    run_parallel([PythonRunner(parseVSearchOutputAgainstFasta,
                               ["%s/%s.out" % (outdir, strip_ixes(query)), tax_info,
                                "%s/%s.tax" % (outdir, strip_ixes(query)), simmilarity, coverage],
                               {"exists": [query, ref_fasta, tax_info]})
                  for query, (ref_fasta, tax_info) in inputs], pool)
    printVerbose("\nDone parsing...")

    # Gather and move auxiliary files
    aux_files = getInputFiles(outdir, "*", "*.tax", ignore_empty_files=False)
    bulk_move_to_dir(aux_files, makeAuxDir(outdir))
    cleanup_pool(pool)
def query_fasta_db_vsearch(self, input_f, outdir, ref_fasta, ref_db, simmilarity, coverage, processes,
                           extraargstring):
    """Compare reference sequences to the fasta-formatted query sequences, using global pairwise alignment.

    :param input_f: Filepath to a file or folder of files to identify.
    :param outdir: Filepath to the output directory.
    :param ref_fasta: Filepath to the curated fasta file to use as a reference.
    :param ref_db: Filepath to the curated reference database to use for taxonomic lookup.
    :param simmilarity: Minimum % similarity (decimal between 0 and 1) between query and reference sequences
                        required for positive identification.
    :param coverage: Minimum % coverage (decimal between 0 and 1) between query and reference sequences
                     required for positive identification.
    :param processes: The number of processes to use in the identification process.
    :param extraargstring: Advanced program parameter string.
    """
    # blast6 output format http://www.drive5.com/usearch/manual/blast6out.html
    aln_user_string = "--userfields query+target+id+alnlen+qcov"
    # coi_fasta = os.path.expanduser("~/ARMS/refs/COI.fasta")
    # ncbi_db_string = os.path.expanduser("~/ARMS/refs/ncbi.db")
    coi_fasta = ref_fasta
    ncbi_db_string = ref_db

    query_fastas = getInputFiles(input_f)
    debugPrintInputInfo(query_fastas, "queried against the DB.")
    inputs = [x for x in product(query_fastas, [coi_fasta])]
    pool = init_pool(min(len(query_fastas), processes))

    # VSEARCH ALIGNMENT
    query_vsearch(inputs, outdir, simmilarity, processes, aln_user_string, extraargstring, pool)

    printVerbose("Parsing output...")
    # Parse the alignment results and put those that pass the criteria (e.g. 97% similarity, 85% coverage) in
    # parsed_BIOCODE.out.  Parameters can be changed and this command can be rerun as many times as necessary.
    #
    # parseVSearchOutputAgainstNCBI(vsearch_out, ncbi_db, min_coverage, min_similarity) > parsed_nt.out
    run_parallel([PythonRunner(parseVSearchOutputAgainstNCBI,
                               ["%s/%s.out" % (outdir, strip_ixes(query)), ncbi_db_string,
                                "%s/%s.tax" % (outdir, strip_ixes(query)), simmilarity, coverage],
                               {"exists": [query, ncbi_db_string]})
                  for query in query_fastas], pool)
    printVerbose("Done processing.")

    # Gather and move auxiliary files
    aux_files = getInputFiles(outdir, "*", "*.tax", ignore_empty_files=False)
    bulk_move_to_dir(aux_files, makeAuxDir(outdir))
    cleanup_pool(pool)
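
# Note on the alignment output parsed above: with
#   --userfields query+target+id+alnlen+qcov
# each line of the vsearch .out file holds five tab-separated columns,
#   query_id    target_id    %identity    alignment_length    query_coverage
# e.g. (hypothetical record)
#   SampleA_ID7    GU123456    98.2    313    96.5
# parseVSearchOutputAgainstNCBI then keeps the hits that meet the simmilarity/coverage thresholds and
# writes the corresponding taxonomy to the .tax file.
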
def clean_trim_adapters_flexbar(self, input_f, adapters, adaptersrc, outdir, allowedns, processes, extraargstring):
    """Uses flexbar to trim adapters and barcodes from sequences.  By default, Flexbar does not allow any 'N'
    characters in SEQUENCE, and will toss any sequences that do contain 'N'.  To avoid this, use the -u or
    --allowedns flags to specify the maximum number of 'N's to allow.

    :param input_f: Filepath to input file or folder.
    :param adapters: Filepath to a list of adapters.
    :param adaptersrc: Filepath to a list of reverse-complemented adapters.
    :param outdir: Filepath to the output directory.
    :param allowedns: Non-negative integer value indicating the maximum number of 'N's to tolerate in a sequence.
    :param processes: The maximum number of processes to use.
    :param extraargstring: Advanced program parameter string.
    """
    inputs = getInputFiles(input_f)
    pool = init_pool(min(len(inputs), processes))
    debugPrintInputInfo(inputs, "trim adapters from")
    # "flexbar": "flexbar -r \"%s\" -t \"%s\" -ae \"%s\" -a \"%s\"",
    printVerbose("Trimming barcodes and adapters with flexbar")
    temp_file_name_template = "%s/temp_%s"
    debarcoded_file_name_template = "%s/%s_debarcoded"

    # Trim adapters from the left
    run_parallel([ProgramRunner(ProgramRunnerCommands.TRIM_FLEXBAR,
                                [input_file,
                                 temp_file_name_template % (outdir, strip_ixes(input_file)),
                                 "LEFT", adapters, allowedns],
                                {"exists": [input_file, adapters]}, extraargstring)
                  for input_file in inputs], pool)

    temp_files = getInputFiles(outdir, "temp_*")
    debugPrintInputInfo(temp_files, "trim adapters from")

    # Trim the reverse-complemented adapters from the right
    run_parallel([ProgramRunner(ProgramRunnerCommands.TRIM_FLEXBAR,
                                [input_file,
                                 debarcoded_file_name_template % (outdir, strip_ixes(input_file)[5:]),
                                 "RIGHT", adaptersrc, allowedns],
                                {"exists": [input_file, adaptersrc]}, extraargstring)
                  for input_file in temp_files], pool)
    printVerbose("Done Trimming sequences.")

    # Move temp files to the aux dir
    aux_files = getInputFiles(outdir, "temp_*", ignore_empty_files=False)
    bulk_move_to_dir(aux_files, makeAuxDir(outdir))
    cleanup_pool(pool)
def align_clean_macse(self, input_f, ref, samplesdir, outdir, processes, extraargstring=""):
    """Removes non-nucleotide characters in MACSE aligned sequences for all fasta files in the samples directory
    (the samplesdir argument).

    :param input_f: File path to file or folder of files to clean.
    :param ref: Filepath to the reference file used to align the input files.
    :param samplesdir: Filepath to the original, unaligned input files (the inputs to the macse aligner).
    :param outdir: Filepath to the directory to write outputs to.
    :param processes: The maximum number of processes to use.
    :param extraargstring: Advanced program parameter string.
    """
    # "macse_format": "java -jar " + programPaths["MACSE"] + " -prog exportAlignment -align \"%s\"
    #   -charForRemainingFS - -gc_def 5 -out_AA \"%s\" -out_NT \"%s\" -statFile \"%s\""
    inputs = getInputFiles(input_f)
    pool = init_pool(min(len(inputs), processes))
    printVerbose("\tProcessing MACSE alignments")
    samples_list = getInputFiles(samplesdir)
    run_parallel([ProgramRunner(ProgramRunnerCommands.MACSE_FORMAT,
                                ["%s/%s_NT" % (input_f, getFileName(sample)),
                                 "%s/%s_AA_macse.fasta" % (outdir, getFileName(sample)),
                                 "%s/%s_NT_macse.fasta" % (outdir, getFileName(sample)),
                                 "%s/%s_macse.csv" % (outdir, getFileName(sample))],
                                {"exists": ["%s/%s_NT" % (input_f, getFileName(sample))]}, extraargstring)
                  for sample in samples_list], pool)

    printVerbose("\tCleaning MACSE alignments")
    printVerbose("Processing %s samples..." % len(samples_list))
    nt_macse_outs = ["%s/%s_NT_macse.fasta" % (outdir, strip_ixes(sample)) for sample in samples_list]

    # Clean the alignments
    from classes.PythonRunner import PythonRunner
    run_parallel([PythonRunner(remove_refs_from_macse_out,
                               [input_, ref,
                                "%s/%s" % (outdir, "%s_cleaned.fasta" % strip_ixes(input_))],
                               {"exists": [input_, ref]})
                  for input_ in nt_macse_outs], pool)

    # Cat the cleaned alignments
    cleaned_alignments = getInputFiles(outdir, "*_cleaned.fasta")
    merge_files(cleaned_alignments, "%s/MACSE_OUT_MERGED.fasta" % outdir)

    aux_dir = makeAuxDir(outdir)
    aux_files = getInputFiles(outdir, "*", "MACSE_OUT_MERGED.fasta", ignore_empty_files=False)
    bulk_move_to_dir(aux_files, aux_dir)
    cleanup_pool(pool)
def cluster_crop(self, input_f, outdir, groupsfile, processes, blocksize, clustpct, maxmcmc, maxsm, rare,
                 blockcount, extraargstring):
    """Clusters sequences using CROP.

    :param input_f: Filepath to the input fasta file to cluster.
    :param outdir: Filepath to the output directory.
    :param groupsfile: Filepath to the groups file to use as a reference for dereplication counting.
    :param processes: The maximum number of processes to use.
    :param blocksize: Size of blocks to be used for all rounds (if -b is specified, then -z will not affect the
                        first round).  For data sets with different average sequence lengths, this parameter
                        should be tuned so that pairwise alignment of each block does not take too long.
                        Hint for choosing z: z*L < 150,000, where L is the average length of the sequences.
    :param clustpct: The minimum similarity threshold for clustering.  Either 'g' for 95% or 's' for 97%.
    :param maxmcmc: The number of MCMC iterations.  Default: 2000.  Increase this value to enhance accuracy
                        (recommended value is at least 10 * block size).
    :param maxsm: The maximum number of 'split and merge' processes to run.  Max is 20.
    :param rare: The maximum cluster size allowed to be classified as 'rare'.  Clusters are defined as either
                        'abundant' or 'rare'.  'Abundant' clusters are clustered first, then the 'rare' clusters
                        are mapped to the 'abundant' clusters.  Finally, 'rare' clusters which cannot be mapped
                        are clustered separately.  e.g. If r=5, clusters with size <=5 are considered 'rare' in
                        the above procedure.  r=0 yields the best accuracy; if you believe your data is not too
                        diverse to be handled, then r=0 is the best choice.
    :param blockcount: The number of blocks in the first round of clustering.  Hint for choosing -b: each block
                        in the first round should contain about 50 sequences, i.e. b = N/50, where N is the
                        number of input sequences.  Default: # input sequences / z.
    :param extraargstring: Advanced program parameter string.
    """
    # Grab the fasta file(s) to cluster
    inputs = getInputFiles(input_f)
    debugPrintInputInfo(inputs, "clustered")
    pool = init_pool(min(len(inputs), processes))

    # RUN CLUSTERING
    # crop -i %s -o %s -z %s -c %s -e %s -m %s%s
    run_parallel([ProgramRunner(ProgramRunnerCommands.CLUSTER_CROP,
                                [input_, "%s/%s" % (outdir, strip_ixes(input_)), blocksize, clustpct,
                                 maxmcmc, maxsm, rare, blockcount],
                                {"exists": [input_]}, extraargstring)
                  for input_ in inputs], pool)

    # CLEAN THE OUTPUT GROUPS FILE
    printVerbose("Parsing the groups file from clustering")
    clustered_groups_files = getInputFiles(outdir, "*.cluster.list")
    debugPrintInputInfo(clustered_groups_files, "converted to groups files")
    run_parallel([PythonRunner(parseCROPoutToGroups,
                               [input_, "%s/%s_uncount.groups" % (outdir, strip_ixes(input_))],
                               {"exists": [input_]})
                  for input_ in clustered_groups_files], pool)
    printVerbose("Done parsing groups file.")

    # Collect the groups files from clustering with counts removed
    cleaned_clustered_groups_files = getInputFiles(outdir, "*_uncount.groups", ignore_empty_files=False)

    # Resolve the user-specified names file if necessary
    final_groups_files = handle_groups_file_update(outdir, groupsfile, cleaned_clustered_groups_files)

    # GATHER AUX FILES
    input_dir = getDirName(input_f)
    aux_files = cleaned_clustered_groups_files
    aux_files += getInputFiles(input_dir, "*.unique", ignore_empty_files=False)
    aux_files += getInputFiles(input_dir, "*.unique.list", ignore_empty_files=False)
    aux_files += getInputFiles(input_dir, "*.unique.TempCenters.Rare", ignore_empty_files=False)
    aux_files += getInputFiles(outdir, "*.cluster", ignore_empty_files=False)
    aux_files += getInputFiles(outdir, "*.cluster.list", ignore_empty_files=False)
    aux_files += getInputFiles(outdir, "*.log", ignore_empty_files=False)
    aux_files += getInputFiles(".", "LikelihoodRatio.txt", ignore_empty_files=False)

    # Move the final groups file(s) to the groups dir
    groups_dir = makeDirOrdie("%s_groups_files" % outdir)
    bulk_move_to_dir(final_groups_files, groups_dir)

    # Move aux files to the aux dir
    aux_dir = makeAuxDir(outdir)
    bulk_move_to_dir(aux_files, aux_dir)

    # Cleanup the pool
    cleanup_pool(pool)
def dereplicate_vsearch(self, input_f, outdir, groupsfile, processes, stripcounts, extraargstring):
    """Dereplicates with vsearch.

    :param input_f: Filepath to the file or folder of files to dereplicate.
    :param outdir: Filepath to the output directory.
    :param groupsfile: A groups file to use as a reference for replicant counting.  If no groups file is
                        provided, input sequences are considered singletons (regardless of their name-annotated
                        dereplication count).
    :param processes: The number of processes to use to dereplicate the fileset.
    :param stripcounts: If True, strips the trailing dereplication counts from a file before dereplication.
    :param extraargstring: Advanced program parameter string.
    """
    inputs = getInputFiles(input_f)
    pool = init_pool(min(len(inputs), processes))

    # REMOVE COUNTS FROM SEQUENCE NAMES IN ORDER TO CLUSTER PROPERLY
    # Strip counts if we need to.
    if stripcounts:
        printVerbose("Removing counts from sequence names...")
        debugPrintInputInfo(inputs, "renamed")
        run_parallel([PythonRunner(removeCountsFromFastFile,
                                   [input_, "%s/%s_uncount.fasta" % (outdir, strip_ixes(input_)), 'fasta'],
                                   {"exists": [input_]})
                      for input_ in inputs], pool)
        printVerbose("Done removing counts.")

        # Grab the cleaned files as input for the next step
        inputs = getInputFiles(outdir, "*_uncount.fasta")

    # DEREPLICATE
    debugPrintInputInfo(inputs, "dereplicated")
    printVerbose("Dereplicating...")
    run_parallel([ProgramRunner(ProgramRunnerCommands.DEREP_VSEARCH,
                                [processes, input_,
                                 "%s/%s_derep.fasta" % (outdir, strip_ixes(input_)),
                                 "%s/%s_uc.out" % (outdir, strip_ixes(input_))],
                                {"exists": [input_], "positive": [processes]}, extraargstring)
                  for input_ in inputs], pool)
    printVerbose("Done dereplicating")

    # LOG DEREPLICATED SEQUENCES INTO A .GROUPS FILE
    # Generates a .groups file named *_uc_parsed.out
    # python parseUCtoGroups.py uc.out uc_parsed.out
    input_ucs = getInputFiles(outdir, "*_uc.out")
    printVerbose("Generating a groups file from dereplication.")
    debugPrintInputInfo(inputs, "parsed (into a .groups file)")
    run_parallel([PythonRunner(parseUCtoGroups,
                               [input_, "%s/%s_derep.groups" % (outdir, strip_ixes(input_))],
                               {"exists": [input_]})
                  for input_ in input_ucs], pool)
    most_recent_groups_files = getInputFiles(outdir, "*_derep.groups", ignore_empty_files=False)

    # UPDATE THE MOST CURRENT GROUPS FILES WITH DEREPLICATION COUNTS
    if groupsfile is not None:
        # Grab the old groups file and the dereplicated groups file
        old_groups_files = getInputFiles(groupsfile)
        derep_groups_files = getInputFiles(outdir, "*_derep.groups")

        printVerbose("Updating .groups files with dereplicated data")
        printVerbose("%d Reference (old) groups files to be read:" % len(old_groups_files))
        printVerbose(str(old_groups_files))
        printVerbose("%d Dereplicated (new) groups files to be read:" % len(derep_groups_files))
        printVerbose(str(derep_groups_files))

        update_groups(old_groups_files, derep_groups_files, outdir, "dereplicated")
        most_recent_groups_files = getInputFiles(outdir, "dereplicated*", ignore_empty_files=False)
        printVerbose("Done updating .groups files.")

    if len(inputs) != len(most_recent_groups_files):
        print("Error: Number of input fastas (%d) is not equal to the number of groups files (%d)."
              % (len(inputs), len(most_recent_groups_files)))
        exit()
    fasta_groups_pairs = zip(inputs, most_recent_groups_files)

    # ADD COUNT TO SEQUENCE NAMES AND SORT BY COUNT
    # python renameWithReplicantCounts.py
    #   8_macse_out/MACSEOUT_MERGED.fasta uc_parsed.out dereplicated_renamed.fasta
    printVerbose("Adding dereplication data to unique fasta")
    run_parallel([PythonRunner(renameWithReplicantCounts,
                               [fasta, groups, "%s/%s_counts.fasta" % (outdir, strip_ixes(fasta)), 'fasta'],
                               {"exists": [fasta, groups]})
                  for fasta, groups in fasta_groups_pairs], pool)
    printVerbose("Done adding data")

    # Move groups files and aux files to their directories
    aux_dir = makeAuxDir(outdir)
    groups_dir = makeDirOrdie("%s_groups_files" % outdir)
    bulk_move_to_dir(most_recent_groups_files, groups_dir)
    aux_files = getInputFiles(outdir, '*', "*_counts.fasta", ignore_empty_files=False)
    bulk_move_to_dir(aux_files, aux_dir)
    cleanup_pool(pool)
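
# --- Illustrative usage sketch (hypothetical instance name, paths, and values; not executed) ---
# A typical dereplication call, stripping any pre-existing count annotations first:
#
#   cmd.dereplicate_vsearch(input_f="6_renamed/", outdir="7_derep", groupsfile=None,
#                           processes=4, stripcounts=True, extraargstring="")
#
# With groupsfile=None the groups-file update branch is skipped, so every input read contributes
# only its own dereplication count, and the resulting *_derep.groups files are moved to
# 7_derep_groups_files while the *_counts.fasta outputs stay in the output directory.
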
def cluster_vsearch(self, input_f, outdir, groupsfile, processes, idpct, extraargstring):
    """Clusters sequences using VSEARCH.

    :param input_f: A file or folder containing fasta files to cluster.
    :param outdir: The output directory results will be written to.
    :param groupsfile: A groups file or folder containing groups files that describe the input.  Note: if no
                        groups file is supplied, then entries in the fasta file are assumed to be singleton
                        sequences.
    :param processes: The maximum number of processes to use.
    :param idpct: Real number in the range (0, 1] that specifies the minimum similarity threshold for
                        clustering.  e.g. 0.95 indicates that a candidate sequence must be at least 95% similar
                        to the seed sequence to be included in the cluster.
    :param extraargstring: Advanced program parameter string.
    """
    # Grab the fasta file(s) to cluster
    inputs = getInputFiles(input_f)
    debugPrintInputInfo(inputs, "clustered")
    pool = init_pool(min(len(inputs), processes))

    # RUN CLUSTERING
    # " --cluster_size %s -id %f --centroids %s --uc %s",
    run_parallel([ProgramRunner(ProgramRunnerCommands.CLUSTER_VSEARCH,
                                [input_, float(idpct),
                                 "%s/%s_seeds.fasta" % (outdir, strip_ixes(input_)),
                                 "%s/%s_clustered_uc" % (outdir, strip_ixes(input_))],
                                {"exists": [input_]}, extraargstring)
                  for input_ in inputs], pool)

    # PARSE UC FILE TO GROUPS FILE
    printVerbose("Parsing the clustered uc files to groups files")
    clustered_uc_files = getInputFiles(outdir, "*_clustered_uc")
    debugPrintInputInfo(clustered_uc_files, "parsed to groups")
    run_parallel([PythonRunner(parseUCtoGroups,
                               [input_, "%s/%s.groups" % (outdir, strip_ixes(input_))],
                               {"exists": [input_]})
                  for input_ in clustered_uc_files], pool)

    # REMOVE COUNTS FROM CLUSTERING GROUPS FILE
    printVerbose("Cleaning the .groups file from clustering")
    # Grab the current groups file and the new clustered groups file (which needs to be cleaned)
    clustered_groups_files = getInputFiles(outdir, "*_clustered.groups")
    # Remove counts from the clustering groups files
    debugPrintInputInfo(clustered_groups_files, "cleaned")
    run_parallel([PythonRunner(removeCountsFromGroupsFile,
                               [input_, "%s/%s_uncount.groups" % (outdir, strip_ixes(input_))],
                               {"exists": [input_]})
                  for input_ in clustered_groups_files], pool)
    printVerbose("Done cleaning groups files.")

    # Collect the groups file from clustering with counts removed
    cleaned_clustered_groups_files = getInputFiles(outdir, "*_uncount.groups", ignore_empty_files=False)

    # Resolve the user-specified names file if necessary
    final_groups_files = handle_groups_file_update(outdir, groupsfile, cleaned_clustered_groups_files)

    # Move the final groups file(s) to the groups dir
    groups_dir = makeDirOrdie("%s_groups_files" % outdir)
    bulk_move_to_dir(final_groups_files, groups_dir)

    # Move aux files to the aux dir
    aux_files = getInputFiles(outdir, "*", "*_seeds.fasta", ignore_empty_files=False)
    aux_dir = makeAuxDir(outdir)
    bulk_move_to_dir(aux_files, aux_dir)

    # Cleanup the pool
    cleanup_pool(pool)
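
# --- Illustrative usage sketch (hypothetical instance name, paths, and values; not executed) ---
# Clustering dereplicated reads at 95% identity:
#
#   cmd.cluster_vsearch(input_f="7_derep/", outdir="8_clustered", groupsfile="7_derep_groups_files/",
#                       processes=4, idpct=0.95, extraargstring="")
#
# The cluster seeds end up in <outdir>/<name>_seeds.fasta, and the per-cluster membership (with
# counts removed, then merged with the supplied groups files) lands in 8_clustered_groups_files.
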
def cluster_swarm(self, input_f, outdir, groupsfile, processes, extraargstring):
    """Clusters sequences using SWARM.

    :param input_f: A file or folder containing fasta files to cluster.
    :param outdir: The output directory results will be written to.
    :param groupsfile: A groups file or folder containing groups files that describe the input.  Note: if no
                        groups file is supplied, then entries in the fasta file are assumed to be singleton
                        sequences.
    :param processes: The maximum number of processes to use.
    :param extraargstring: Advanced program parameter string.
    """
    # Grab the fasta file(s) to cluster
    inputs = getInputFiles(input_f)
    debugPrintInputInfo(inputs, "clustered")
    pool = init_pool(min(len(inputs), processes))

    # RUN CLUSTERING
    run_parallel([ProgramRunner(ProgramRunnerCommands.CLUSTER_SWARM,
                                [input_, "%s/%s_clustered" % (outdir, strip_ixes(input_)),
                                 "%s/%s_clustered_uc" % (outdir, strip_ixes(input_)),
                                 "%s/%s_clustered_seeds" % (outdir, strip_ixes(input_))],
                                {"exists": [input_]}, extraargstring)
                  for input_ in inputs], pool)

    # PARSE UC FILE TO GROUPS FILE
    printVerbose("Parsing the clustered uc files to groups files")
    clustered_uc_files = getInputFiles(outdir, "*_clustered_uc")
    debugPrintInputInfo(clustered_uc_files, "parsed to groups")
    run_parallel([PythonRunner(parseUCtoGroups,
                               [input_, "%s/%s.groups" % (outdir, strip_ixes(input_))],
                               {"exists": [input_]})
                  for input_ in clustered_uc_files], pool)
    printVerbose("Done parsing groups files.")

    # REMOVE COUNTS FROM CLUSTERING GROUPS FILE
    printVerbose("Cleaning the .groups file from clustering")
    # Grab the current groups file and the new clustered groups file (which needs to be cleaned)
    clustered_groups_files = getInputFiles(outdir, "*_clustered.groups")
    debugPrintInputInfo(clustered_groups_files, "cleaned")
    run_parallel([PythonRunner(removeCountsFromGroupsFile,
                               [input_, "%s/%s_uncount.groups" % (outdir, strip_ixes(input_))],
                               {"exists": [input_]})
                  for input_ in clustered_groups_files], pool)
    printVerbose("Done cleaning groups files.")

    printVerbose("Capitalizing sequences")
    # Convert the seeds files to uppercase (swarm writes in lowercase)
    inputs = getInputFiles(outdir, "*_seeds")
    run_parallel([PythonRunner(capitalize_seqs, [input_, "%s.fasta" % input_], {"exists": [input_]})
                  for input_ in inputs], pool)
    printVerbose("Done capitalizing sequences")

    # Collect the groups file from clustering with counts removed
    cleaned_clustered_groups_files = getInputFiles(outdir, "*_uncount.groups", ignore_empty_files=False)

    # Resolve the user-specified names file if necessary
    final_groups_files = handle_groups_file_update(outdir, groupsfile, cleaned_clustered_groups_files)

    # Move the final groups file(s) to the groups dir
    groups_dir = makeDirOrdie("%s_groups_files" % outdir)
    bulk_move_to_dir(final_groups_files, groups_dir)

    # Move aux files to the aux dir
    aux_files = getInputFiles(outdir, "*", "*_seeds.fasta", ignore_empty_files=False)
    aux_dir = makeAuxDir(outdir)
    bulk_move_to_dir(aux_files, aux_dir)

    # Cleanup the pool
    cleanup_pool(pool)