def query_vsearch(inputs, outdir, simmilarity, processes, aln_user_string, extraargstring, pool):
    """Runs a VSEARCH alignment on pairs of query/reference sequences.

    :param inputs: A list of pairs of (filepaths to) query_fastas and the refrence fastas to compare them to.
    :param outdir: Filepath to the directory where the alignment result should be written.
    :param simmilarity: The minimum simmilarity percentage (between reference and query sequences), \
                            as a decimal between 0 and 1), required for a positive match.
    :param processes: The number of processes to use in the identification process.
    :param aln_user_string: An optional string of commandline parameters passed to the VSEARCH program.
    :param extraargstring: Advanced program parameter string.
    :param pool: A fully initalized multiprocessing.Pool object.
    """
    printVerbose("Aligning against reference sequences...")
    # Equivalent command:
    # vsearch --usearch_global <query> --db <ref> --id <simmilarity> \
    #     --userfields query+target+id+alnlen+qcov --userout <out> --alnout <alnout>
    alignment_jobs = []
    for query_fasta, ref_fasta in inputs:
        # One .out/.alnout pair per query fasta, named after the query.
        out_prefix = "%s/%s" % (outdir, strip_ixes(query_fasta))
        alignment_jobs.append(
            ProgramRunner(ProgramRunnerCommands.ALIGN_VSEARCH,
                          [processes, query_fasta, ref_fasta, simmilarity,
                           "%s.out" % out_prefix, "%s.alnout" % out_prefix,
                           aln_user_string],
                          {"exists": [query_fasta, ref_fasta], "positive": [processes]},
                          extraargstring))
    run_parallel(alignment_jobs, pool)
    printVerbose("Done aligning.")
    return
def query_fasta_vsearch(self, input_f, referencefasta, taxinfo, outdir, processes, simmilarity, coverage,
                        extraargstring):
    """Compare reference sequences to the fasta-formatted query sequences, using global pairwise alignment.

    :param input_f: Filepath to a file or folder of files to identify.
    :param outdir: Filepath to the output directory.
    :param referencefasta: Filepath to a file or folder of files to use as a reference.
    :param taxinfo: Filepath to a file containing taxonomic info correlated with the referencefasta.
    :param simmilarity: The % simmilarity between a query and reference sequence required for positive \
                            identification.
    :param coverage: The % coverage of matching regions between a query and reference sequence required \
                            for positive identification.
    :param processes: The number of processes to use in the identification process.
    :param extraargstring: Advanced program parameter string.
    """
    # Equivalent command:
    # vsearch --usearch_global %s seeds.pick.fasta --db ../data/BiocodePASSED_SAP.txt --id 0.9 \
    #     --userfields query+target+id+alnlen+qcov --userout %sout --alnout %s alnout.txt
    # expecting a fasta to annotate
    query_fastas = getInputFiles(input_f)
    debugPrintInputInfo(query_fastas, "queried for identification.")
    ref_fastas = getInputFiles(referencefasta)
    debugPrintInputInfo(ref_fastas, "referenced for sequence identification.")
    tax_info_files = getInputFiles(taxinfo)
    debugPrintInputInfo(tax_info_files, "referenced for taxanomic names.")

    # make sure the number of reference fasta files is the same as the number of tax_info files
    # (ref_fastas and tax_info_files are zipped positionally below, so the counts must match)
    if len(tax_info_files) != len(ref_fastas):
        print "Error: The number of reference fastas and taxonomic mapping files is not the same. There must be \
            one taxonomic mapping file for each reference fasta."
        return
    ref_data_pairs = zip(ref_fastas, tax_info_files)
    # Every query is aligned against every reference fasta.
    # NOTE(review): query_vsearch writes one "<query>.out" file per query fasta; with more than one
    # reference fasta, later alignments appear to reuse the same output name -- confirm against
    # query_vsearch's output naming.
    inputs = [x for x in product(query_fastas, ref_fastas)]
    aln_user_string = ""
    pool = init_pool(min(len(inputs), processes))

    # VSEARCH ALIGNMENT
    query_vsearch(inputs, outdir, simmilarity, processes, aln_user_string, extraargstring, pool)

    printVerbose("Parsing output...")
    # Parse the alignment results and put those that pass the criterion (97 similarity, 85 coverage) in
    # parsed_BIOCODE.out.  Parameters can be changed and this command can be rerun as many times as necessary
    #
    # parseVSearchOutputAgainstFasta(vsearch_outfile, taxInfo, output_file, min_simmilarity, min_coverage):
    inputs = [x for x in product(query_fastas, ref_data_pairs)]
    debugPrintInputInfo(inputs, "queryied against paired refereces.")
    run_parallel([PythonRunner(parseVSearchOutputAgainstFasta,
                               ["%s/%s.out" % (outdir, strip_ixes(query)), tax_info,
                                "%s/%s.tax" % (outdir, strip_ixes(query)), simmilarity, coverage],
                               {"exists": [query, ref_fasta, tax_info]})
                  for query, (ref_fasta, tax_info) in inputs], pool)
    printVerbose("\nDone parsing...")

    # Gather and move auxillary files (.tax files are the primary output and stay in outdir)
    aux_files = getInputFiles(outdir, "*", "*.tax", ignore_empty_files=False)
    bulk_move_to_dir(aux_files, makeAuxDir(outdir))
    cleanup_pool(pool)
def query_fasta_db_vsearch(self, input_f, outdir, ref_fasta, ref_db, simmilarity, coverage, processes,
                           extraargstring):
    """Compare reference sequences to the fasta-formatted query sequences, using global pairwise alignment.

    :param input_f: Filepath to a file or folder of files to identify.
    :param outdir: Filepath to the output directory.
    :param ref_fasta: Filepath to the curated fasta file to use as a reference.
    :param ref_db: Filepath to the curated database file to use as a reference.
    :param simmilarity: Minimum % simmilarity (decimal between 0 and 1) between query and reference \
                            sequences required for positive identification.
    :param coverage: Minimum % coverage (decimal between 0 and 1) between query and reference \
                            sequences required for positive identification.
    :param processes: The number of processes to use in the identification process.
    :param extraargstring: Advanced program parameter string.
    """
    # blast6 output format http://www.drive5.com/usearch/manual/blast6out.html
    aln_user_string = "--userfields query+target+id+alnlen+qcov"
    # coi_fasta = os.path.expanduser("~/ARMS/refs/COI.fasta")
    # ncbi_db_string = os.path.expanduser("~/ARMS/refs/ncbi.db")
    coi_fasta = ref_fasta
    ncbi_db_string = ref_db
    query_fastas = getInputFiles(input_f)
    debugPrintInputInfo(query_fastas, "queried against the DB.")
    # Pair every query fasta with the single curated reference fasta.
    inputs = [x for x in product(query_fastas, [coi_fasta])]
    pool = init_pool(min(len(query_fastas), processes))

    # VSEARCH ALIGNMENT
    query_vsearch(inputs, outdir, simmilarity, processes, aln_user_string, extraargstring, pool)

    printVerbose("Parsing output...")
    # Parse the alignment results and put those that pass the criterion (97 similarity, 85 coverage) in
    # parsed_BIOCODE.out.  Parameters can be changed and this command can be rerun as many times as necessary
    #
    # parseVSearchOutputAgainstNCBI(vsearch_out, ncbi_db, min_coverage, min_similarity) > parsed_nt.out
    # BUGFIX: the validation key was misspelled "exits" (instead of "exists"), so the
    # file-existence check on the query and database was silently skipped.
    run_parallel([PythonRunner(parseVSearchOutputAgainstNCBI,
                               ["%s/%s.out" % (outdir, strip_ixes(query)), ncbi_db_string,
                                "%s/%s.tax" % (outdir, strip_ixes(query)), simmilarity, coverage],
                               {"exists": [query, ncbi_db_string]})
                  for query in query_fastas], pool)
    printVerbose("Done processing.")

    # Gather and move auxillary files (.tax files are the primary output and stay in outdir)
    aux_files = getInputFiles(outdir, "*", "*.tax", ignore_empty_files=False)
    bulk_move_to_dir(aux_files, makeAuxDir(outdir))
    cleanup_pool(pool)
def clean_trim_adapters_flexbar(self, input_f, adapters, adaptersrc, outdir, allowedns, processes, extraargstring):
    """Use flexbar to trim adapters and barcodes from sequences.  By default, Flexbar does not allow any 'N' \
        characters in SEQUENCE, and will toss any sequences that do contain 'N'.  To avoid this, use the -u or \
        --allowedns flags to specify the maximum number of 'N's to allow.

    :param input_f: Filepath to input file or folder.
    :param adapters: Filepath to a list of adapters.
    :param adaptersrc: Filepath to a list of reverse-complemented adapters.
    :param outdir: Filepath to the output directory.
    :param allowedns: Non-negative integer value indicating the maximum number of 'N's to tolerate in a sequence.
    :param processes: The maximum number of processes to use.
    :param extraargstring: Advanced program parameter string.
    """
    inputs = getInputFiles(input_f)
    pool = init_pool(min(len(inputs), processes))
    debugPrintInputInfo(inputs, "trim adapters from")
    # "flexbar": "flexbar -r \"%s\" -t \"%s\" -ae \"%s\" -a \"%s\"",
    printVerbose("Trimming barcodes and adapters with flexbar")
    temp_file_name_template = "%s/temp_%s"
    debarcoded_file_name_template = "%s/%s_debarcoded"

    # Pass 1: trim adapters from the left, writing "temp_*" intermediates.
    left_trim_jobs = []
    for fastx_file in inputs:
        left_trim_jobs.append(
            ProgramRunner(ProgramRunnerCommands.TRIM_FLEXBAR,
                          [fastx_file, temp_file_name_template % (outdir, strip_ixes(fastx_file)),
                           "LEFT", adapters, allowedns],
                          {"exists": [fastx_file, adapters]}, extraargstring))
    run_parallel(left_trim_jobs, pool)

    temp_files = getInputFiles(outdir, "temp_*")
    debugPrintInputInfo(temp_files, "trim adapters from")

    # Pass 2: trim the reverse-complemented adapters from the right.
    # The [5:] slice drops the "temp_" prefix from the intermediate's stripped name.
    right_trim_jobs = []
    for temp_file in temp_files:
        right_trim_jobs.append(
            ProgramRunner(ProgramRunnerCommands.TRIM_FLEXBAR,
                          [temp_file, debarcoded_file_name_template % (outdir, strip_ixes(temp_file)[5:]),
                           "RIGHT", adaptersrc, allowedns],
                          {"exists": [temp_file, adaptersrc]}, extraargstring))
    run_parallel(right_trim_jobs, pool)
    printVerbose("Done Trimming sequences.")

    # Move temp files out of the output directory.
    aux_files = getInputFiles(outdir, "temp_*", ignore_empty_files=False)
    bulk_move_to_dir(aux_files, makeAuxDir(outdir))
    cleanup_pool(pool)
def rename_chewbacca(self, input_f, outdir, filetype, clip, processes):
    """Renames sequences in a fasta/fastq file as <filename>_ID0, <filename>_ID1, <filename>_ID2, etc., where
        <filename> is the name of the fasta/fastq file without any extensions or chewbacca suffixes.

    :param input_f: Filepath to an input file or folder to rename.
    :param outdir: Filepath to the output directory.
    :param filetype: Either 'fasta' or 'fastq'.
    :param clip: If True, remove dereplication counts from sequence names before renaming.
    :param processes: The maximum number of processes to use.
    """
    # Gather input files
    inputs = getInputFiles(input_f)
    debugPrintInputInfo(inputs, "rename")
    pool = init_pool(min(len(inputs), processes))
    printVerbose("Renaming sequences...")
    # Build one serialRename job per input file; the output keeps the input's extension.
    rename_jobs = []
    for file_to_rename in inputs:
        renamed_path = "%s/%s_renamed%s" % (outdir, strip_ixes(file_to_rename),
                                            os.path.splitext(file_to_rename)[1])
        rename_jobs.append(PythonRunner(serialRename,
                                        [file_to_rename, renamed_path, filetype, clip],
                                        {"exists": [file_to_rename]}))
    run_parallel(rename_jobs, pool)
    printVerbose("Done renaming sequences...")

    # Collect the .samples files into their own directory.
    samples_dir = makeDirOrdie("%s_samples" % outdir)
    samples_files = getInputFiles(outdir, "*.samples", ignore_empty_files=False)
    bulk_move_to_dir(samples_files, samples_dir)

    # Collect the .mapping files into the aux directory.
    aux_dir = makeAuxDir(outdir)
    aux_files = getInputFiles(outdir, "*.mapping", ignore_empty_files=False)
    bulk_move_to_dir(aux_files, aux_dir)
    cleanup_pool(pool)
def clean_quality_trimmomatic(self, input_f, outdir, window_size, quality, min_len, processes, extraargstring):
    """Uses a sliding window to identify and trim away areas of low quality.

    :param input_f: Filepath to input file or folder.
    :param outdir: Filepath to the output directory.
    :param window_size: Width of the sliding window. (Number of consecutive base-pairs to average for quality \
                            analysis).
    :param quality: Minimum quality allowed.  Sections with lower average quality than this will be dropped.
    :param min_len: Minimum allowed length for TRIMMED sequences.  (i.e. if a sequence is too short after \
                            trimming, its dropped.)
    :param processes: Number of processes to use to clean the input fileset.
    :param extraargstring: Advanced program parameter string.
    """
    # Command template:
    # "trimomatic": "java -jar ~/ARMS/programs/Trimmomatic-0.33/trimmomatic-0.33.jar SE \
    #       -%phred %input %output SLIDINGWINDOW:%windowsize:%minAvgQuality MINLEN:%minLen"
    inputs = getInputFiles(input_f)
    debugPrintInputInfo(inputs, "clean")
    pool = init_pool(min(len(inputs), processes))
    printVerbose("Cleaning sequences with Trimmomatic...")
    cleaning_jobs = []
    for fastq_file in inputs:
        cleaned_path = "%s/%s_cleaned.fastq" % (outdir, strip_ixes(fastq_file))
        cleaning_jobs.append(
            ProgramRunner(ProgramRunnerCommands.CLEAN_TRIMMOMATIC,
                          [fastq_file, cleaned_path, window_size, quality, min_len],
                          {"exists": [outdir, fastq_file],
                           "positive": [window_size, quality, min_len]},
                          extraargstring))
    run_parallel(cleaning_jobs, pool)
    printVerbose("Done cleaning sequences.")
    cleanup_pool(pool)
def clean_quality_trimmomatic(self, input_f, outdir, window_size, quality, min_len, processes, extraargstring):
    """Uses a sliding window to identify and trim away areas of low quality.

    :param input_f: Filepath to input file or folder.
    :param outdir: Filepath to the output directory.
    :param window_size: Width of the sliding window. (Number of consecutive base-pairs to average for quality \
                            analysis).
    :param quality: Minimum quality allowed.  Sections with lower average quality than this will be dropped.
    :param min_len: Minimum allowed length for TRIMMED sequences.  (i.e. if a sequence is too short after \
                            trimming, its dropped.)
    :param processes: Number of processes to use to clean the input fileset.
    :param extraargstring: Advanced program parameter string.
    """
    # Command template:
    # "trimomatic": "java -jar ~/ARMS/programs/Trimmomatic-0.33/trimmomatic-0.33.jar SE \
    #       -%phred %input %output SLIDINGWINDOW:%windowsize:%minAvgQuality MINLEN:%minLen"
    inputs = getInputFiles(input_f)
    debugPrintInputInfo(inputs, "clean")
    pool = init_pool(min(len(inputs), processes))
    printVerbose("Cleaning sequences with Trimmomatic...")

    def make_runner(fastq_file):
        # One Trimmomatic invocation per fastq; output is "<stripped-name>_cleaned.fastq".
        return ProgramRunner(ProgramRunnerCommands.CLEAN_TRIMMOMATIC,
                             [fastq_file,
                              "%s/%s_cleaned.fastq" % (outdir, strip_ixes(fastq_file)),
                              window_size, quality, min_len],
                             {"exists": [outdir, fastq_file],
                              "positive": [window_size, quality, min_len]},
                             extraargstring)

    run_parallel([make_runner(fastq_file) for fastq_file in inputs], pool)
    printVerbose("Done cleaning sequences.")
    cleanup_pool(pool)
def preclean_bayeshammer(self, input_f, input_r, outdir, processes, bayesthreads, extraargstring):
    """Precleans (error-corrects) paired reads from two (left and right) fastq files/directories
        using Spades' Bayes Hammer module.

    :param input_f: File path to file or folder of left reads to clean.
    :param input_r: File path to file or folder of right reads to clean.
    :param outdir: Filepath to output directory.
    :param processes: The maximum number of processes to use.
    :param bayesthreads: The number of threads per process to use.
    :param extraargstring: Advanced program parameter string.
    """
    # Collect input files, and validate that they match
    inputs = validate_paired_fastq_reads(input_f, input_r)
    pool = init_pool(min(len(inputs), processes))
    printVerbose("\tPrecleaning %s reads with Spades-Baye's Hammer..." % len(inputs))
    debugPrintInputInfo(inputs, "preclean/fix.")
    run_parallel([ProgramRunner(ProgramRunnerCommands.PRECLEAN_SPADES,
                                [forwards, reverse, outdir, bayesthreads],
                                {"exists": [forwards, reverse], "positive": [bayesthreads]},
                                extraargstring)
                  for forwards, reverse in inputs], pool)
    printVerbose("Done cleaning reads.")

    # Everything currently sitting in outdir is Spades scaffolding, not output.
    aux_files = getInputFiles(outdir, "*", ignore_empty_files=False)
    # Spades writes its corrected reads to a "corrected" subdirectory; surface them in outdir.
    corrected_dir = "%s/corrected" % outdir
    bulk_move_to_dir(getInputFiles(corrected_dir, "*"), outdir)
    # The unpaired reads and config leftovers are auxiliary too.
    aux_files += getInputFiles(outdir, "*unpaired*", ignore_empty_files=False)
    aux_files += getInputFiles(outdir, "configs", ignore_empty_files=False)

    # Gather aux files
    aux_dir = makeAuxDir(outdir)
    bulk_move_to_dir(aux_files, aux_dir)

    # Rename output files to "<stripped-name>_corrected.fastq"
    output_files = getInputFiles(outdir, "*", "corrected_*")
    for out_file in output_files:
        move(out_file, "%s/%s_corrected.fastq" % (outdir, strip_ixes(out_file)))

    # Move the last-minute log file to the aux dir.  Best-effort: the file may not exist.
    # BUGFIX: was a bare "except:", which also swallowed SystemExit/KeyboardInterrupt.
    try:
        move("%s/corrected_corrected.fastq" % outdir, "%s/corrected_corrected.fastq" % aux_dir)
    except Exception:
        pass
    cleanup_pool(pool)
def align_clean_macse(self, input_f, ref, samplesdir, outdir, processes, extraargstring=""):
    """Removes non-nucleotide characters in MACSE aligned sequences for all fasta files in the samples directory
        (the samplesDir argument).

    :param input_f: File path to file or folder of files to clean.
    :param ref: Filepath to the reference file used to align the input files.
    :param samplesdir: Filepath to the original, unaligned input files (the inputs to the macse aligner).
    :param outdir: Filepath to the directory to write outputs to.
    :param processes: The maximum number of processes to use.
    :param extraargstring: Advanced program parameter string.
    """
    # Command template:
    # "macse_format": "java -jar " + programPaths["MACSE"] + " -prog exportAlignment -align \"%s\" \
    #       -charForRemainingFS - -gc_def 5 -out_AA \"%s\" -out_NT \"%s\" -statFile \"%s\""
    inputs = getInputFiles(input_f)
    pool = init_pool(min(len(inputs), processes))
    # NOTE(review): this message contains a stray "%s" that is never formatted and prints literally.
    printVerbose("\t %s Processing MACSE alignments")
    samples_list = getInputFiles(samplesdir)
    # Export each sample's MACSE alignment to AA/NT fastas plus a stats csv, named after the sample.
    run_parallel([ProgramRunner(ProgramRunnerCommands.MACSE_FORMAT,
                                ["%s/%s_NT" % (input_f, getFileName(sample)),
                                 "%s/%s_AA_macse.fasta" % (outdir, getFileName(sample)),
                                 "%s/%s_NT_macse.fasta" % (outdir, getFileName(sample)),
                                 "%s/%s_macse.csv" % (outdir, getFileName(sample))],
                                {"exists": ["%s/%s_NT" % (input_f, getFileName(sample))]},
                                extraargstring)
                  for sample in samples_list], pool)
    printVerbose("\tCleaning MACSE alignments")
    printVerbose("Processing %s samples..." % len(samples_list))
    # NOTE(review): outputs above are named with getFileName(sample) but consumed here via
    # strip_ixes(sample) -- these only match when strip_ixes leaves the plain filename intact; verify.
    nt_macse_outs = ["%s/%s_NT_macse.fasta" % (outdir, strip_ixes(sample)) for sample in samples_list]
    # Clean the alignments: strip reference sequences out of each exported NT alignment.
    from classes.PythonRunner import PythonRunner
    run_parallel([PythonRunner(remove_refs_from_macse_out,
                               [input_, ref,
                                "%s/%s" % (outdir, "%s_cleaned.fasta" % strip_ixes(input_))],
                               {"exists": [input_, ref]})
                  for input_ in nt_macse_outs], pool)
    # Cat the cleaned alignments into a single merged fasta.
    cleaned_alignments = getInputFiles(outdir, "*_cleaned.fasta")
    merge_files(cleaned_alignments, "%s/MACSE_OUT_MERGED.fasta" % outdir)
    # Everything except the merged fasta is auxiliary.
    aux_dir = makeAuxDir(outdir)
    aux_files = getInputFiles(outdir, "*", "MACSE_OUT_MERGED.fasta", ignore_empty_files=False)
    bulk_move_to_dir(aux_files, aux_dir)
    cleanup_pool(pool)
def get_best_hits_from_vsearch(input_fna, ref_fna, outdir, id_pct=.7): """Calls vsearch with an input fasta, and returns a dictionary mapping each sequence to its best hit. (subject to the ID threshold (70%) in vsearch (See ProgramRunnerCommands.ALIGN_VSEARCH). :param input_fna: string. Filepath to the input fna fasta file. :param ref_fna: string. Filepath to the reference fna fasta file. :param outdir: string. Filepath to the output directory for the hits file. :return: {string:string} A dictionary mapping input sequence names to the best hit in the reference DB. """ def best_hits_from_vsearch(v_search_output): best_hits = {} for line in open(v_search_output, 'r'): data = line.split("\t") query_name = data[0].rstrip() if best_hits.has_key(query_name): if float(best_hits[query_name][2].rstrip()) < float( data[2].rstrip()): best_hits[query_name] = data else: best_hits[query_name] = data return best_hits threads = 1 pool = init_pool(threads) #printVerbose.VERBOSE = True print "calling vsearch" processes = 1 aln_user_string = "" extraargstring = "" printVerbose("Aligning against reference sequences...") # # vsearch --usearch_global %s seeds.pick.fasta --db ../data/BiocodePASSED_SAP.txt --id 0.9 \ # --userfields query+target+id+alnlen+qcov --userout %sout --alnout %s alnout.txt ProgramRunner(ProgramRunnerCommands.ALIGN_VSEARCH, [ processes, input_fna, ref_fna, id_pct, "%s/%s.out" % (outdir, strip_ixes(input_fna)), "%s/%s.alnout" % (outdir, strip_ixes(input_fna)), aln_user_string ], { "exists": [input_fna, ref_fna], "positive": [processes] }, extraargstring).run() vsearch_output = "%s/%s.out" % (outdir, strip_ixes(input_fna)) # Choose the best hit return best_hits_from_vsearch(vsearch_output)
def serialRename(input_file, output_fasta_filepath, file_type, clip=True):
    """Takes in a fasta file and outputs a new fasta with the sequences renamed.  Renaming convention is
        x.y.z<n> for x.y.z.fasta, where n is an integer in the range [0:n] where n is the position of the
        sequence in the input_file.  Also writes a groups file, linking each sequence to its parent sample.
        e.g. The sequences in SiteX_SampleA.fasta are renamed:  SiteX_SampleA_0, SiteX_SampleA_1, \
        SiteX_SampleA_2, etc.

    :param input_file: Input fasta or fastq file.
    :param output_fasta_filepath: Filepath for the output .samples file.
    :param file_type: "fasta" or "fastq"
    :param clip: True if filenames contain file_ID#s.  Will clip the IDs before renaming to get proper \
                    sequence names.
    """
    out_dir = os.path.dirname(output_fasta_filepath)
    seq_prefix = strip_ixes(input_file)
    samples_file = "%s/%s_renamed.samples" % (out_dir, seq_prefix)
    name_map_file = "%s/%s_renamed.mapping" % (out_dir, seq_prefix)

    seq_writer = BufferedSeqWriter(output_fasta_filepath, file_type)
    mapping_writer = BufferedFileWriter(name_map_file)
    samples_writer = BufferedFileWriter(samples_file)

    seq_number = 0
    for record in SeqIO.parse(input_file, file_type):
        seq_number += 1
        # Record the old_name -> new_name mapping
        old_id = record.id
        record.id = "%s_ID%s" % (seq_prefix, seq_number)
        mapping_writer.write("%s\t%s" % (old_id, record.id))
        # Record the sequence -> sample mapping; optionally clip dereplication counts first.
        if clip:
            sample_name = clip_count(seq_prefix)
        else:
            sample_name = seq_prefix
        samples_writer.write("%s\t%s" % (record.id, sample_name))
        # Write the renamed sequence
        record.description = ""
        seq_writer.write(record)

    seq_writer.flush()
    mapping_writer.flush()
    samples_writer.flush()
def get_best_hits_from_vsearch(input_fna, ref_fna, outdir, id_pct=.7): """Calls vsearch with an input fasta, and returns a dictionary mapping each sequence to its best hit. (subject to the ID threshold (70%) in vsearch (See ProgramRunnerCommands.ALIGN_VSEARCH). :param input_fna: string. Filepath to the input fna fasta file. :param ref_fna: string. Filepath to the reference fna fasta file. :param outdir: string. Filepath to the output directory for the hits file. :return: {string:string} A dictionary mapping input sequence names to the best hit in the reference DB. """ def best_hits_from_vsearch(v_search_output): best_hits = {} for line in open(v_search_output, 'r'): data = line.split("\t") query_name = data[0].rstrip() if best_hits.has_key(query_name): if float(best_hits[query_name][2].rstrip()) < float(data[2].rstrip()): best_hits[query_name] = data else: best_hits[query_name] = data return best_hits threads = 1 pool = init_pool(threads) #printVerbose.VERBOSE = True print "calling vsearch" processes=1 aln_user_string="" extraargstring="" printVerbose("Aligning against reference sequences...") # # vsearch --usearch_global %s seeds.pick.fasta --db ../data/BiocodePASSED_SAP.txt --id 0.9 \ # --userfields query+target+id+alnlen+qcov --userout %sout --alnout %s alnout.txt ProgramRunner(ProgramRunnerCommands.ALIGN_VSEARCH, [processes, input_fna, ref_fna, id_pct, "%s/%s.out" % (outdir, strip_ixes(input_fna)), "%s/%s.alnout" % (outdir, strip_ixes(input_fna)), aln_user_string], {"exists": [input_fna, ref_fna], "positive": [processes]}, extraargstring).run() vsearch_output = "%s/%s.out" % (outdir, strip_ixes(input_fna)) # Choose the best hit return best_hits_from_vsearch(vsearch_output)
def get_best_hits_from_vsearch(input_fna, ref_fna, outdir): def best_hits_from_vsearch(v_search_output): best_hits = {} for line in open(v_search_output, 'r'): data = line.split("\t") query_name = data[0].rstrip() if best_hits.has_key(query_name): if float(best_hits[query_name][2].rstrip()) < float(data[2].rstrip()): best_hits[query_name] = data else: best_hits[query_name] = data return best_hits threads = 1 pool = init_pool(threads) #printVerbose.VERBOSE = True print "calling vsearch" # Search for good hits inputs = [(input_fna, ref_fna)] processes=1 aln_user_string="" extraargstring="" printVerbose("Aligning against reference sequences...") # # vsearch --usearch_global %s seeds.pick.fasta --db ../data/BiocodePASSED_SAP.txt --id 0.9 \ # --userfields query+target+id+alnlen+qcov --userout %sout --alnout %s alnout.txt ProgramRunner(ProgramRunnerCommands.ALIGN_VSEARCH, [processes, input_fna, ref_fna, "%s/%s.out" % (outdir, strip_ixes(input_fna)), "%s/%s.alnout" % (outdir, strip_ixes(input_fna)), aln_user_string], {"exists": [input_fna, ref_fna], "positive": [processes]}, extraargstring).run() print "cleaning up." vsearch_output = "%s/%s.out" % (outdir, strip_ixes(input_fna)) # Choose the best hit return best_hits_from_vsearch(vsearch_output)
def serialRename(input_file, output_fasta_filepath, file_type, clip=True):
    """Takes in a fasta file and outputs a new fasta with the sequences renamed.  Renaming convention is
        x.y.z<n> for x.y.z.fasta, where n is an integer in the range [0:n] where n is the position of the
        sequence in the input_file.  Also writes a groups file, linking each sequence to its parent sample.
        e.g. The sequences in SiteX_SampleA.fasta are renamed:  SiteX_SampleA_0, SiteX_SampleA_1, \
        SiteX_SampleA_2, etc.

    :param input_file: Input fasta or fastq file.
    :param output_fasta_filepath: Filepath for the output .samples file.
    :param file_type: "fasta" or "fastq"
    :param clip: True if filenames contain file_ID#s.  Will clip the IDs before renaming to get proper \
                    sequence names.
    """
    parent_dir = os.path.dirname(output_fasta_filepath)
    seq_prefix = strip_ixes(input_file)
    samples_file = "%s/%s_renamed.samples" % (parent_dir, seq_prefix)
    name_map_file = "%s/%s_renamed.mapping" % (parent_dir, seq_prefix)

    fasta_out = BufferedSeqWriter(output_fasta_filepath, file_type)
    mapping_out = BufferedFileWriter(name_map_file)
    samples_out = BufferedFileWriter(samples_file)

    position = 0
    for seq_record in SeqIO.parse(input_file, file_type):
        position += 1
        # Record the old_name -> new_name mapping
        previous_id = seq_record.id
        seq_record.id = "%s_ID%s" % (seq_prefix, position)
        mapping_out.write("%s\t%s" % (previous_id, seq_record.id))
        # Record the sequence -> sample mapping; optionally clip dereplication counts first.
        if clip:
            parent_sample = clip_count(seq_prefix)
        else:
            parent_sample = seq_prefix
        samples_out.write("%s\t%s" % (seq_record.id, parent_sample))
        # Write the renamed sequence
        seq_record.description = ""
        fasta_out.write(seq_record)

    fasta_out.flush()
    mapping_out.flush()
    samples_out.flush()
def clean_trim_adapters_flexbar(self, input_f, adapters, adaptersrc, outdir, allowedns, processes, extraargstring):
    """Use flexbar to trim adapters and barcodes from sequences.  By default, Flexbar does not allow any 'N' \
        characters in SEQUENCE, and will toss any sequences that do contain 'N'.  To avoid this, use the -u or \
        --allowedns flags to specify the maximum number of 'N's to allow.

    :param input_f: Filepath to input file or folder.
    :param adapters: Filepath to a list of adapters.
    :param adaptersrc: Filepath to a list of reverse-complemented adapters.
    :param outdir: Filepath to the output directory.
    :param allowedns: Non-negative integer value indicating the maximum number of 'N's to tolerate in a sequence.
    :param processes: The maximum number of processes to use.
    :param extraargstring: Advanced program parameter string.
    """
    inputs = getInputFiles(input_f)
    pool = init_pool(min(len(inputs), processes))
    debugPrintInputInfo(inputs, "trim adapters from")
    # "flexbar": "flexbar -r \"%s\" -t \"%s\" -ae \"%s\" -a \"%s\"",
    printVerbose("Trimming barcodes and adapters with flexbar")
    temp_file_name_template = "%s/temp_%s"
    debarcoded_file_name_template = "%s/%s_debarcoded"

    def left_trim_runner(source_file):
        # Pass 1: trim adapters from the left, writing a "temp_*" intermediate.
        return ProgramRunner(ProgramRunnerCommands.TRIM_FLEXBAR,
                             [source_file,
                              temp_file_name_template % (outdir, strip_ixes(source_file)),
                              "LEFT", adapters, allowedns],
                             {"exists": [source_file, adapters]}, extraargstring)

    def right_trim_runner(source_file):
        # Pass 2: trim reverse-complemented adapters from the right.
        # The [5:] slice drops the "temp_" prefix from the intermediate's stripped name.
        return ProgramRunner(ProgramRunnerCommands.TRIM_FLEXBAR,
                             [source_file,
                              debarcoded_file_name_template % (outdir, strip_ixes(source_file)[5:]),
                              "RIGHT", adaptersrc, allowedns],
                             {"exists": [source_file, adaptersrc]}, extraargstring)

    run_parallel([left_trim_runner(f) for f in inputs], pool)

    temp_files = getInputFiles(outdir, "temp_*")
    debugPrintInputInfo(temp_files, "trim adapters from")
    run_parallel([right_trim_runner(f) for f in temp_files], pool)
    printVerbose("Done Trimming sequences.")

    # Move temp files out of the output directory.
    aux_files = getInputFiles(outdir, "temp_*", ignore_empty_files=False)
    bulk_move_to_dir(aux_files, makeAuxDir(outdir))
    cleanup_pool(pool)
def ungap_chewbacca(self, input_f, outdir, gapchars, file_ext, processes):
    """Ungaps a character using Bio python.

    :param input_f: Filepath to input file or folder to ungap.
    :param outdir: Filepath to the output directory where ungapped files should be written.
    :param gapchars: A string containing the gap characters to remove.
    :param file_ext: Either 'fasta' or 'fastq'.
    :param processes: The number of threads to use to ungap the input fileset.
    """
    # NOTE(review): only *.fasta inputs are gathered and outputs always get a .fasta extension,
    # regardless of file_ext -- confirm this is intentional.
    inputs = getInputFiles(input_f, "*.fasta")
    debugPrintInputInfo(inputs, "ungap.")
    pool = init_pool(min(len(inputs), processes))
    printVerbose("Removing all '%s' from sequences..." % gapchars)
    # ungap(file_to_clean, output_file_name, gap_char, file_type):
    ungap_jobs = []
    for gapped_file in inputs:
        cleaned_path = "%s/%s_cleaned.%s" % (outdir, strip_ixes(gapped_file), 'fasta')
        ungap_jobs.append(PythonRunner(remove_gap_chars,
                                       [gapped_file, cleaned_path, gapchars, file_ext],
                                       {"exists": [gapped_file]}))
    run_parallel(ungap_jobs, pool)
    printVerbose("Done removing.")
    cleanup_pool(pool)
def partition_chewbacca(self, input_f, outdir, processes, chunksize, filetype):
    """Partition a fasta/fastq file into chunks of user-defined size.

    :param input_f: Filepath to a file or folder of files to partition.
    :param outdir: The directory to write split files to.
    :param processes: The number of processes to use to partition the input fileset.
    :param chunksize: The number of sequences per file.
    :param filetype: Either 'fasta' or 'fastq'.
    """
    # Gather input files
    inputs = getInputFiles(input_f)
    debugPrintInputInfo(inputs, "partitioned")
    pool = init_pool(min(len(inputs), processes))
    printVerbose("Partitioning Files...")
    # One splitK job per input file; outputs share the input's stripped name.
    split_jobs = []
    for file_to_split in inputs:
        split_jobs.append(PythonRunner(splitK,
                                       [file_to_split, "%s/%s" % (outdir, strip_ixes(file_to_split)),
                                        chunksize, filetype],
                                       {"exists": [file_to_split]}))
    run_parallel(split_jobs, pool)
    printVerbose("Done partitioning files.")
    cleanup_pool(pool)
def rename_chewbacca(self, input_f, outdir, filetype, clip, processes):
    """Renames sequences in a fasta/fastq file as <filename>_ID0, <filename>_ID1, <filename>_ID2, etc., where
        <filename> is the name of the fasta/fastq file without any extensions or chewbacca suffixes.

    :param input_f: Filepath to an input file or folder to rename.
    :param outdir: Filepath to the output directory.
    :param filetype: Either 'fasta' or 'fastq'.
    :param clip: If True, remove dereplication counts from sequence names before renaming.
    :param processes: The maximum number of processes to use.
    """
    # Gather input files
    inputs = getInputFiles(input_f)
    debugPrintInputInfo(inputs, "rename")
    pool = init_pool(min(len(inputs), processes))
    printVerbose("Renaming sequences...")

    def make_rename_job(source_file):
        # The renamed output keeps the source file's extension.
        target = "%s/%s_renamed%s" % (outdir, strip_ixes(source_file),
                                      os.path.splitext(source_file)[1])
        return PythonRunner(serialRename, [source_file, target, filetype, clip],
                            {"exists": [source_file]})

    run_parallel([make_rename_job(source_file) for source_file in inputs], pool)
    printVerbose("Done renaming sequences...")

    # Collect the .samples files into their own directory.
    samples_dir = makeDirOrdie("%s_samples" % outdir)
    bulk_move_to_dir(getInputFiles(outdir, "*.samples", ignore_empty_files=False), samples_dir)

    # Collect the .mapping files into the aux directory.
    aux_dir = makeAuxDir(outdir)
    bulk_move_to_dir(getInputFiles(outdir, "*.mapping", ignore_empty_files=False), aux_dir)
    cleanup_pool(pool)
def ungap_chewbacca(self, input_f, outdir, gapchars, file_ext, processes):
    """Ungaps a character using Bio python.

    :param input_f: Filepath to input file or folder to ungap.
    :param outdir: Filepath to the output directory where ungapped files should be written.
    :param gapchars: A string containing the gap characters to remove.
    :param file_ext: Either 'fasta' or 'fastq'.
    :param processes: The number of threads to use to ungap the input fileset.
    """
    inputs = getInputFiles(input_f, "*.fasta")
    debugPrintInputInfo(inputs, "ungap.")
    pool = init_pool(min(len(inputs), processes))
    printVerbose("Removing all '%s' from sequences..." % gapchars)

    def make_ungap_job(source_file):
        # remove_gap_chars(file_to_clean, output_file_name, gap_char, file_type)
        # NOTE(review): output extension is hardcoded 'fasta' even though file_ext is a
        # parameter -- confirm this is intentional.
        target = "%s/%s_cleaned.%s" % (outdir, strip_ixes(source_file), 'fasta')
        return PythonRunner(remove_gap_chars, [source_file, target, gapchars, file_ext],
                            {"exists": [source_file]})

    run_parallel([make_ungap_job(source_file) for source_file in inputs], pool)
    printVerbose("Done removing.")
    cleanup_pool(pool)
def partition_chewbacca(self, input_f, outdir, processes, chunksize, filetype):
    """Partition a fasta/fastq file into chunks of user-defined size.

    :param input_f: Filepath to a file or folder of files to partition.
    :param outdir: The directory to write split files to.
    :param processes: The number of processes to use to partition the input fileset.
    :param chunksize: The number of sequences per file.
    :param filetype: Either 'fasta' or 'fastq'.
    """
    # Gather input files
    inputs = getInputFiles(input_f)
    debugPrintInputInfo(inputs, "partitioned")
    pool = init_pool(min(len(inputs), processes))
    printVerbose("Partitioning Files...")

    def make_split_job(source_file):
        # splitK(input, output_prefix, chunksize, filetype)
        return PythonRunner(splitK,
                            [source_file, "%s/%s" % (outdir, strip_ixes(source_file)),
                             chunksize, filetype],
                            {"exists": [source_file]})

    run_parallel([make_split_job(source_file) for source_file in inputs], pool)
    printVerbose("Done partitioning files.")
    cleanup_pool(pool)
def cluster_vsearch(self, input_f, outdir, groupsfile, processes, idpct, extraargstring):
    """Clusters sequences using VSEARCH.

    :param input_f: A file or folder containing fasta files to cluster.
    :param outdir: The output directory results will be written to.
    :param groupsfile: A groups file or folder containing groups files that describe the input. Note: if no
                        groups file is supplied, then entries in the fasta file are assumed to be
                        singleton sequences.
    :param idpct: Real number in the range (0,1] that specifies the minimum simmilarity threshold for
                        clustering.  e.g. .95 indicates that a candidate sequence 95% must be at least
                        95% simmilar to the seed sequence to be included in the cluster.
    :param processes: The maximum number of processes to use.
    :param extraargstring: Advanced program parameter string.
    """
    # Grab the fasta file(s) to cluster
    inputs = getInputFiles(input_f)
    debugPrintInputInfo(inputs, "clustered")
    pool = init_pool(min(len(inputs), processes))

    # RUN CLUSTERING
    # " --cluster_size %s -id %f --centroids %s --uc %s",
    run_parallel([ProgramRunner(ProgramRunnerCommands.CLUSTER_VSEARCH,
                                [input_, float(idpct),
                                 "%s/%s_seeds.fasta" % (outdir, strip_ixes(input_)),
                                 "%s/%s_clustered_uc" % (outdir, strip_ixes(input_))],
                                {"exists": [input_]}, extraargstring)
                  for input_ in inputs], pool)

    # PARSE UC FILE TO GROUPS FILE
    printVerbose("Parsing the clustered uc files to groups files")
    clustered_uc_files = getInputFiles(outdir, "*_clustered_uc")
    debugPrintInputInfo(clustered_uc_files, "parsed to groups")
    run_parallel([PythonRunner(parseUCtoGroups,
                               [input_, "%s/%s.groups" % (outdir, strip_ixes(input_))],
                               {"exists": [input_]})
                  for input_ in clustered_uc_files], pool)

    # REMOVE COUNTS FROM CLUSTERING GROUPS FILE
    printVerbose("Cleaning the .groups file from clustering")
    # Grab the current groups file and the new clustered groups file (which needs to be cleaned)
    # NOTE(review): this glob assumes parseUCtoGroups output is named "*_clustered.groups",
    # which depends on strip_ixes preserving the "_clustered" infix -- confirm.
    clustered_groups_files = getInputFiles(outdir, "*_clustered.groups")
    # Remove counts from the clustering groups files
    debugPrintInputInfo(clustered_groups_files, "cleaned")
    run_parallel([PythonRunner(removeCountsFromGroupsFile,
                               [input_, "%s/%s_uncount.groups" % (outdir, strip_ixes(input_))],
                               {"exists": [input_]})
                  for input_ in clustered_groups_files], pool)
    printVerbose("Done cleaning groups files.")

    # Collect the groups file from clustering with counts removed
    cleaned_clustered_groups_files = getInputFiles(outdir, "*_uncount.groups", ignore_empty_files=False)

    # Resolve the user specified names file if necessary
    final_groups_files = handle_groups_file_update(outdir, groupsfile, cleaned_clustered_groups_files)

    # Move the final groups file(s) to the groups dir
    groups_dir = makeDirOrdie("%s_groups_files" % outdir)
    bulk_move_to_dir(final_groups_files, groups_dir)

    # Move aux files to the aux dir
    aux_files = getInputFiles(outdir, "*", "*_seeds.fasta", ignore_empty_files=False)
    aux_dir = makeAuxDir(outdir)
    bulk_move_to_dir(aux_files, aux_dir)

    # Cleanup the pool
    cleanup_pool(pool)
def cluster_swarm(self, input_f, outdir, groupsfile, processes, extraargstring):
    """Clusters sequences using SWARM.

    :param input_f: A file or folder containing fasta files to cluster.
    :param outdir: The output directory results will be written to.
    :param groupsfile: A groups file or folder containing groups files that describe the input.
                        Note: if no groups file is supplied, then entries in the fasta file are assumed to be
                        singleton sequences.
    :param processes: The maximum number of processes to use.
    :param extraargstring: Advanced program parameter string.
    """
    # Grab the fasta file(s) to cluster
    inputs = getInputFiles(input_f)
    debugPrintInputInfo(inputs, "clustered")
    pool = init_pool(min(len(inputs), processes))

    # RUN CLUSTERING
    run_parallel([ProgramRunner(ProgramRunnerCommands.CLUSTER_SWARM,
                                [input_,
                                 "%s/%s_clustered" % (outdir, strip_ixes(input_)),
                                 "%s/%s_clustered_uc" % (outdir, strip_ixes(input_)),
                                 "%s/%s_clustered_seeds" % (outdir, strip_ixes(input_))],
                                {"exists": [input_]}, extraargstring)
                  for input_ in inputs], pool)

    # PARSE UC FILE TO GROUPS FILE
    printVerbose("Parsing the clustered uc files to groups files")
    clustered_uc_files = getInputFiles(outdir, "*_clustered_uc")
    debugPrintInputInfo(clustered_uc_files, "parsed to groups")
    run_parallel([PythonRunner(parseUCtoGroups,
                               [input_, "%s/%s.groups" % (outdir, strip_ixes(input_))],
                               {"exists": [input_]})
                  for input_ in clustered_uc_files], pool)
    printVerbose("Done parsing groups files.")

    # REMOVE COUNTS FROM CLUSTERING GROUPS FILE
    printVerbose("Cleaning the .groups file from clustering")
    # Grab the current groups file and the new clustered groups file (which needs to be cleaned)
    # NOTE(review): assumes parseUCtoGroups output matches "*_clustered.groups" -- confirm
    # against strip_ixes behavior.
    clustered_groups_files = getInputFiles(outdir, "*_clustered.groups")
    debugPrintInputInfo(clustered_groups_files, "cleaned")
    run_parallel([PythonRunner(removeCountsFromGroupsFile,
                               [input_, "%s/%s_uncount.groups" % (outdir, strip_ixes(input_))],
                               {"exists": [input_]})
                  for input_ in clustered_groups_files], pool)
    printVerbose("Done cleaning groups files.")

    printVerbose("Capitalizing sequences")
    # Convert the seeds files to uppercase (swarm writes in lowercase)
    # NOTE: 'inputs' is deliberately rebound here to the seeds files produced above.
    inputs = getInputFiles(outdir, "*_seeds")
    run_parallel([PythonRunner(capitalize_seqs,
                               [input_, "%s.fasta" % input_],
                               {"exists": [input_]})
                  for input_ in inputs], pool)
    printVerbose("Done capitalizing sequences")

    # Collect the groups file from clustering with counts removed
    cleaned_clustered_groups_files = getInputFiles(outdir, "*_uncount.groups", ignore_empty_files=False)

    # Resolve the user specified names file if necessary
    final_groups_files = handle_groups_file_update(outdir, groupsfile, cleaned_clustered_groups_files)

    # Move the final groups file(s) to the groups dir
    groups_dir = makeDirOrdie("%s_groups_files" % outdir)
    bulk_move_to_dir(final_groups_files, groups_dir)

    # Move aux files to the aux dir
    aux_files = getInputFiles(outdir, "*", "*_seeds.fasta", ignore_empty_files=False)
    aux_dir = makeAuxDir(outdir)
    bulk_move_to_dir(aux_files, aux_dir)

    # Cleanup the pool
    cleanup_pool(pool)
def dereplicate_vsearch(self, input_f, outdir, groupsfile, processes, stripcounts, extraargstring):
    """Dereplicates with vsearch.

    :param input_f: Filepath to the file or folder of files to dereplicate.
    :param outdir: Filepath to the output directory.
    :param groupsfile: A groups file to use as a reference for replicant counting.  If no groups file is
                        provided, input sequences are considered singletons (regardless of their
                        name-annotated dereplication count).
    :param processes: The number of processes to use to dereplicate the fileset.
    :param stripcounts: If True, strips the trailing dereplication counts from a file before dereplication.
    :param extraargstring: Advanced program parameter string.
    """
    inputs = getInputFiles(input_f)
    pool = init_pool(min(len(inputs), processes))

    # REMOVES COUNTS FROM SEQUENCE NAMES IN ORDER TO CLUSTER PROPERLY
    # strip counts if we need to.
    if stripcounts:
        printVerbose("Removing counts from sequence names...")
        debugPrintInputInfo(inputs, "renamed")
        # Fix: the "exists" validator takes a list of filepaths; the original passed
        # a bare string, which other call sites in this module never do.
        run_parallel([PythonRunner(removeCountsFromFastFile,
                                   [input_, "%s/%s_uncount.fasta" % (outdir, strip_ixes(input_)), 'fasta'],
                                   {"exists": [input_]})
                      for input_ in inputs], pool)
        printVerbose("Done removing counts.")

        # Grab the cleaned files as input for the next step
        inputs = getInputFiles(outdir, "*_uncount.fasta")

    # DEREPLICATE
    debugPrintInputInfo(inputs, "dereplicated")
    printVerbose("Dereplicating...")
    run_parallel([ProgramRunner(ProgramRunnerCommands.DEREP_VSEARCH,
                                [processes, input_,
                                 "%s/%s_derep.fasta" % (outdir, strip_ixes(input_)),
                                 "%s/%s_uc.out" % (outdir, strip_ixes(input_))],
                                {"exists": [input_], "positive": [processes]},
                                extraargstring)
                  for input_ in inputs], pool)
    printVerbose("Done dereplicating")

    # LOG DEREPLICATED SEQUENCES INTO A .GROUPS FILE
    # generates a .groups file named _uc_parsed.out
    # python parseUCtoGroups.py uc.out uc_parsed.out
    input_ucs = getInputFiles(outdir, "*_uc.out")
    printVerbose("Generating a groups file from dereplication.")
    # Fix: debug-print the uc files actually being parsed (the original printed the fastas).
    debugPrintInputInfo(input_ucs, "parsed (into a .groups file)")
    run_parallel([PythonRunner(parseUCtoGroups,
                               [input_, "%s/%s_derep.groups" % (outdir, strip_ixes(input_))],
                               {"exists": [input_]})
                  for input_ in input_ucs], pool)

    most_recent_groups_files = getInputFiles(outdir, "*_derep.groups", ignore_empty_files=False)

    # UPDATE THE MOST CURRENT GROUPS FILES WITH DEREPLICATION COUNTS
    if groupsfile is not None:
        # Grab the old groups file and the dereplicated groups file
        old_groups_files = getInputFiles(groupsfile)
        derep_groups_files = getInputFiles(outdir, "*_derep.groups")

        printVerbose("Updating .groups files with dereplicated data")
        printVerbose("%d Reference (old) groups files to be read:" % len(old_groups_files))
        printVerbose(str(old_groups_files))
        printVerbose("%d Dereplicated (new) groups files to be read:" % len(derep_groups_files))
        printVerbose(str(derep_groups_files))
        update_groups(old_groups_files, derep_groups_files, outdir, "dereplicated")
        most_recent_groups_files = getInputFiles(outdir, "dereplicated*", ignore_empty_files=False)
        printVerbose("Done updating .groups files.")

    # Every input fasta must have a matching groups file before pairing them below.
    if len(inputs) != len(most_recent_groups_files):
        # Fix: restored the missing space in "of groups".
        print("Error: Number of input fastas (%d) is not equal to the number of groups files (%d)."
              % (len(inputs), len(most_recent_groups_files)))
        exit()
    fasta_groups_pairs = zip(inputs, most_recent_groups_files)

    # ADD COUNT TO SEQUENCE NAMES AND SORT BY COUNT
    # python renameWithReplicantCounts.py
    #               8_macse_out/MACSEOUT_MERGED.fasta uc_parsed.out dereplicated_renamed.fasta
    printVerbose("Adding dereplication data to unique fasta")
    run_parallel([PythonRunner(renameWithReplicantCounts,
                               [fasta, groups,
                                "%s/%s_counts.fasta" % (outdir, strip_ixes(fasta)), 'fasta'],
                               {"exists": [fasta, groups]})
                  for fasta, groups in fasta_groups_pairs], pool)
    printVerbose("Done adding data")

    aux_dir = makeAuxDir(outdir)
    groups_dir = makeDirOrdie("%s_groups_files" % outdir)
    bulk_move_to_dir(most_recent_groups_files, groups_dir)
    aux_files = getInputFiles(outdir, '*', "*_counts.fasta", ignore_empty_files=False)
    bulk_move_to_dir(aux_files, aux_dir)
    cleanup_pool(pool)
def cluster_crop(self, input_f, outdir, groupsfile, processes, blocksize, clustpct, maxmcmc, maxsm, rare,
                 blockcount, extraargstring):
    """Clusters sequences using CROP.

    :param input_f: Filepath to the input fasta file to cluster.
    :param outdir: Filepath to the output directory.
    :param groupsfile: Filepath to the groups file to use as a reference for dereplication counting.
    :param blocksize: Size of blocks to be used for all rounds (if -b is specified, then -z will not affect
                        the first round.  For data set with different average sequence length, this
                        parameter should be tuned such that it won't take too long for each block to do
                        pairwise alignment.  Hint for choosing z: z*L<150,000, where L is the average
                        length of the sequences.
    :param clustpct: The minimum similarity threshold for clustering.  Either 'g' for 95% or 's' for 97%.
    :param maxmcmc: This parameter specifies the number of iterations of MCMC.  Default value is 2000.
                        Increase this value to enhance accuracy (recommended value is at least 10*block size).
    :param maxsm: This parameter specifies the maximum number of 'split and merge' process to run.  Max is 20.
    :param rare: The maximum cluster size allowed to be classified as 'rare'.  Clusters are defined as
                        either 'abundant' or 'rare'.  'Abundant' clusters will be clustered first, then
                        the 'rare' clusters are mapped to the 'abundant' clusters.  Finally, 'rare'
                        clusters which cannot be mapped will be clustered separately.  e.g. If r=5, the
                        clusters with size <=5 will be considered 'rare' in above procedure.  and r=0
                        will yield the best accuracy.  If you believe your data is not too diverse to be
                        handled, then r=0 will be the best choice.
    :param blockcount: The size of blocks in the first round of clustering.  Hint of choosing -b: Each block
                        in the first round should contain about 50 sequences.  i.e. b=N/50, where N is the
                        number of input sequences.  Default: # input sequences / z.
    :param processes: The maximum number of processes to use.
    :param extraargstring: Advanced program parameter string.
    """
    # Grab the fasta file(s) to cluster
    inputs = getInputFiles(input_f)
    debugPrintInputInfo(inputs, "clustered")
    pool = init_pool(min(len(inputs), processes))

    # RUN CLUSTERING
    # crop -i %s -o %s -z %s -c %s -e %s -m %s%s
    run_parallel([ProgramRunner(ProgramRunnerCommands.CLUSTER_CROP,
                                [input_, "%s/%s" % (outdir, strip_ixes(input_)),
                                 blocksize, clustpct, maxmcmc, maxsm, rare, blockcount],
                                {"exists": [input_]}, extraargstring)
                  for input_ in inputs], pool)

    # CLEAN THE OUTPUT GROUPS FILE
    printVerbose("Parsing the groups file from clustering")
    clustered_groups_files = getInputFiles(outdir, "*.cluster.list")
    debugPrintInputInfo(clustered_groups_files, "converted to groups files")
    run_parallel([PythonRunner(parseCROPoutToGroups,
                               [input_, "%s/%s_uncount.groups" % (outdir, strip_ixes(input_))],
                               {"exists": [input_]})
                  for input_ in clustered_groups_files], pool)
    printVerbose("Done parsing groups file.")

    # Collect the groups file from clustering with counts removed
    cleaned_clustered_groups_files = getInputFiles(outdir, "*_uncount.groups", ignore_empty_files=False)

    # Resolve the user specified names file if necessary
    final_groups_files = handle_groups_file_update(outdir, groupsfile, cleaned_clustered_groups_files)

    # GATHER AUX FILES
    input_dir = getDirName(input_f)
    # NOTE(review): aux_files aliases cleaned_clustered_groups_files, so the += calls
    # below also grow that list in place.  Harmless here (it is not reused afterwards),
    # but copy the list if that ever changes.
    aux_files = cleaned_clustered_groups_files
    aux_files += getInputFiles(input_dir, "*.unique", ignore_empty_files=False)
    aux_files += getInputFiles(input_dir, "*.unique.list", ignore_empty_files=False)
    aux_files += getInputFiles(input_dir, "*.unique.TempCenters.Rare", ignore_empty_files=False)
    aux_files += getInputFiles(outdir, "*.cluster", ignore_empty_files=False)
    aux_files += getInputFiles(outdir, "*.cluster.list", ignore_empty_files=False)
    aux_files += getInputFiles(outdir, "*.log", ignore_empty_files=False)
    # CROP drops LikelihoodRatio.txt in the current working directory.
    aux_files += getInputFiles(".", "LikelihoodRatio.txt", ignore_empty_files=False)

    # Move the final groups file(s) to the groups dir
    groups_dir = makeDirOrdie("%s_groups_files" % outdir)
    bulk_move_to_dir(final_groups_files, groups_dir)

    # Move aux files to the aux dir
    aux_dir = makeAuxDir(outdir)
    bulk_move_to_dir(aux_files, aux_dir)

    # Cleanup the pool
    cleanup_pool(pool)
def dereplicate_vsearch(self, input_f, outdir, groupsfile, processes, stripcounts, extraargstring):
    """Dereplicates with vsearch.

    NOTE(review): this method is defined twice in this module; this later definition
    shadows the earlier identical one -- one copy should be deleted.

    :param input_f: Filepath to the file or folder of files to dereplicate.
    :param outdir: Filepath to the output directory.
    :param groupsfile: A groups file to use as a reference for replicant counting.  If no groups file is
                        provided, input sequences are considered singletons (regardless of their
                        name-annotated dereplication count).
    :param processes: The number of processes to use to dereplicate the fileset.
    :param stripcounts: If True, strips the trailing dereplication counts from a file before dereplication.
    :param extraargstring: Advanced program parameter string.
    """
    inputs = getInputFiles(input_f)
    pool = init_pool(min(len(inputs), processes))

    # REMOVES COUNTS FROM SEQUENCE NAMES IN ORDER TO CLUSTER PROPERLY
    # strip counts if we need to.
    if stripcounts:
        printVerbose("Removing counts from sequence names...")
        debugPrintInputInfo(inputs, "renamed")
        # Fix: the "exists" validator takes a list of filepaths; the original passed
        # a bare string, which other call sites in this module never do.
        run_parallel([PythonRunner(removeCountsFromFastFile,
                                   [input_, "%s/%s_uncount.fasta" % (outdir, strip_ixes(input_)), 'fasta'],
                                   {"exists": [input_]})
                      for input_ in inputs], pool)
        printVerbose("Done removing counts.")

        # Grab the cleaned files as input for the next step
        inputs = getInputFiles(outdir, "*_uncount.fasta")

    # DEREPLICATE
    debugPrintInputInfo(inputs, "dereplicated")
    printVerbose("Dereplicating...")
    run_parallel([ProgramRunner(ProgramRunnerCommands.DEREP_VSEARCH,
                                [processes, input_,
                                 "%s/%s_derep.fasta" % (outdir, strip_ixes(input_)),
                                 "%s/%s_uc.out" % (outdir, strip_ixes(input_))],
                                {"exists": [input_], "positive": [processes]},
                                extraargstring)
                  for input_ in inputs], pool)
    printVerbose("Done dereplicating")

    # LOG DEREPLICATED SEQUENCES INTO A .GROUPS FILE
    # generates a .groups file named _uc_parsed.out
    # python parseUCtoGroups.py uc.out uc_parsed.out
    input_ucs = getInputFiles(outdir, "*_uc.out")
    printVerbose("Generating a groups file from dereplication.")
    # Fix: debug-print the uc files actually being parsed (the original printed the fastas).
    debugPrintInputInfo(input_ucs, "parsed (into a .groups file)")
    run_parallel([PythonRunner(parseUCtoGroups,
                               [input_, "%s/%s_derep.groups" % (outdir, strip_ixes(input_))],
                               {"exists": [input_]})
                  for input_ in input_ucs], pool)

    most_recent_groups_files = getInputFiles(outdir, "*_derep.groups", ignore_empty_files=False)

    # UPDATE THE MOST CURRENT GROUPS FILES WITH DEREPLICATION COUNTS
    if groupsfile is not None:
        # Grab the old groups file and the dereplicated groups file
        old_groups_files = getInputFiles(groupsfile)
        derep_groups_files = getInputFiles(outdir, "*_derep.groups")

        printVerbose("Updating .groups files with dereplicated data")
        printVerbose("%d Reference (old) groups files to be read:" % len(old_groups_files))
        printVerbose(str(old_groups_files))
        printVerbose("%d Dereplicated (new) groups files to be read:" % len(derep_groups_files))
        printVerbose(str(derep_groups_files))
        update_groups(old_groups_files, derep_groups_files, outdir, "dereplicated")
        most_recent_groups_files = getInputFiles(outdir, "dereplicated*", ignore_empty_files=False)
        printVerbose("Done updating .groups files.")

    # Every input fasta must have a matching groups file before pairing them below.
    if len(inputs) != len(most_recent_groups_files):
        # Fix: restored the missing space in "of groups".
        print("Error: Number of input fastas (%d) is not equal to the number of groups files (%d)."
              % (len(inputs), len(most_recent_groups_files)))
        exit()
    fasta_groups_pairs = zip(inputs, most_recent_groups_files)

    # ADD COUNT TO SEQUENCE NAMES AND SORT BY COUNT
    # python renameWithReplicantCounts.py
    #               8_macse_out/MACSEOUT_MERGED.fasta uc_parsed.out dereplicated_renamed.fasta
    printVerbose("Adding dereplication data to unique fasta")
    run_parallel([PythonRunner(renameWithReplicantCounts,
                               [fasta, groups,
                                "%s/%s_counts.fasta" % (outdir, strip_ixes(fasta)), 'fasta'],
                               {"exists": [fasta, groups]})
                  for fasta, groups in fasta_groups_pairs], pool)
    printVerbose("Done adding data")

    aux_dir = makeAuxDir(outdir)
    groups_dir = makeDirOrdie("%s_groups_files" % outdir)
    bulk_move_to_dir(most_recent_groups_files, groups_dir)
    aux_files = getInputFiles(outdir, '*', "*_counts.fasta", ignore_empty_files=False)
    bulk_move_to_dir(aux_files, aux_dir)
    cleanup_pool(pool)
def execute_program(self):
    """Render an OTU heatmap (.png) from the parsed commandline arguments."""
    args = self.args
    # The first matching input file names the output image.
    first_input = getInputFiles(args.input_f)[0]
    png_path = "%s/%s.png" % (args.outdir, strip_ixes(first_input))
    frame = subset_dataframe(args.input_f, args)
    self.visualize_otu_heatmap(frame, png_path)
def execute_program(self):
    """Render an OTU sample-composition plot (.png) from the parsed commandline arguments."""
    args = self.args
    # The first matching input file names the output image.
    first_input = getInputFiles(args.input_f)[0]
    png_path = "%s/%s.png" % (args.outdir, strip_ixes(first_input))
    frame = subset_dataframe(args.input_f, args)
    self.visualize_otu_sample_comp(frame, png_path)
def cluster_crop(self, input_f, outdir, groupsfile, processes, blocksize, clustpct, maxmcmc, maxsm, rare,
                 blockcount, extraargstring):
    """Clusters sequences using CROP.

    NOTE(review): this method is defined twice in this module; this later definition
    shadows the earlier identical one -- one copy should be deleted.

    :param input_f: Filepath to the input fasta file to cluster.
    :param outdir: Filepath to the output directory.
    :param groupsfile: Filepath to the groups file to use as a reference for dereplication counting.
    :param blocksize: Size of blocks to be used for all rounds (if -b is specified, then -z will not affect
                        the first round.  For data set with different average sequence length, this
                        parameter should be tuned such that it won't take too long for each block to do
                        pairwise alignment.  Hint for choosing z: z*L<150,000, where L is the average
                        length of the sequences.
    :param clustpct: The minimum similarity threshold for clustering.  Either 'g' for 95% or 's' for 97%.
    :param maxmcmc: This parameter specifies the number of iterations of MCMC.  Default value is 2000.
                        Increase this value to enhance accuracy (recommended value is at least 10*block size).
    :param maxsm: This parameter specifies the maximum number of 'split and merge' process to run.  Max is 20.
    :param rare: The maximum cluster size allowed to be classified as 'rare'.  Clusters are defined as
                        either 'abundant' or 'rare'.  'Abundant' clusters will be clustered first, then
                        the 'rare' clusters are mapped to the 'abundant' clusters.  Finally, 'rare'
                        clusters which cannot be mapped will be clustered separately.  e.g. If r=5, the
                        clusters with size <=5 will be considered 'rare' in above procedure.  and r=0
                        will yield the best accuracy.  If you believe your data is not too diverse to be
                        handled, then r=0 will be the best choice.
    :param blockcount: The size of blocks in the first round of clustering.  Hint of choosing -b: Each block
                        in the first round should contain about 50 sequences.  i.e. b=N/50, where N is the
                        number of input sequences.  Default: # input sequences / z.
    :param processes: The maximum number of processes to use.
    :param extraargstring: Advanced program parameter string.
    """
    # Grab the fasta file(s) to cluster
    inputs = getInputFiles(input_f)
    debugPrintInputInfo(inputs, "clustered")
    pool = init_pool(min(len(inputs), processes))

    # RUN CLUSTERING
    # crop -i %s -o %s -z %s -c %s -e %s -m %s%s
    run_parallel([ProgramRunner(ProgramRunnerCommands.CLUSTER_CROP,
                                [input_, "%s/%s" % (outdir, strip_ixes(input_)),
                                 blocksize, clustpct, maxmcmc, maxsm, rare, blockcount],
                                {"exists": [input_]}, extraargstring)
                  for input_ in inputs], pool)

    # CLEAN THE OUTPUT GROUPS FILE
    printVerbose("Parsing the groups file from clustering")
    clustered_groups_files = getInputFiles(outdir, "*.cluster.list")
    debugPrintInputInfo(clustered_groups_files, "converted to groups files")
    run_parallel([PythonRunner(parseCROPoutToGroups,
                               [input_, "%s/%s_uncount.groups" % (outdir, strip_ixes(input_))],
                               {"exists": [input_]})
                  for input_ in clustered_groups_files], pool)
    printVerbose("Done parsing groups file.")

    # Collect the groups file from clustering with counts removed
    cleaned_clustered_groups_files = getInputFiles(outdir, "*_uncount.groups", ignore_empty_files=False)

    # Resolve the user specified names file if necessary
    final_groups_files = handle_groups_file_update(outdir, groupsfile, cleaned_clustered_groups_files)

    # GATHER AUX FILES
    input_dir = getDirName(input_f)
    # NOTE(review): aux_files aliases cleaned_clustered_groups_files, so the += calls
    # below also grow that list in place.  Harmless here (it is not reused afterwards),
    # but copy the list if that ever changes.
    aux_files = cleaned_clustered_groups_files
    aux_files += getInputFiles(input_dir, "*.unique", ignore_empty_files=False)
    aux_files += getInputFiles(input_dir, "*.unique.list", ignore_empty_files=False)
    aux_files += getInputFiles(input_dir, "*.unique.TempCenters.Rare", ignore_empty_files=False)
    aux_files += getInputFiles(outdir, "*.cluster", ignore_empty_files=False)
    aux_files += getInputFiles(outdir, "*.cluster.list", ignore_empty_files=False)
    aux_files += getInputFiles(outdir, "*.log", ignore_empty_files=False)
    # CROP drops LikelihoodRatio.txt in the current working directory.
    aux_files += getInputFiles(".", "LikelihoodRatio.txt", ignore_empty_files=False)

    # Move the final groups file(s) to the groups dir
    groups_dir = makeDirOrdie("%s_groups_files" % outdir)
    bulk_move_to_dir(final_groups_files, groups_dir)

    # Move aux files to the aux dir
    aux_dir = makeAuxDir(outdir)
    bulk_move_to_dir(aux_files, aux_dir)

    # Cleanup the pool
    cleanup_pool(pool)