def query_vsearch(inputs, outdir, simmilarity, processes, aln_user_string, extraargstring, pool):
    """Runs a VSEARCH alignment on pairs of query/reference sequences.

    :param inputs: A list of pairs of (filepaths to) query_fastas and the refrence fastas to compare them to.
    :param outdir: Filepath to the directory where the alignment result should be written.
    :param simmilarity: The minimum simmilarity percentage (between reference and query sequences), \
                            as a decimal between 0 and 1), required for a positive match.
    :param processes: The number of processes to use in the identification process.
    :param aln_user_string: An optional string of commandline parameters passed to the VSEARCH program.
    :param extraargstring: Advanced program parameter string.
    :param pool: A fully initalized multiprocessing.Pool object.
    """
    printVerbose("Aligning against reference sequences...")
    # Equivalent command:
    # vsearch --usearch_global <query> --db <ref> --id <simmilarity> \
    #     --userfields query+target+id+alnlen+qcov --userout <out> --alnout <alnout>
    alignment_jobs = []
    for query_fasta, ref_fasta in inputs:
        # One .out/.alnout pair per query fasta, named after the query.
        out_prefix = "%s/%s" % (outdir, strip_ixes(query_fasta))
        alignment_jobs.append(
            ProgramRunner(ProgramRunnerCommands.ALIGN_VSEARCH,
                          [processes, query_fasta, ref_fasta, simmilarity,
                           "%s.out" % out_prefix, "%s.alnout" % out_prefix,
                           aln_user_string],
                          {"exists": [query_fasta, ref_fasta], "positive": [processes]},
                          extraargstring))
    run_parallel(alignment_jobs, pool)
    printVerbose("Done aligning.")
    return
def query_fasta_vsearch(self, input_f, referencefasta, taxinfo, outdir, processes, simmilarity, coverage,
                        extraargstring):
    """Compare reference sequences to the fasta-formatted query sequences, using global pairwise alignment.

    :param input_f: Filepath to a file or folder of files to identify.
    :param outdir: Filepath to the output directory.
    :param referencefasta: Filepath to a file or folder of files to use as a reference.
    :param taxinfo: Filepath to a file containing taxonomic info correlated with the referencefasta.
    :param simmilarity: The % simmilarity between a query and reference sequence required for positive \
                            identification.
    :param coverage: The % coverage of matching regions between a query and reference sequence required \
                            for positive identification.
    :param processes: The number of processes to use in the identification process.
    :param extraargstring: Advanced program parameter string.
    """
    # Equivalent command:
    # vsearch --usearch_global %s seeds.pick.fasta --db ../data/BiocodePASSED_SAP.txt --id 0.9 \
    #     --userfields query+target+id+alnlen+qcov --userout %sout --alnout %s alnout.txt
    # expecting a fasta to annotate
    query_fastas = getInputFiles(input_f)
    debugPrintInputInfo(query_fastas, "queried for identification.")
    ref_fastas = getInputFiles(referencefasta)
    debugPrintInputInfo(ref_fastas, "referenced for sequence identification.")
    tax_info_files = getInputFiles(taxinfo)
    debugPrintInputInfo(tax_info_files, "referenced for taxanomic names.")

    # make sure the number of reference fasta files is the same as the number of tax_info files
    # (ref_fastas and tax_info_files are zipped positionally below, so the counts must match)
    if len(tax_info_files) != len(ref_fastas):
        print "Error: The number of reference fastas and taxonomic mapping files is not the same. There must be \
            one taxonomic mapping file for each reference fasta."
        return
    ref_data_pairs = zip(ref_fastas, tax_info_files)
    # Every query is aligned against every reference fasta.
    # NOTE(review): query_vsearch writes one "<query>.out" file per query fasta; with more than one
    # reference fasta, later alignments appear to reuse the same output name -- confirm against
    # query_vsearch's output naming.
    inputs = [x for x in product(query_fastas, ref_fastas)]
    aln_user_string = ""
    pool = init_pool(min(len(inputs), processes))

    # VSEARCH ALIGNMENT
    query_vsearch(inputs, outdir, simmilarity, processes, aln_user_string, extraargstring, pool)

    printVerbose("Parsing output...")
    # Parse the alignment results and put those that pass the criterion (97 similarity, 85 coverage) in
    # parsed_BIOCODE.out.  Parameters can be changed and this command can be rerun as many times as necessary
    #
    # parseVSearchOutputAgainstFasta(vsearch_outfile, taxInfo, output_file, min_simmilarity, min_coverage):
    inputs = [x for x in product(query_fastas, ref_data_pairs)]
    debugPrintInputInfo(inputs, "queryied against paired refereces.")
    run_parallel([PythonRunner(parseVSearchOutputAgainstFasta,
                               ["%s/%s.out" % (outdir, strip_ixes(query)), tax_info,
                                "%s/%s.tax" % (outdir, strip_ixes(query)), simmilarity, coverage],
                               {"exists": [query, ref_fasta, tax_info]})
                  for query, (ref_fasta, tax_info) in inputs], pool)
    printVerbose("\nDone parsing...")

    # Gather and move auxillary files (.tax files are the primary output and stay in outdir)
    aux_files = getInputFiles(outdir, "*", "*.tax", ignore_empty_files=False)
    bulk_move_to_dir(aux_files, makeAuxDir(outdir))
    cleanup_pool(pool)
def query_fasta_db_vsearch(self, input_f, outdir, ref_fasta, ref_db, simmilarity, coverage, processes,
                           extraargstring):
    """Compare reference sequences to the fasta-formatted query sequences, using global pairwise alignment.

    :param input_f: Filepath to a file or folder of files to identify.
    :param outdir: Filepath to the output directory.
    :param ref_fasta: Filepath to the curated fasta file to use as a reference.
    :param ref_db: Filepath to the curated database file to use as a reference.
    :param simmilarity: Minimum % simmilarity (decimal between 0 and 1) between query and reference \
                            sequences required for positive identification.
    :param coverage: Minimum % coverage (decimal between 0 and 1) between query and reference \
                            sequences required for positive identification.
    :param processes: The number of processes to use in the identification process.
    :param extraargstring: Advanced program parameter string.
    """
    # blast6 output format http://www.drive5.com/usearch/manual/blast6out.html
    aln_user_string = "--userfields query+target+id+alnlen+qcov"
    # coi_fasta = os.path.expanduser("~/ARMS/refs/COI.fasta")
    # ncbi_db_string = os.path.expanduser("~/ARMS/refs/ncbi.db")
    coi_fasta = ref_fasta
    ncbi_db_string = ref_db
    query_fastas = getInputFiles(input_f)
    debugPrintInputInfo(query_fastas, "queried against the DB.")
    # Pair every query fasta with the single curated reference fasta.
    inputs = [x for x in product(query_fastas, [coi_fasta])]
    pool = init_pool(min(len(query_fastas), processes))

    # VSEARCH ALIGNMENT
    query_vsearch(inputs, outdir, simmilarity, processes, aln_user_string, extraargstring, pool)

    printVerbose("Parsing output...")
    # Parse the alignment results and put those that pass the criterion (97 similarity, 85 coverage) in
    # parsed_BIOCODE.out.  Parameters can be changed and this command can be rerun as many times as necessary
    #
    # parseVSearchOutputAgainstNCBI(vsearch_out, ncbi_db, min_coverage, min_similarity) > parsed_nt.out
    # BUGFIX: the validation key was misspelled "exits" (instead of "exists"), so the
    # file-existence check on the query and database was silently skipped.
    run_parallel([PythonRunner(parseVSearchOutputAgainstNCBI,
                               ["%s/%s.out" % (outdir, strip_ixes(query)), ncbi_db_string,
                                "%s/%s.tax" % (outdir, strip_ixes(query)), simmilarity, coverage],
                               {"exists": [query, ncbi_db_string]})
                  for query in query_fastas], pool)
    printVerbose("Done processing.")

    # Gather and move auxillary files (.tax files are the primary output and stay in outdir)
    aux_files = getInputFiles(outdir, "*", "*.tax", ignore_empty_files=False)
    bulk_move_to_dir(aux_files, makeAuxDir(outdir))
    cleanup_pool(pool)
def clean_trim_adapters_flexbar(self, input_f, adapters, adaptersrc, outdir, allowedns, processes, extraargstring):
    """Use flexbar to trim adapters and barcodes from sequences.  By default, Flexbar does not allow any 'N' \
        characters in SEQUENCE, and will toss any sequences that do contain 'N'.  To avoid this, use the -u or \
        --allowedns flags to specify the maximum number of 'N's to allow.

    :param input_f: Filepath to input file or folder.
    :param adapters: Filepath to a list of adapters.
    :param adaptersrc: Filepath to a list of reverse-complemented adapters.
    :param outdir: Filepath to the output directory.
    :param allowedns: Non-negative integer value indicating the maximum number of 'N's to tolerate in a sequence.
    :param processes: The maximum number of processes to use.
    :param extraargstring: Advanced program parameter string.
    """
    inputs = getInputFiles(input_f)
    pool = init_pool(min(len(inputs), processes))
    debugPrintInputInfo(inputs, "trim adapters from")
    # "flexbar": "flexbar -r \"%s\" -t \"%s\" -ae \"%s\" -a \"%s\"",
    printVerbose("Trimming barcodes and adapters with flexbar")
    temp_file_name_template = "%s/temp_%s"
    debarcoded_file_name_template = "%s/%s_debarcoded"

    # Pass 1: trim adapters from the left, writing "temp_*" intermediates.
    left_trim_jobs = []
    for fastx_file in inputs:
        left_trim_jobs.append(
            ProgramRunner(ProgramRunnerCommands.TRIM_FLEXBAR,
                          [fastx_file, temp_file_name_template % (outdir, strip_ixes(fastx_file)),
                           "LEFT", adapters, allowedns],
                          {"exists": [fastx_file, adapters]}, extraargstring))
    run_parallel(left_trim_jobs, pool)

    temp_files = getInputFiles(outdir, "temp_*")
    debugPrintInputInfo(temp_files, "trim adapters from")

    # Pass 2: trim the reverse-complemented adapters from the right.
    # The [5:] slice drops the "temp_" prefix from the intermediate's stripped name.
    right_trim_jobs = []
    for temp_file in temp_files:
        right_trim_jobs.append(
            ProgramRunner(ProgramRunnerCommands.TRIM_FLEXBAR,
                          [temp_file, debarcoded_file_name_template % (outdir, strip_ixes(temp_file)[5:]),
                           "RIGHT", adaptersrc, allowedns],
                          {"exists": [temp_file, adaptersrc]}, extraargstring))
    run_parallel(right_trim_jobs, pool)
    printVerbose("Done Trimming sequences.")

    # Move temp files out of the output directory.
    aux_files = getInputFiles(outdir, "temp_*", ignore_empty_files=False)
    bulk_move_to_dir(aux_files, makeAuxDir(outdir))
    cleanup_pool(pool)
def rename_chewbacca(self, input_f, outdir, filetype, clip, processes):
    """Renames sequences in a fasta/fastq file as <filename>_ID0, <filename>_ID1, <filename>_ID2, etc., where
        <filename> is the name of the fasta/fastq file without any extensions or chewbacca suffixes.

    :param input_f: Filepath to an input file or folder to rename.
    :param outdir: Filepath to the output directory.
    :param filetype: Either 'fasta' or 'fastq'.
    :param clip: If True, remove dereplication counts from sequence names before renaming.
    :param processes: The maximum number of processes to use.
    """
    # Gather input files
    inputs = getInputFiles(input_f)
    debugPrintInputInfo(inputs, "rename")
    pool = init_pool(min(len(inputs), processes))
    printVerbose("Renaming sequences...")
    # Build one serialRename job per input file; the output keeps the input's extension.
    rename_jobs = []
    for file_to_rename in inputs:
        renamed_path = "%s/%s_renamed%s" % (outdir, strip_ixes(file_to_rename),
                                            os.path.splitext(file_to_rename)[1])
        rename_jobs.append(PythonRunner(serialRename,
                                        [file_to_rename, renamed_path, filetype, clip],
                                        {"exists": [file_to_rename]}))
    run_parallel(rename_jobs, pool)
    printVerbose("Done renaming sequences...")

    # Collect the .samples files into their own directory.
    samples_dir = makeDirOrdie("%s_samples" % outdir)
    samples_files = getInputFiles(outdir, "*.samples", ignore_empty_files=False)
    bulk_move_to_dir(samples_files, samples_dir)

    # Collect the .mapping files into the aux directory.
    aux_dir = makeAuxDir(outdir)
    aux_files = getInputFiles(outdir, "*.mapping", ignore_empty_files=False)
    bulk_move_to_dir(aux_files, aux_dir)
    cleanup_pool(pool)
def clean_quality_trimmomatic(self, input_f, outdir, window_size, quality, min_len, processes, extraargstring):
    """Uses a sliding window to identify and trim away areas of low quality.

    :param input_f: Filepath to input file or folder.
    :param outdir: Filepath to the output directory.
    :param window_size: Width of the sliding window. (Number of consecutive base-pairs to average for quality \
                            analysis).
    :param quality: Minimum quality allowed.  Sections with lower average quality than this will be dropped.
    :param min_len: Minimum allowed length for TRIMMED sequences.  (i.e. if a sequence is too short after \
                            trimming, its dropped.)
    :param processes: Number of processes to use to clean the input fileset.
    :param extraargstring: Advanced program parameter string.
    """
    # Command template:
    # "trimomatic": "java -jar ~/ARMS/programs/Trimmomatic-0.33/trimmomatic-0.33.jar SE \
    #       -%phred %input %output SLIDINGWINDOW:%windowsize:%minAvgQuality MINLEN:%minLen"
    inputs = getInputFiles(input_f)
    debugPrintInputInfo(inputs, "clean")
    pool = init_pool(min(len(inputs), processes))
    printVerbose("Cleaning sequences with Trimmomatic...")
    cleaning_jobs = []
    for fastq_file in inputs:
        cleaned_path = "%s/%s_cleaned.fastq" % (outdir, strip_ixes(fastq_file))
        cleaning_jobs.append(
            ProgramRunner(ProgramRunnerCommands.CLEAN_TRIMMOMATIC,
                          [fastq_file, cleaned_path, window_size, quality, min_len],
                          {"exists": [outdir, fastq_file],
                           "positive": [window_size, quality, min_len]},
                          extraargstring))
    run_parallel(cleaning_jobs, pool)
    printVerbose("Done cleaning sequences.")
    cleanup_pool(pool)
def clean_quality_trimmomatic(self, input_f, outdir, window_size, quality, min_len, processes, extraargstring):
    """Uses a sliding window to identify and trim away areas of low quality.

    :param input_f: Filepath to input file or folder.
    :param outdir: Filepath to the output directory.
    :param window_size: Width of the sliding window. (Number of consecutive base-pairs to average for quality \
                            analysis).
    :param quality: Minimum quality allowed.  Sections with lower average quality than this will be dropped.
    :param min_len: Minimum allowed length for TRIMMED sequences.  (i.e. if a sequence is too short after \
                            trimming, its dropped.)
    :param processes: Number of processes to use to clean the input fileset.
    :param extraargstring: Advanced program parameter string.
    """
    # Command template:
    # "trimomatic": "java -jar ~/ARMS/programs/Trimmomatic-0.33/trimmomatic-0.33.jar SE \
    #       -%phred %input %output SLIDINGWINDOW:%windowsize:%minAvgQuality MINLEN:%minLen"
    inputs = getInputFiles(input_f)
    debugPrintInputInfo(inputs, "clean")
    pool = init_pool(min(len(inputs), processes))
    printVerbose("Cleaning sequences with Trimmomatic...")

    def make_runner(fastq_file):
        # One Trimmomatic invocation per fastq; output is "<stripped-name>_cleaned.fastq".
        return ProgramRunner(ProgramRunnerCommands.CLEAN_TRIMMOMATIC,
                             [fastq_file,
                              "%s/%s_cleaned.fastq" % (outdir, strip_ixes(fastq_file)),
                              window_size, quality, min_len],
                             {"exists": [outdir, fastq_file],
                              "positive": [window_size, quality, min_len]},
                             extraargstring)

    run_parallel([make_runner(fastq_file) for fastq_file in inputs], pool)
    printVerbose("Done cleaning sequences.")
    cleanup_pool(pool)
def preclean_bayeshammer(self, input_f, input_r, outdir, processes, bayesthreads, extraargstring):
    """Precleans (error-corrects) paired reads from two (left and right) fastq files/directories
        using Spades' Bayes Hammer module.

    :param input_f: File path to file or folder of left reads to clean.
    :param input_r: File path to file or folder of right reads to clean.
    :param outdir: Filepath to output directory.
    :param processes: The maximum number of processes to use.
    :param bayesthreads: The number of threads per process to use.
    :param extraargstring: Advanced program parameter string.
    """
    # Collect input files, and validate that they match
    inputs = validate_paired_fastq_reads(input_f, input_r)
    pool = init_pool(min(len(inputs), processes))
    printVerbose("\tPrecleaning %s reads with Spades-Baye's Hammer..." % len(inputs))
    debugPrintInputInfo(inputs, "preclean/fix.")
    run_parallel([ProgramRunner(ProgramRunnerCommands.PRECLEAN_SPADES,
                                [forwards, reverse, outdir, bayesthreads],
                                {"exists": [forwards, reverse], "positive": [bayesthreads]},
                                extraargstring)
                  for forwards, reverse in inputs], pool)
    printVerbose("Done cleaning reads.")

    # Everything currently sitting in outdir is Spades scaffolding, not output.
    aux_files = getInputFiles(outdir, "*", ignore_empty_files=False)
    # Spades writes its corrected reads to a "corrected" subdirectory; surface them in outdir.
    corrected_dir = "%s/corrected" % outdir
    bulk_move_to_dir(getInputFiles(corrected_dir, "*"), outdir)
    # The unpaired reads and config leftovers are auxiliary too.
    aux_files += getInputFiles(outdir, "*unpaired*", ignore_empty_files=False)
    aux_files += getInputFiles(outdir, "configs", ignore_empty_files=False)

    # Gather aux files
    aux_dir = makeAuxDir(outdir)
    bulk_move_to_dir(aux_files, aux_dir)

    # Rename output files to "<stripped-name>_corrected.fastq"
    output_files = getInputFiles(outdir, "*", "corrected_*")
    for out_file in output_files:
        move(out_file, "%s/%s_corrected.fastq" % (outdir, strip_ixes(out_file)))

    # Move the last-minute log file to the aux dir.  Best-effort: the file may not exist.
    # BUGFIX: was a bare "except:", which also swallowed SystemExit/KeyboardInterrupt.
    try:
        move("%s/corrected_corrected.fastq" % outdir, "%s/corrected_corrected.fastq" % aux_dir)
    except Exception:
        pass
    cleanup_pool(pool)
def align_clean_macse(self, input_f, ref, samplesdir, outdir, processes, extraargstring=""):
    """Removes non-nucleotide characters in MACSE aligned sequences for all fasta files in the samples directory
        (the samplesDir argument).

    :param input_f: File path to file or folder of files to clean.
    :param ref: Filepath to the reference file used to align the input files.
    :param samplesdir: Filepath to the original, unaligned input files (the inputs to the macse aligner).
    :param outdir: Filepath to the directory to write outputs to.
    :param processes: The maximum number of processes to use.
    :param extraargstring: Advanced program parameter string.
    """
    # Command template:
    # "macse_format": "java -jar " + programPaths["MACSE"] + " -prog exportAlignment -align \"%s\" \
    #       -charForRemainingFS - -gc_def 5 -out_AA \"%s\" -out_NT \"%s\" -statFile \"%s\""
    inputs = getInputFiles(input_f)
    pool = init_pool(min(len(inputs), processes))
    # NOTE(review): this message contains a stray "%s" that is never formatted and prints literally.
    printVerbose("\t %s Processing MACSE alignments")
    samples_list = getInputFiles(samplesdir)
    # Export each sample's MACSE alignment to AA/NT fastas plus a stats csv, named after the sample.
    run_parallel([ProgramRunner(ProgramRunnerCommands.MACSE_FORMAT,
                                ["%s/%s_NT" % (input_f, getFileName(sample)),
                                 "%s/%s_AA_macse.fasta" % (outdir, getFileName(sample)),
                                 "%s/%s_NT_macse.fasta" % (outdir, getFileName(sample)),
                                 "%s/%s_macse.csv" % (outdir, getFileName(sample))],
                                {"exists": ["%s/%s_NT" % (input_f, getFileName(sample))]},
                                extraargstring)
                  for sample in samples_list], pool)
    printVerbose("\tCleaning MACSE alignments")
    printVerbose("Processing %s samples..." % len(samples_list))
    # NOTE(review): outputs above are named with getFileName(sample) but consumed here via
    # strip_ixes(sample) -- these only match when strip_ixes leaves the plain filename intact; verify.
    nt_macse_outs = ["%s/%s_NT_macse.fasta" % (outdir, strip_ixes(sample)) for sample in samples_list]
    # Clean the alignments: strip reference sequences out of each exported NT alignment.
    from classes.PythonRunner import PythonRunner
    run_parallel([PythonRunner(remove_refs_from_macse_out,
                               [input_, ref,
                                "%s/%s" % (outdir, "%s_cleaned.fasta" % strip_ixes(input_))],
                               {"exists": [input_, ref]})
                  for input_ in nt_macse_outs], pool)
    # Cat the cleaned alignments into a single merged fasta.
    cleaned_alignments = getInputFiles(outdir, "*_cleaned.fasta")
    merge_files(cleaned_alignments, "%s/MACSE_OUT_MERGED.fasta" % outdir)
    # Everything except the merged fasta is auxiliary.
    aux_dir = makeAuxDir(outdir)
    aux_files = getInputFiles(outdir, "*", "MACSE_OUT_MERGED.fasta", ignore_empty_files=False)
    bulk_move_to_dir(aux_files, aux_dir)
    cleanup_pool(pool)
def get_best_hits_from_vsearch(input_fna, ref_fna, outdir, id_pct=.7): """Calls vsearch with an input fasta, and returns a dictionary mapping each sequence to its best hit. (subject to the ID threshold (70%) in vsearch (See ProgramRunnerCommands.ALIGN_VSEARCH). :param input_fna: string. Filepath to the input fna fasta file. :param ref_fna: string. Filepath to the reference fna fasta file. :param outdir: string. Filepath to the output directory for the hits file. :return: {string:string} A dictionary mapping input sequence names to the best hit in the reference DB. """ def best_hits_from_vsearch(v_search_output): best_hits = {} for line in open(v_search_output, 'r'): data = line.split("\t") query_name = data[0].rstrip() if best_hits.has_key(query_name): if float(best_hits[query_name][2].rstrip()) < float( data[2].rstrip()): best_hits[query_name] = data else: best_hits[query_name] = data return best_hits threads = 1 pool = init_pool(threads) #printVerbose.VERBOSE = True print "calling vsearch" processes = 1 aln_user_string = "" extraargstring = "" printVerbose("Aligning against reference sequences...") # # vsearch --usearch_global %s seeds.pick.fasta --db ../data/BiocodePASSED_SAP.txt --id 0.9 \ # --userfields query+target+id+alnlen+qcov --userout %sout --alnout %s alnout.txt ProgramRunner(ProgramRunnerCommands.ALIGN_VSEARCH, [ processes, input_fna, ref_fna, id_pct, "%s/%s.out" % (outdir, strip_ixes(input_fna)), "%s/%s.alnout" % (outdir, strip_ixes(input_fna)), aln_user_string ], { "exists": [input_fna, ref_fna], "positive": [processes] }, extraargstring).run() vsearch_output = "%s/%s.out" % (outdir, strip_ixes(input_fna)) # Choose the best hit return best_hits_from_vsearch(vsearch_output)
def serialRename(input_file, output_fasta_filepath, file_type, clip=True):
    """Takes in a fasta file and outputs a new fasta with the sequences renamed.  Renaming convention is
        x.y.z<n> for x.y.z.fasta, where n is an integer in the range [0:n] where n is the position of the
        sequence in the input_file.  Also writes a groups file, linking each sequence to its parent sample.
        e.g. The sequences in SiteX_SampleA.fasta are renamed:  SiteX_SampleA_0, SiteX_SampleA_1, \
        SiteX_SampleA_2, etc.

    :param input_file: Input fasta or fastq file.
    :param output_fasta_filepath: Filepath for the output .samples file.
    :param file_type: "fasta" or "fastq"
    :param clip: True if filenames contain file_ID#s.  Will clip the IDs before renaming to get proper \
                    sequence names.
    """
    out_dir = os.path.dirname(output_fasta_filepath)
    seq_prefix = strip_ixes(input_file)
    samples_file = "%s/%s_renamed.samples" % (out_dir, seq_prefix)
    name_map_file = "%s/%s_renamed.mapping" % (out_dir, seq_prefix)

    seq_writer = BufferedSeqWriter(output_fasta_filepath, file_type)
    mapping_writer = BufferedFileWriter(name_map_file)
    samples_writer = BufferedFileWriter(samples_file)

    seq_number = 0
    for record in SeqIO.parse(input_file, file_type):
        seq_number += 1
        # Record the old_name -> new_name mapping
        old_id = record.id
        record.id = "%s_ID%s" % (seq_prefix, seq_number)
        mapping_writer.write("%s\t%s" % (old_id, record.id))
        # Record the sequence -> sample mapping; optionally clip dereplication counts first.
        if clip:
            sample_name = clip_count(seq_prefix)
        else:
            sample_name = seq_prefix
        samples_writer.write("%s\t%s" % (record.id, sample_name))
        # Write the renamed sequence
        record.description = ""
        seq_writer.write(record)

    seq_writer.flush()
    mapping_writer.flush()
    samples_writer.flush()
def get_best_hits_from_vsearch(input_fna, ref_fna, outdir, id_pct=.7): """Calls vsearch with an input fasta, and returns a dictionary mapping each sequence to its best hit. (subject to the ID threshold (70%) in vsearch (See ProgramRunnerCommands.ALIGN_VSEARCH). :param input_fna: string. Filepath to the input fna fasta file. :param ref_fna: string. Filepath to the reference fna fasta file. :param outdir: string. Filepath to the output directory for the hits file. :return: {string:string} A dictionary mapping input sequence names to the best hit in the reference DB. """ def best_hits_from_vsearch(v_search_output): best_hits = {} for line in open(v_search_output, 'r'): data = line.split("\t") query_name = data[0].rstrip() if best_hits.has_key(query_name): if float(best_hits[query_name][2].rstrip()) < float(data[2].rstrip()): best_hits[query_name] = data else: best_hits[query_name] = data return best_hits threads = 1 pool = init_pool(threads) #printVerbose.VERBOSE = True print "calling vsearch" processes=1 aln_user_string="" extraargstring="" printVerbose("Aligning against reference sequences...") # # vsearch --usearch_global %s seeds.pick.fasta --db ../data/BiocodePASSED_SAP.txt --id 0.9 \ # --userfields query+target+id+alnlen+qcov --userout %sout --alnout %s alnout.txt ProgramRunner(ProgramRunnerCommands.ALIGN_VSEARCH, [processes, input_fna, ref_fna, id_pct, "%s/%s.out" % (outdir, strip_ixes(input_fna)), "%s/%s.alnout" % (outdir, strip_ixes(input_fna)), aln_user_string], {"exists": [input_fna, ref_fna], "positive": [processes]}, extraargstring).run() vsearch_output = "%s/%s.out" % (outdir, strip_ixes(input_fna)) # Choose the best hit return best_hits_from_vsearch(vsearch_output)
def get_best_hits_from_vsearch(input_fna, ref_fna, outdir): def best_hits_from_vsearch(v_search_output): best_hits = {} for line in open(v_search_output, 'r'): data = line.split("\t") query_name = data[0].rstrip() if best_hits.has_key(query_name): if float(best_hits[query_name][2].rstrip()) < float(data[2].rstrip()): best_hits[query_name] = data else: best_hits[query_name] = data return best_hits threads = 1 pool = init_pool(threads) #printVerbose.VERBOSE = True print "calling vsearch" # Search for good hits inputs = [(input_fna, ref_fna)] processes=1 aln_user_string="" extraargstring="" printVerbose("Aligning against reference sequences...") # # vsearch --usearch_global %s seeds.pick.fasta --db ../data/BiocodePASSED_SAP.txt --id 0.9 \ # --userfields query+target+id+alnlen+qcov --userout %sout --alnout %s alnout.txt ProgramRunner(ProgramRunnerCommands.ALIGN_VSEARCH, [processes, input_fna, ref_fna, "%s/%s.out" % (outdir, strip_ixes(input_fna)), "%s/%s.alnout" % (outdir, strip_ixes(input_fna)), aln_user_string], {"exists": [input_fna, ref_fna], "positive": [processes]}, extraargstring).run() print "cleaning up." vsearch_output = "%s/%s.out" % (outdir, strip_ixes(input_fna)) # Choose the best hit return best_hits_from_vsearch(vsearch_output)
def serialRename(input_file, output_fasta_filepath, file_type, clip=True):
    """Takes in a fasta file and outputs a new fasta with the sequences renamed.  Renaming convention is
        x.y.z<n> for x.y.z.fasta, where n is an integer in the range [0:n] where n is the position of the
        sequence in the input_file.  Also writes a groups file, linking each sequence to its parent sample.
        e.g. The sequences in SiteX_SampleA.fasta are renamed:  SiteX_SampleA_0, SiteX_SampleA_1, \
        SiteX_SampleA_2, etc.

    :param input_file: Input fasta or fastq file.
    :param output_fasta_filepath: Filepath for the output .samples file.
    :param file_type: "fasta" or "fastq"
    :param clip: True if filenames contain file_ID#s.  Will clip the IDs before renaming to get proper \
                    sequence names.
    """
    parent_dir = os.path.dirname(output_fasta_filepath)
    seq_prefix = strip_ixes(input_file)
    samples_file = "%s/%s_renamed.samples" % (parent_dir, seq_prefix)
    name_map_file = "%s/%s_renamed.mapping" % (parent_dir, seq_prefix)

    fasta_out = BufferedSeqWriter(output_fasta_filepath, file_type)
    mapping_out = BufferedFileWriter(name_map_file)
    samples_out = BufferedFileWriter(samples_file)

    position = 0
    for seq_record in SeqIO.parse(input_file, file_type):
        position += 1
        # Record the old_name -> new_name mapping
        previous_id = seq_record.id
        seq_record.id = "%s_ID%s" % (seq_prefix, position)
        mapping_out.write("%s\t%s" % (previous_id, seq_record.id))
        # Record the sequence -> sample mapping; optionally clip dereplication counts first.
        if clip:
            parent_sample = clip_count(seq_prefix)
        else:
            parent_sample = seq_prefix
        samples_out.write("%s\t%s" % (seq_record.id, parent_sample))
        # Write the renamed sequence
        seq_record.description = ""
        fasta_out.write(seq_record)

    fasta_out.flush()
    mapping_out.flush()
    samples_out.flush()
def clean_trim_adapters_flexbar(self, input_f, adapters, adaptersrc, outdir, allowedns, processes, extraargstring):
    """Use flexbar to trim adapters and barcodes from sequences.  By default, Flexbar does not allow any 'N' \
        characters in SEQUENCE, and will toss any sequences that do contain 'N'.  To avoid this, use the -u or \
        --allowedns flags to specify the maximum number of 'N's to allow.

    :param input_f: Filepath to input file or folder.
    :param adapters: Filepath to a list of adapters.
    :param adaptersrc: Filepath to a list of reverse-complemented adapters.
    :param outdir: Filepath to the output directory.
    :param allowedns: Non-negative integer value indicating the maximum number of 'N's to tolerate in a sequence.
    :param processes: The maximum number of processes to use.
    :param extraargstring: Advanced program parameter string.
    """
    inputs = getInputFiles(input_f)
    pool = init_pool(min(len(inputs), processes))
    debugPrintInputInfo(inputs, "trim adapters from")
    # "flexbar": "flexbar -r \"%s\" -t \"%s\" -ae \"%s\" -a \"%s\"",
    printVerbose("Trimming barcodes and adapters with flexbar")
    temp_file_name_template = "%s/temp_%s"
    debarcoded_file_name_template = "%s/%s_debarcoded"

    def left_trim_runner(source_file):
        # Pass 1: trim adapters from the left, writing a "temp_*" intermediate.
        return ProgramRunner(ProgramRunnerCommands.TRIM_FLEXBAR,
                             [source_file,
                              temp_file_name_template % (outdir, strip_ixes(source_file)),
                              "LEFT", adapters, allowedns],
                             {"exists": [source_file, adapters]}, extraargstring)

    def right_trim_runner(source_file):
        # Pass 2: trim reverse-complemented adapters from the right.
        # The [5:] slice drops the "temp_" prefix from the intermediate's stripped name.
        return ProgramRunner(ProgramRunnerCommands.TRIM_FLEXBAR,
                             [source_file,
                              debarcoded_file_name_template % (outdir, strip_ixes(source_file)[5:]),
                              "RIGHT", adaptersrc, allowedns],
                             {"exists": [source_file, adaptersrc]}, extraargstring)

    run_parallel([left_trim_runner(f) for f in inputs], pool)

    temp_files = getInputFiles(outdir, "temp_*")
    debugPrintInputInfo(temp_files, "trim adapters from")
    run_parallel([right_trim_runner(f) for f in temp_files], pool)
    printVerbose("Done Trimming sequences.")

    # Move temp files out of the output directory.
    aux_files = getInputFiles(outdir, "temp_*", ignore_empty_files=False)
    bulk_move_to_dir(aux_files, makeAuxDir(outdir))
    cleanup_pool(pool)
def ungap_chewbacca(self, input_f, outdir, gapchars, file_ext, processes):
    """Ungaps a character using Bio python.

    :param input_f: Filepath to input file or folder to ungap.
    :param outdir: Filepath to the output directory where ungapped files should be written.
    :param gapchars: A string containing the gap characters to remove.
    :param file_ext: Either 'fasta' or 'fastq'.
    :param processes: The number of threads to use to ungap the input fileset.
    """
    # NOTE(review): only *.fasta inputs are gathered and outputs always get a .fasta extension,
    # regardless of file_ext -- confirm this is intentional.
    inputs = getInputFiles(input_f, "*.fasta")
    debugPrintInputInfo(inputs, "ungap.")
    pool = init_pool(min(len(inputs), processes))
    printVerbose("Removing all '%s' from sequences..." % gapchars)
    # ungap(file_to_clean, output_file_name, gap_char, file_type):
    ungap_jobs = []
    for gapped_file in inputs:
        cleaned_path = "%s/%s_cleaned.%s" % (outdir, strip_ixes(gapped_file), 'fasta')
        ungap_jobs.append(PythonRunner(remove_gap_chars,
                                       [gapped_file, cleaned_path, gapchars, file_ext],
                                       {"exists": [gapped_file]}))
    run_parallel(ungap_jobs, pool)
    printVerbose("Done removing.")
    cleanup_pool(pool)
def partition_chewbacca(self, input_f, outdir, processes, chunksize, filetype):
    """Partition a fasta/fastq file into chunks of user-defined size.

    :param input_f: Filepath to a file or folder of files to partition.
    :param outdir: The directory to write split files to.
    :param processes: The number of processes to use to partition the input fileset.
    :param chunksize: The number of sequences per file.
    :param filetype: Either 'fasta' or 'fastq'.
    """
    # Gather input files
    inputs = getInputFiles(input_f)
    debugPrintInputInfo(inputs, "partitioned")
    pool = init_pool(min(len(inputs), processes))
    printVerbose("Partitioning Files...")
    # One splitK job per input file; outputs share the input's stripped name.
    split_jobs = []
    for file_to_split in inputs:
        split_jobs.append(PythonRunner(splitK,
                                       [file_to_split, "%s/%s" % (outdir, strip_ixes(file_to_split)),
                                        chunksize, filetype],
                                       {"exists": [file_to_split]}))
    run_parallel(split_jobs, pool)
    printVerbose("Done partitioning files.")
    cleanup_pool(pool)
def rename_chewbacca(self, input_f, outdir, filetype, clip, processes):
    """Renames sequences in a fasta/fastq file as <filename>_ID0, <filename>_ID1, <filename>_ID2, etc., where
        <filename> is the name of the fasta/fastq file without any extensions or chewbacca suffixes.

    :param input_f: Filepath to an input file or folder to rename.
    :param outdir: Filepath to the output directory.
    :param filetype: Either 'fasta' or 'fastq'.
    :param clip: If True, remove dereplication counts from sequence names before renaming.
    :param processes: The maximum number of processes to use.
    """
    # Gather input files
    inputs = getInputFiles(input_f)
    debugPrintInputInfo(inputs, "rename")
    pool = init_pool(min(len(inputs), processes))
    printVerbose("Renaming sequences...")

    def make_rename_job(source_file):
        # The renamed output keeps the source file's extension.
        target = "%s/%s_renamed%s" % (outdir, strip_ixes(source_file),
                                      os.path.splitext(source_file)[1])
        return PythonRunner(serialRename, [source_file, target, filetype, clip],
                            {"exists": [source_file]})

    run_parallel([make_rename_job(source_file) for source_file in inputs], pool)
    printVerbose("Done renaming sequences...")

    # Collect the .samples files into their own directory.
    samples_dir = makeDirOrdie("%s_samples" % outdir)
    bulk_move_to_dir(getInputFiles(outdir, "*.samples", ignore_empty_files=False), samples_dir)

    # Collect the .mapping files into the aux directory.
    aux_dir = makeAuxDir(outdir)
    bulk_move_to_dir(getInputFiles(outdir, "*.mapping", ignore_empty_files=False), aux_dir)
    cleanup_pool(pool)
def ungap_chewbacca(self, input_f, outdir, gapchars, file_ext, processes):
    """Ungaps a character using Bio python.

    :param input_f: Filepath to input file or folder to ungap.
    :param outdir: Filepath to the output directory where ungapped files should be written.
    :param gapchars: A string containing the gap characters to remove.
    :param file_ext: Either 'fasta' or 'fastq'.
    :param processes: The number of threads to use to ungap the input fileset.
    """
    inputs = getInputFiles(input_f, "*.fasta")
    debugPrintInputInfo(inputs, "ungap.")
    pool = init_pool(min(len(inputs), processes))
    printVerbose("Removing all '%s' from sequences..." % gapchars)

    def make_ungap_job(source_file):
        # remove_gap_chars(file_to_clean, output_file_name, gap_char, file_type)
        # NOTE(review): output extension is hardcoded 'fasta' even though file_ext is a
        # parameter -- confirm this is intentional.
        target = "%s/%s_cleaned.%s" % (outdir, strip_ixes(source_file), 'fasta')
        return PythonRunner(remove_gap_chars, [source_file, target, gapchars, file_ext],
                            {"exists": [source_file]})

    run_parallel([make_ungap_job(source_file) for source_file in inputs], pool)
    printVerbose("Done removing.")
    cleanup_pool(pool)
def partition_chewbacca(self, input_f, outdir, processes, chunksize, filetype):
    """Partition a fasta/fastq file into chunks of user-defined size.

    :param input_f: Filepath to a file or folder of files to partition.
    :param outdir: The directory to write split files to.
    :param processes: The number of processes to use to partition the input fileset.
    :param chunksize: The number of sequences per file.
    :param filetype: Either 'fasta' or 'fastq'.
    """
    # Gather input files
    inputs = getInputFiles(input_f)
    debugPrintInputInfo(inputs, "partitioned")
    pool = init_pool(min(len(inputs), processes))
    printVerbose("Partitioning Files...")

    def make_split_job(source_file):
        # splitK(input, output_prefix, chunksize, filetype)
        return PythonRunner(splitK,
                            [source_file, "%s/%s" % (outdir, strip_ixes(source_file)),
                             chunksize, filetype],
                            {"exists": [source_file]})

    run_parallel([make_split_job(source_file) for source_file in inputs], pool)
    printVerbose("Done partitioning files.")
    cleanup_pool(pool)
def cluster_vsearch(self, input_f, outdir, groupsfile, processes, idpct, extraargstring):
    """Clusters sequences using VSEARCH.

    :param input_f: A file or folder containing fasta files to cluster.
    :param outdir: The output directory results will be written to.
    :param groupsfile: A groups file or folder containing groups files that describe the input. Note: if no
                        groups file is supplied, then entries in the fasta file are assumed to be
                        singleton sequences.
    :param idpct: Real number in the range (0,1] that specifies the minimum simmilarity threshold for
                        clustering.  e.g. .95 indicates that a candidate sequence 95% must be at least
                        95% simmilar to the seed sequence to be included in the cluster.
    :param processes: The maximum number of processes to use.
    :param extraargstring: Advanced program parameter string.
    """
    # Grab the fasta file(s) to cluster
    inputs = getInputFiles(input_f)
    debugPrintInputInfo(inputs, "clustered")
    pool = init_pool(min(len(inputs), processes))

    # RUN CLUSTERING
    # " --cluster_size %s -id %f --centroids %s --uc %s",
    run_parallel([ProgramRunner(ProgramRunnerCommands.CLUSTER_VSEARCH,
                                [input_, float(idpct),
                                 "%s/%s_seeds.fasta" % (outdir, strip_ixes(input_)),
                                 "%s/%s_clustered_uc" % (outdir, strip_ixes(input_))],
                                {"exists": [input_]}, extraargstring)
                  for input_ in inputs], pool)

    # PARSE UC FILE TO GROUPS FILE
    printVerbose("Parsing the clustered uc files to groups files")
    clustered_uc_files = getInputFiles(outdir, "*_clustered_uc")
    debugPrintInputInfo(clustered_uc_files, "parsed to groups")
    run_parallel([PythonRunner(parseUCtoGroups,
                               [input_, "%s/%s.groups" % (outdir, strip_ixes(input_))],
                               {"exists": [input_]})
                  for input_ in clustered_uc_files], pool)

    # REMOVE COUNTS FROM CLUSTERING GROUPS FILE
    printVerbose("Cleaning the .groups file from clustering")
    # Grab the current groups file and the new clustered groups file (which needs to be cleaned)
    # NOTE(review): this glob assumes parseUCtoGroups output is named "*_clustered.groups",
    # which depends on strip_ixes preserving the "_clustered" infix -- confirm.
    clustered_groups_files = getInputFiles(outdir, "*_clustered.groups")
    # Remove counts from the clustering groups files
    debugPrintInputInfo(clustered_groups_files, "cleaned")
    run_parallel([PythonRunner(removeCountsFromGroupsFile,
                               [input_, "%s/%s_uncount.groups" % (outdir, strip_ixes(input_))],
                               {"exists": [input_]})
                  for input_ in clustered_groups_files], pool)
    printVerbose("Done cleaning groups files.")

    # Collect the groups file from clustering with counts removed
    cleaned_clustered_groups_files = getInputFiles(outdir, "*_uncount.groups", ignore_empty_files=False)

    # Resolve the user specified names file if necessary
    final_groups_files = handle_groups_file_update(outdir, groupsfile, cleaned_clustered_groups_files)

    # Move the final groups file(s) to the groups dir
    groups_dir = makeDirOrdie("%s_groups_files" % outdir)
    bulk_move_to_dir(final_groups_files, groups_dir)

    # Move aux files to the aux dir
    aux_files = getInputFiles(outdir, "*", "*_seeds.fasta", ignore_empty_files=False)
    aux_dir = makeAuxDir(outdir)
    bulk_move_to_dir(aux_files, aux_dir)

    # Cleanup the pool
    cleanup_pool(pool)
def cluster_swarm(self, input_f, outdir, groupsfile, processes, extraargstring):
    """Clusters sequences using SWARM.

    :param input_f: A file or folder containing fasta files to cluster.
    :param outdir: The output directory results will be written to.
    :param groupsfile: A groups file or folder containing groups files that describe the input.
                        Note: if no groups file is supplied, then entries in the fasta file are assumed to be
                        singleton sequences.
    :param processes: The maximum number of processes to use.
    :param extraargstring: Advanced program parameter string.
    """
    # Grab the fasta file(s) to cluster
    inputs = getInputFiles(input_f)
    debugPrintInputInfo(inputs, "clustered")
    pool = init_pool(min(len(inputs), processes))

    # RUN CLUSTERING
    run_parallel([ProgramRunner(ProgramRunnerCommands.CLUSTER_SWARM,
                                [input_,
                                 "%s/%s_clustered" % (outdir, strip_ixes(input_)),
                                 "%s/%s_clustered_uc" % (outdir, strip_ixes(input_)),
                                 "%s/%s_clustered_seeds" % (outdir, strip_ixes(input_))],
                                {"exists": [input_]}, extraargstring)
                  for input_ in inputs], pool)

    # PARSE UC FILE TO GROUPS FILE
    printVerbose("Parsing the clustered uc files to groups files")
    clustered_uc_files = getInputFiles(outdir, "*_clustered_uc")
    debugPrintInputInfo(clustered_uc_files, "parsed to groups")
    run_parallel([PythonRunner(parseUCtoGroups,
                               [input_, "%s/%s.groups" % (outdir, strip_ixes(input_))],
                               {"exists": [input_]})
                  for input_ in clustered_uc_files], pool)
    printVerbose("Done parsing groups files.")

    # REMOVE COUNTS FROM CLUSTERING GROUPS FILE
    printVerbose("Cleaning the .groups file from clustering")
    # Grab the current groups file and the new clustered groups file (which needs to be cleaned)
    # NOTE(review): assumes parseUCtoGroups output matches "*_clustered.groups" -- confirm
    # against strip_ixes behavior.
    clustered_groups_files = getInputFiles(outdir, "*_clustered.groups")
    debugPrintInputInfo(clustered_groups_files, "cleaned")
    run_parallel([PythonRunner(removeCountsFromGroupsFile,
                               [input_, "%s/%s_uncount.groups" % (outdir, strip_ixes(input_))],
                               {"exists": [input_]})
                  for input_ in clustered_groups_files], pool)
    printVerbose("Done cleaning groups files.")

    printVerbose("Capitalizing sequences")
    # Convert the seeds files to uppercase (swarm writes in lowercase)
    # NOTE: 'inputs' is deliberately rebound here to the seeds files produced above.
    inputs = getInputFiles(outdir, "*_seeds")
    run_parallel([PythonRunner(capitalize_seqs,
                               [input_, "%s.fasta" % input_],
                               {"exists": [input_]})
                  for input_ in inputs], pool)
    printVerbose("Done capitalizing sequences")

    # Collect the groups file from clustering with counts removed
    cleaned_clustered_groups_files = getInputFiles(outdir, "*_uncount.groups", ignore_empty_files=False)

    # Resolve the user specified names file if necessary
    final_groups_files = handle_groups_file_update(outdir, groupsfile, cleaned_clustered_groups_files)

    # Move the final groups file(s) to the groups dir
    groups_dir = makeDirOrdie("%s_groups_files" % outdir)
    bulk_move_to_dir(final_groups_files, groups_dir)

    # Move aux files to the aux dir
    aux_files = getInputFiles(outdir, "*", "*_seeds.fasta", ignore_empty_files=False)
    aux_dir = makeAuxDir(outdir)
    bulk_move_to_dir(aux_files, aux_dir)

    # Cleanup the pool
    cleanup_pool(pool)
def dereplicate_vsearch(self, input_f, outdir, groupsfile, processes, stripcounts, extraargstring):
    """Dereplicates with vsearch.

    :param input_f: Filepath to the file or folder of files to dereplicate.
    :param outdir: Filepath to the output directory.
    :param groupsfile: A groups file to use as a reference for replicant counting.  If no groups file is
                        provided, input sequences are considered singletons (regardless of their
                        name-annotated dereplication count).
    :param processes: The number of processes to use to dereplicate the fileset.
    :param stripcounts: If True, strips the trailing dereplication counts from a file before dereplication.
    :param extraargstring: Advanced program parameter string.
    """
    inputs = getInputFiles(input_f)
    pool = init_pool(min(len(inputs), processes))

    # REMOVES COUNTS FROM SEQUENCE NAMES IN ORDER TO CLUSTER PROPERLY
    # strip counts if we need to.
    if stripcounts:
        printVerbose("Removing counts from sequence names...")
        debugPrintInputInfo(inputs, "renamed")
        # Fix: the "exists" validator takes a list of filepaths; the original passed
        # a bare string, which other call sites in this module never do.
        run_parallel([PythonRunner(removeCountsFromFastFile,
                                   [input_, "%s/%s_uncount.fasta" % (outdir, strip_ixes(input_)), 'fasta'],
                                   {"exists": [input_]})
                      for input_ in inputs], pool)
        printVerbose("Done removing counts.")

        # Grab the cleaned files as input for the next step
        inputs = getInputFiles(outdir, "*_uncount.fasta")

    # DEREPLICATE
    debugPrintInputInfo(inputs, "dereplicated")
    printVerbose("Dereplicating...")
    run_parallel([ProgramRunner(ProgramRunnerCommands.DEREP_VSEARCH,
                                [processes, input_,
                                 "%s/%s_derep.fasta" % (outdir, strip_ixes(input_)),
                                 "%s/%s_uc.out" % (outdir, strip_ixes(input_))],
                                {"exists": [input_], "positive": [processes]},
                                extraargstring)
                  for input_ in inputs], pool)
    printVerbose("Done dereplicating")

    # LOG DEREPLICATED SEQUENCES INTO A .GROUPS FILE
    # generates a .groups file named _uc_parsed.out
    # python parseUCtoGroups.py uc.out uc_parsed.out
    input_ucs = getInputFiles(outdir, "*_uc.out")
    printVerbose("Generating a groups file from dereplication.")
    # Fix: debug-print the uc files actually being parsed (the original printed the fastas).
    debugPrintInputInfo(input_ucs, "parsed (into a .groups file)")
    run_parallel([PythonRunner(parseUCtoGroups,
                               [input_, "%s/%s_derep.groups" % (outdir, strip_ixes(input_))],
                               {"exists": [input_]})
                  for input_ in input_ucs], pool)

    most_recent_groups_files = getInputFiles(outdir, "*_derep.groups", ignore_empty_files=False)

    # UPDATE THE MOST CURRENT GROUPS FILES WITH DEREPLICATION COUNTS
    if groupsfile is not None:
        # Grab the old groups file and the dereplicated groups file
        old_groups_files = getInputFiles(groupsfile)
        derep_groups_files = getInputFiles(outdir, "*_derep.groups")

        printVerbose("Updating .groups files with dereplicated data")
        printVerbose("%d Reference (old) groups files to be read:" % len(old_groups_files))
        printVerbose(str(old_groups_files))
        printVerbose("%d Dereplicated (new) groups files to be read:" % len(derep_groups_files))
        printVerbose(str(derep_groups_files))
        update_groups(old_groups_files, derep_groups_files, outdir, "dereplicated")
        most_recent_groups_files = getInputFiles(outdir, "dereplicated*", ignore_empty_files=False)
        printVerbose("Done updating .groups files.")

    # Every input fasta must have a matching groups file before pairing them below.
    if len(inputs) != len(most_recent_groups_files):
        # Fix: restored the missing space in "of groups".
        print("Error: Number of input fastas (%d) is not equal to the number of groups files (%d)."
              % (len(inputs), len(most_recent_groups_files)))
        exit()
    fasta_groups_pairs = zip(inputs, most_recent_groups_files)

    # ADD COUNT TO SEQUENCE NAMES AND SORT BY COUNT
    # python renameWithReplicantCounts.py
    #               8_macse_out/MACSEOUT_MERGED.fasta uc_parsed.out dereplicated_renamed.fasta
    printVerbose("Adding dereplication data to unique fasta")
    run_parallel([PythonRunner(renameWithReplicantCounts,
                               [fasta, groups,
                                "%s/%s_counts.fasta" % (outdir, strip_ixes(fasta)), 'fasta'],
                               {"exists": [fasta, groups]})
                  for fasta, groups in fasta_groups_pairs], pool)
    printVerbose("Done adding data")

    aux_dir = makeAuxDir(outdir)
    groups_dir = makeDirOrdie("%s_groups_files" % outdir)
    bulk_move_to_dir(most_recent_groups_files, groups_dir)
    aux_files = getInputFiles(outdir, '*', "*_counts.fasta", ignore_empty_files=False)
    bulk_move_to_dir(aux_files, aux_dir)
    cleanup_pool(pool)
def cluster_crop(self, input_f, outdir, groupsfile, processes, blocksize, clustpct, maxmcmc, maxsm, rare,
                 blockcount, extraargstring):
    """Clusters sequences using CROP.

    :param input_f: Filepath to the input fasta file to cluster.
    :param outdir: Filepath to the output directory.
    :param groupsfile: Filepath to the groups file to use as a reference for dereplication counting.
    :param blocksize: Size of blocks to be used for all rounds (if -b is specified, then -z will not affect
                        the first round.  For data set with different average sequence length, this
                        parameter should be tuned such that it won't take too long for each block to do
                        pairwise alignment.  Hint for choosing z: z*L<150,000, where L is the average
                        length of the sequences.
    :param clustpct: The minimum similarity threshold for clustering.  Either 'g' for 95% or 's' for 97%.
    :param maxmcmc: This parameter specifies the number of iterations of MCMC.  Default value is 2000.
                        Increase this value to enhance accuracy (recommended value is at least 10*block size).
    :param maxsm: This parameter specifies the maximum number of 'split and merge' process to run.  Max is 20.
    :param rare: The maximum cluster size allowed to be classified as 'rare'.  Clusters are defined as
                        either 'abundant' or 'rare'.  'Abundant' clusters will be clustered first, then
                        the 'rare' clusters are mapped to the 'abundant' clusters.  Finally, 'rare'
                        clusters which cannot be mapped will be clustered separately.  e.g. If r=5, the
                        clusters with size <=5 will be considered 'rare' in above procedure.  and r=0
                        will yield the best accuracy.  If you believe your data is not too diverse to be
                        handled, then r=0 will be the best choice.
    :param blockcount: The size of blocks in the first round of clustering.  Hint of choosing -b: Each block
                        in the first round should contain about 50 sequences.  i.e. b=N/50, where N is the
                        number of input sequences.  Default: # input sequences / z.
    :param processes: The maximum number of processes to use.
    :param extraargstring: Advanced program parameter string.
    """
    # Grab the fasta file(s) to cluster
    inputs = getInputFiles(input_f)
    debugPrintInputInfo(inputs, "clustered")
    pool = init_pool(min(len(inputs), processes))

    # RUN CLUSTERING
    # crop -i %s -o %s -z %s -c %s -e %s -m %s%s
    run_parallel([ProgramRunner(ProgramRunnerCommands.CLUSTER_CROP,
                                [input_, "%s/%s" % (outdir, strip_ixes(input_)),
                                 blocksize, clustpct, maxmcmc, maxsm, rare, blockcount],
                                {"exists": [input_]}, extraargstring)
                  for input_ in inputs], pool)

    # CLEAN THE OUTPUT GROUPS FILE
    printVerbose("Parsing the groups file from clustering")
    clustered_groups_files = getInputFiles(outdir, "*.cluster.list")
    debugPrintInputInfo(clustered_groups_files, "converted to groups files")
    run_parallel([PythonRunner(parseCROPoutToGroups,
                               [input_, "%s/%s_uncount.groups" % (outdir, strip_ixes(input_))],
                               {"exists": [input_]})
                  for input_ in clustered_groups_files], pool)
    printVerbose("Done parsing groups file.")

    # Collect the groups file from clustering with counts removed
    cleaned_clustered_groups_files = getInputFiles(outdir, "*_uncount.groups", ignore_empty_files=False)

    # Resolve the user specified names file if necessary
    final_groups_files = handle_groups_file_update(outdir, groupsfile, cleaned_clustered_groups_files)

    # GATHER AUX FILES
    input_dir = getDirName(input_f)
    # NOTE(review): aux_files aliases cleaned_clustered_groups_files, so the += calls
    # below also grow that list in place.  Harmless here (it is not reused afterwards),
    # but copy the list if that ever changes.
    aux_files = cleaned_clustered_groups_files
    aux_files += getInputFiles(input_dir, "*.unique", ignore_empty_files=False)
    aux_files += getInputFiles(input_dir, "*.unique.list", ignore_empty_files=False)
    aux_files += getInputFiles(input_dir, "*.unique.TempCenters.Rare", ignore_empty_files=False)
    aux_files += getInputFiles(outdir, "*.cluster", ignore_empty_files=False)
    aux_files += getInputFiles(outdir, "*.cluster.list", ignore_empty_files=False)
    aux_files += getInputFiles(outdir, "*.log", ignore_empty_files=False)
    # CROP drops LikelihoodRatio.txt in the current working directory.
    aux_files += getInputFiles(".", "LikelihoodRatio.txt", ignore_empty_files=False)

    # Move the final groups file(s) to the groups dir
    groups_dir = makeDirOrdie("%s_groups_files" % outdir)
    bulk_move_to_dir(final_groups_files, groups_dir)

    # Move aux files to the aux dir
    aux_dir = makeAuxDir(outdir)
    bulk_move_to_dir(aux_files, aux_dir)

    # Cleanup the pool
    cleanup_pool(pool)
def dereplicate_vsearch(self, input_f, outdir, groupsfile, processes, stripcounts, extraargstring):
    """Dereplicates with vsearch.

    NOTE(review): this method is defined twice in this module; this later definition
    shadows the earlier identical one -- one copy should be deleted.

    :param input_f: Filepath to the file or folder of files to dereplicate.
    :param outdir: Filepath to the output directory.
    :param groupsfile: A groups file to use as a reference for replicant counting.  If no groups file is
                        provided, input sequences are considered singletons (regardless of their
                        name-annotated dereplication count).
    :param processes: The number of processes to use to dereplicate the fileset.
    :param stripcounts: If True, strips the trailing dereplication counts from a file before dereplication.
    :param extraargstring: Advanced program parameter string.
    """
    inputs = getInputFiles(input_f)
    pool = init_pool(min(len(inputs), processes))

    # REMOVES COUNTS FROM SEQUENCE NAMES IN ORDER TO CLUSTER PROPERLY
    # strip counts if we need to.
    if stripcounts:
        printVerbose("Removing counts from sequence names...")
        debugPrintInputInfo(inputs, "renamed")
        # Fix: the "exists" validator takes a list of filepaths; the original passed
        # a bare string, which other call sites in this module never do.
        run_parallel([PythonRunner(removeCountsFromFastFile,
                                   [input_, "%s/%s_uncount.fasta" % (outdir, strip_ixes(input_)), 'fasta'],
                                   {"exists": [input_]})
                      for input_ in inputs], pool)
        printVerbose("Done removing counts.")

        # Grab the cleaned files as input for the next step
        inputs = getInputFiles(outdir, "*_uncount.fasta")

    # DEREPLICATE
    debugPrintInputInfo(inputs, "dereplicated")
    printVerbose("Dereplicating...")
    run_parallel([ProgramRunner(ProgramRunnerCommands.DEREP_VSEARCH,
                                [processes, input_,
                                 "%s/%s_derep.fasta" % (outdir, strip_ixes(input_)),
                                 "%s/%s_uc.out" % (outdir, strip_ixes(input_))],
                                {"exists": [input_], "positive": [processes]},
                                extraargstring)
                  for input_ in inputs], pool)
    printVerbose("Done dereplicating")

    # LOG DEREPLICATED SEQUENCES INTO A .GROUPS FILE
    # generates a .groups file named _uc_parsed.out
    # python parseUCtoGroups.py uc.out uc_parsed.out
    input_ucs = getInputFiles(outdir, "*_uc.out")
    printVerbose("Generating a groups file from dereplication.")
    # Fix: debug-print the uc files actually being parsed (the original printed the fastas).
    debugPrintInputInfo(input_ucs, "parsed (into a .groups file)")
    run_parallel([PythonRunner(parseUCtoGroups,
                               [input_, "%s/%s_derep.groups" % (outdir, strip_ixes(input_))],
                               {"exists": [input_]})
                  for input_ in input_ucs], pool)

    most_recent_groups_files = getInputFiles(outdir, "*_derep.groups", ignore_empty_files=False)

    # UPDATE THE MOST CURRENT GROUPS FILES WITH DEREPLICATION COUNTS
    if groupsfile is not None:
        # Grab the old groups file and the dereplicated groups file
        old_groups_files = getInputFiles(groupsfile)
        derep_groups_files = getInputFiles(outdir, "*_derep.groups")

        printVerbose("Updating .groups files with dereplicated data")
        printVerbose("%d Reference (old) groups files to be read:" % len(old_groups_files))
        printVerbose(str(old_groups_files))
        printVerbose("%d Dereplicated (new) groups files to be read:" % len(derep_groups_files))
        printVerbose(str(derep_groups_files))
        update_groups(old_groups_files, derep_groups_files, outdir, "dereplicated")
        most_recent_groups_files = getInputFiles(outdir, "dereplicated*", ignore_empty_files=False)
        printVerbose("Done updating .groups files.")

    # Every input fasta must have a matching groups file before pairing them below.
    if len(inputs) != len(most_recent_groups_files):
        # Fix: restored the missing space in "of groups".
        print("Error: Number of input fastas (%d) is not equal to the number of groups files (%d)."
              % (len(inputs), len(most_recent_groups_files)))
        exit()
    fasta_groups_pairs = zip(inputs, most_recent_groups_files)

    # ADD COUNT TO SEQUENCE NAMES AND SORT BY COUNT
    # python renameWithReplicantCounts.py
    #               8_macse_out/MACSEOUT_MERGED.fasta uc_parsed.out dereplicated_renamed.fasta
    printVerbose("Adding dereplication data to unique fasta")
    run_parallel([PythonRunner(renameWithReplicantCounts,
                               [fasta, groups,
                                "%s/%s_counts.fasta" % (outdir, strip_ixes(fasta)), 'fasta'],
                               {"exists": [fasta, groups]})
                  for fasta, groups in fasta_groups_pairs], pool)
    printVerbose("Done adding data")

    aux_dir = makeAuxDir(outdir)
    groups_dir = makeDirOrdie("%s_groups_files" % outdir)
    bulk_move_to_dir(most_recent_groups_files, groups_dir)
    aux_files = getInputFiles(outdir, '*', "*_counts.fasta", ignore_empty_files=False)
    bulk_move_to_dir(aux_files, aux_dir)
    cleanup_pool(pool)
def execute_program(self):
    """Render an OTU heatmap (.png) from the parsed commandline arguments."""
    args = self.args
    # The first matching input file names the output image.
    first_input = getInputFiles(args.input_f)[0]
    png_path = "%s/%s.png" % (args.outdir, strip_ixes(first_input))
    frame = subset_dataframe(args.input_f, args)
    self.visualize_otu_heatmap(frame, png_path)
def execute_program(self):
    """Render an OTU sample-composition plot (.png) from the parsed commandline arguments."""
    args = self.args
    # The first matching input file names the output image.
    first_input = getInputFiles(args.input_f)[0]
    png_path = "%s/%s.png" % (args.outdir, strip_ixes(first_input))
    frame = subset_dataframe(args.input_f, args)
    self.visualize_otu_sample_comp(frame, png_path)
def cluster_crop(self, input_f, outdir, groupsfile, processes, blocksize, clustpct, maxmcmc, maxsm, rare,
                 blockcount, extraargstring):
    """Clusters sequences using CROP.

    NOTE(review): this method is defined twice in this module; this later definition
    shadows the earlier identical one -- one copy should be deleted.

    :param input_f: Filepath to the input fasta file to cluster.
    :param outdir: Filepath to the output directory.
    :param groupsfile: Filepath to the groups file to use as a reference for dereplication counting.
    :param blocksize: Size of blocks to be used for all rounds (if -b is specified, then -z will not affect
                        the first round.  For data set with different average sequence length, this
                        parameter should be tuned such that it won't take too long for each block to do
                        pairwise alignment.  Hint for choosing z: z*L<150,000, where L is the average
                        length of the sequences.
    :param clustpct: The minimum similarity threshold for clustering.  Either 'g' for 95% or 's' for 97%.
    :param maxmcmc: This parameter specifies the number of iterations of MCMC.  Default value is 2000.
                        Increase this value to enhance accuracy (recommended value is at least 10*block size).
    :param maxsm: This parameter specifies the maximum number of 'split and merge' process to run.  Max is 20.
    :param rare: The maximum cluster size allowed to be classified as 'rare'.  Clusters are defined as
                        either 'abundant' or 'rare'.  'Abundant' clusters will be clustered first, then
                        the 'rare' clusters are mapped to the 'abundant' clusters.  Finally, 'rare'
                        clusters which cannot be mapped will be clustered separately.  e.g. If r=5, the
                        clusters with size <=5 will be considered 'rare' in above procedure.  and r=0
                        will yield the best accuracy.  If you believe your data is not too diverse to be
                        handled, then r=0 will be the best choice.
    :param blockcount: The size of blocks in the first round of clustering.  Hint of choosing -b: Each block
                        in the first round should contain about 50 sequences.  i.e. b=N/50, where N is the
                        number of input sequences.  Default: # input sequences / z.
    :param processes: The maximum number of processes to use.
    :param extraargstring: Advanced program parameter string.
    """
    # Grab the fasta file(s) to cluster
    inputs = getInputFiles(input_f)
    debugPrintInputInfo(inputs, "clustered")
    pool = init_pool(min(len(inputs), processes))

    # RUN CLUSTERING
    # crop -i %s -o %s -z %s -c %s -e %s -m %s%s
    run_parallel([ProgramRunner(ProgramRunnerCommands.CLUSTER_CROP,
                                [input_, "%s/%s" % (outdir, strip_ixes(input_)),
                                 blocksize, clustpct, maxmcmc, maxsm, rare, blockcount],
                                {"exists": [input_]}, extraargstring)
                  for input_ in inputs], pool)

    # CLEAN THE OUTPUT GROUPS FILE
    printVerbose("Parsing the groups file from clustering")
    clustered_groups_files = getInputFiles(outdir, "*.cluster.list")
    debugPrintInputInfo(clustered_groups_files, "converted to groups files")
    run_parallel([PythonRunner(parseCROPoutToGroups,
                               [input_, "%s/%s_uncount.groups" % (outdir, strip_ixes(input_))],
                               {"exists": [input_]})
                  for input_ in clustered_groups_files], pool)
    printVerbose("Done parsing groups file.")

    # Collect the groups file from clustering with counts removed
    cleaned_clustered_groups_files = getInputFiles(outdir, "*_uncount.groups", ignore_empty_files=False)

    # Resolve the user specified names file if necessary
    final_groups_files = handle_groups_file_update(outdir, groupsfile, cleaned_clustered_groups_files)

    # GATHER AUX FILES
    input_dir = getDirName(input_f)
    # NOTE(review): aux_files aliases cleaned_clustered_groups_files, so the += calls
    # below also grow that list in place.  Harmless here (it is not reused afterwards),
    # but copy the list if that ever changes.
    aux_files = cleaned_clustered_groups_files
    aux_files += getInputFiles(input_dir, "*.unique", ignore_empty_files=False)
    aux_files += getInputFiles(input_dir, "*.unique.list", ignore_empty_files=False)
    aux_files += getInputFiles(input_dir, "*.unique.TempCenters.Rare", ignore_empty_files=False)
    aux_files += getInputFiles(outdir, "*.cluster", ignore_empty_files=False)
    aux_files += getInputFiles(outdir, "*.cluster.list", ignore_empty_files=False)
    aux_files += getInputFiles(outdir, "*.log", ignore_empty_files=False)
    # CROP drops LikelihoodRatio.txt in the current working directory.
    aux_files += getInputFiles(".", "LikelihoodRatio.txt", ignore_empty_files=False)

    # Move the final groups file(s) to the groups dir
    groups_dir = makeDirOrdie("%s_groups_files" % outdir)
    bulk_move_to_dir(final_groups_files, groups_dir)

    # Move aux files to the aux dir
    aux_dir = makeAuxDir(outdir)
    bulk_move_to_dir(aux_files, aux_dir)

    # Cleanup the pool
    cleanup_pool(pool)