Example #1
def get_flowgram_distances(id,
                           flowgram,
                           flowgrams,
                           fc,
                           ids,
                           outdir,
                           error_profile=DENOISER_DATA_DIR +
                           'FLX_error_profile.dat'):
    """Computes distance scores of flowgram to all flowgrams in parser.

    id: The flowgram identifier, also used to name intermediate files

    flowgram: This flowgram is used to filter all the other flowgrams

    flowgrams: iterable filehandle of flowgram file

    fc: a sink for flowgrams, either a FlowgramContainerArray or
        FlowgramContainerFile object

    ids: dict of ids of flowgrams in flowgrams that should be aligned

    outdir: directory for intermediate files

    error_profile: path to error profile *.dat file
    """
    check_flowgram_ali_exe()
    # File that serves as input for external alignment program
    (fh, tmpfile) = init_flowgram_file(prefix=outdir)
    append_to_flowgram_file(id, flowgram, fh)

    k = 0
    names = []
    for f in flowgrams:
        if (f.Name in ids):
            fc.add(f)
            append_to_flowgram_file(f.Name, f, fh, trim=False)
            k += 1
            names.append(f.Name)
    fh.close()

    # TODO: capture stderr and warn user
    scores_fh = popen(
        "%s -relscore_pairid %s %s " %
        (get_flowgram_ali_exe(), error_profile, tmpfile), 'r')
    scores = [map(float, (s.split())) for s in scores_fh if s != "\n"]

    if (k != len(scores)):
        raise RuntimeError(
            "Something bad has happened! I received fewer " +
            "alignment scores than there are flowgrams. Most likely this " +
            "means that the alignment program is not set up correctly or is " +
            "corrupted. Please run the test scripts to figure out the cause " +
            "of the error."
        )

    remove(tmpfile)

    return (scores, names, fc)
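
The external aligner invoked through popen prints one whitespace-separated line of floats per flowgram, and the list comprehension above turns that output into score rows while skipping blank lines. A minimal, self-contained sketch of just this parsing step, using a fabricated sample_output string in place of real aligner output:

# Sketch of the score-parsing step from get_flowgram_distances above.
# sample_output is fabricated illustration data, not real aligner output.
sample_output = "0.97 1.2\n\n0.85 2.4\n"
scores = [list(map(float, line.split()))
          for line in sample_output.splitlines(True)
          if line != "\n"]
assert scores == [[0.97, 1.2], [0.85, 2.4]]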
Example #2
    def test_denoiser_supported_version(self):
        """denoiser aligner is ready to use """

        pass_test = True
        try:
            check_flowgram_ali_exe()
        except (ApplicationNotFoundError, ApplicationError):
            pass_test = False
            
        self.assertTrue(pass_test, "Denoiser flowgram aligner not found or not executable. " +
                            "This may or may not be a problem depending on " +
                            "which components of QIIME you plan to use.")
Example #3
def get_flowgram_distances(id, flowgram, flowgrams, fc, ids, outdir,
                           error_profile=DENOISER_DATA_DIR +
                           'FLX_error_profile.dat'):
    """Computes distance scores of flowgram to all flowgrams in parser.

    id: The flowgram identifier, also used to name intermediate files

    flowgram: This flowgram is used to filter all the other flowgrams

    flowgrams: iterable filehandle of flowgram file

    fc: a sink for flowgrams, either a FlowgramContainerArray or
        FlowgramContainerFile object

    ids: dict of ids of flowgrams in flowgrams that should be aligned

    outdir: directory for intermediate files

    error_profile: path to error profile *.dat file
    """
    check_flowgram_ali_exe()
    # File that serves as input for external alignment program
    (fh, tmpfile) = init_flowgram_file(prefix=outdir)
    append_to_flowgram_file(id, flowgram, fh)

    k = 0
    names = []
    for f in flowgrams:
        if(f.Name in ids):
            fc.add(f)
            append_to_flowgram_file(f.Name, f, fh, trim=False)
            k += 1
            names.append(f.Name)
    fh.close()

    # TODO: capture stderr and warn user
    scores_fh = popen("%s -relscore_pairid %s %s " %
                      (get_flowgram_ali_exe(),
                       error_profile, tmpfile), 'r')
    scores = [map(float, (s.split())) for s in scores_fh if s != "\n"]

    if (k != len(scores)):
        raise RuntimeError("Something bad has happened! I received less " +
                           "alignment scores than there are flowgrams. Most likely this " +
                           "means that the alignment program is not setup or corrupted. " +
                           "Please run the test scripts to figure out the cause of the error.")

    remove(tmpfile)

    return (scores, names, fc)
Example #4
    def setUp(self):
        # abort all tests without the alignment binary
        check_flowgram_ali_exe()

        signal.signal(signal.SIGALRM, timeout)
        # set the 'alarm' to go off in allowed_seconds_per_test seconds
        signal.alarm(allowed_seconds_per_test)

        self.test_dir = "denoiser_main_test" + make_tmp_name() + "/"
        self.expected = ">FS8APND01D3TW3 | cluster size: 94 \nCTGGGCCGTATCTCAGTCCCAATGTGGCCGGTCACCCTCTCAGGCCGGCTACCCGTCAAAGCCTTGGTAAGCCACTACCCCACCAACAAGCTGATAAGCCGCGAGTCCATCCCCAACCGCCGAAACTTTCCAACCCCCACCATGCAGCAGGAGCTCCTATCCGGTATTAGCCCCAGTTTCCTGAAGTTATCCCAAAGTCAAGGGCAGGTTACTCACGTGTTACTCACCCGTTCGCC\n"
        self.expected_map_string = """FS8APND01EWRS4:
FS8APND01BSTVP:
FS8APND01DXG45:
FS8APND01D3TW3:\tFS8APND01CSXFN\tFS8APND01DQ8MX\tFS8APND01DY7QW\tFS8APND01B5QNI\tFS8APND01CQ6OG\tFS8APND01C7IGN\tFS8APND01DHSGH\tFS8APND01DJ17E\tFS8APND01CUXOA\tFS8APND01EUTYG\tFS8APND01EKK7T\tFS8APND01D582W\tFS8APND01B5GWU\tFS8APND01D7N2A\tFS8APND01BJGHZ\tFS8APND01D6DYZ\tFS8APND01C6ZIM\tFS8APND01D2X6Y\tFS8APND01BUYCE\tFS8APND01BNUEY\tFS8APND01DKLOE\tFS8APND01C24PP\tFS8APND01EBWQX\tFS8APND01ELDYW\tFS8APND01B0GCS\tFS8APND01D4QXI\tFS8APND01EMYD9\tFS8APND01EA2SK\tFS8APND01DZOSO\tFS8APND01DHYAZ\tFS8APND01C7UD9\tFS8APND01BTZFV\tFS8APND01CR78R\tFS8APND01B39IE\tFS8APND01ECVC0\tFS8APND01DM3PL\tFS8APND01DELWS\tFS8APND01CIEK8\tFS8APND01D7ZOZ\tFS8APND01CZSAI\tFS8APND01DYOVR\tFS8APND01BX9XY\tFS8APND01DEWJA\tFS8APND01BEKIW\tFS8APND01DCKB9\tFS8APND01EEYIS\tFS8APND01DDKEA\tFS8APND01DSZLO\tFS8APND01C6EBC\tFS8APND01EE15M\tFS8APND01ELO9B\tFS8APND01C58QY\tFS8APND01DONCG\tFS8APND01DVXX2\tFS8APND01BL5YT\tFS8APND01BIL2V\tFS8APND01EBSYQ\tFS8APND01CCX8R\tFS8APND01B2YCJ\tFS8APND01B1JG4\tFS8APND01DJ024\tFS8APND01BIJY0\tFS8APND01CIA4G\tFS8APND01DV74M\tFS8APND01ECAX5\tFS8APND01DC3TZ\tFS8APND01EJVO6\tFS8APND01D4VFG\tFS8APND01DYYYO\tFS8APND01D1EDD\tFS8APND01DQUOT\tFS8APND01A2NSJ\tFS8APND01DDC8I\tFS8APND01BP1T2\tFS8APND01DPY6U\tFS8APND01CIQGV\tFS8APND01BPUT8\tFS8APND01BDNH4\tFS8APND01DOZDN\tFS8APND01DS866\tFS8APND01DGS2J\tFS8APND01EDK32\tFS8APND01EPA0T\tFS8APND01CK3JM\tFS8APND01BKLWW\tFS8APND01DV0BO\tFS8APND01DPNXE\tFS8APND01B7LUA\tFS8APND01BTTE2\tFS8APND01CKO4X\tFS8APND01DGGBY\tFS8APND01C4NHX\tFS8APND01DYPQN
FS8APND01EFK0W:
FS8APND01DCIOO:
FS8APND01CKOMZ:
"""
        self.expected_titanium_map_string = """FS8APND01EWRS4:
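
The setUp above arms a SIGALRM so a hanging test aborts after allowed_seconds_per_test seconds; the timeout handler and the constant are defined elsewhere in the test module. A minimal sketch of what they might look like (the exception class, its name, and the 60-second value are assumptions):

allowed_seconds_per_test = 60          # assumed value; the real constant lives in the test module

class TimeExceededError(Exception):    # hypothetical exception type
    pass

def timeout(signum, frame):
    # SIGALRM handler installed in setUp: abort any test that exceeds its time budget
    raise TimeExceededError("test ran longer than %d seconds" % allowed_seconds_per_test)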
Example #5
def denoise_per_sample(sff_fps, fasta_fp, tmpoutdir, cluster=False,
                       num_cpus=1, squeeze=True, percent_id=0.97, bail=1,
                       primer="", low_cutoff=3.75, high_cutoff=4.5,
                       log_fp="denoiser.log", low_memory=False, verbose=False,
                       error_profile=DENOISER_DATA_DIR +
                       'FLX_error_profile.dat',
                       max_num_rounds=None, titanium=False):
    """Denoise each sample separately"""

    # abort early if binary is missing
    check_flowgram_ali_exe()

    log_fh = None
    if log_fp:
        # switch off buffering for the global log file
        log_fh = open(tmpoutdir + "/" + log_fp, "w", 0)

    # overwrite settings if titanium is set
    # This flag is only used from qiime. Remove after qiime integration
    if titanium:
        error_profile = DENOISER_DATA_DIR + "Titanium_error_profile.dat"
        low_cutoff = 4
        high_cutoff = 5

    if verbose:
        log_fh.write("Denoiser version: %s\n" % __version__)
        log_fh.write("SFF files: %s\n" % ', '.join(sff_fps))
        log_fh.write("Fasta file: %s\n" % fasta_fp)
        log_fh.write("Cluster: %s\n" % cluster)
        log_fh.write("Num CPUs: %d\n" % num_cpus)
        log_fh.write("Squeeze Seqs: %s\n" % squeeze)
        log_fh.write("tmpdir: %s\n\n" % tmpoutdir)
        log_fh.write("percent_id threshold: %.2f\n" % percent_id)
        log_fh.write("Minimal sequence coverage for first phase: %d\n" % bail)
        log_fh.write("Error profile: %s\n" % error_profile)
        log_fh.write("Maximal number of iteration: %s\n\n" % max_num_rounds)

    # here we go ...
    sff_files = split_sff(map(open, sff_fps), open(fasta_fp), tmpoutdir)
    combined_mapping = {}
    result_centroids = []
    result_singletons_files = []
    # denoise each sample separately
    for i, sff_file in enumerate(sff_files):
        if not exists(tmpoutdir + ("/%d" % i)):
            makedirs(tmpoutdir + ("/%d" % i))
        out_fp = tmpoutdir + ("/%d/" % i)
        denoise_seqs([sff_file], fasta_fp, out_fp, None, cluster,
                     num_cpus, squeeze, percent_id, bail, primer,
                     low_cutoff, high_cutoff, log_fp, low_memory,
                     verbose, error_profile, max_num_rounds)

        # collect partial results
        this_rounds_mapping = read_denoiser_mapping(
            open(out_fp + "/denoiser_mapping.txt"))
        combined_mapping.update(this_rounds_mapping)
        result_centroids.append(
            parse_fasta(open(out_fp + "/centroids.fasta")))
        result_singletons_files.append(out_fp + "/singletons.fasta")

    # write the combined files
    store_mapping(combined_mapping, tmpoutdir, "denoiser")
    seqs = chain(*result_centroids)
    fasta_fh = open(tmpoutdir + "/denoised.fasta", "w")
    # write centroids sorted by clustersize
    write_Fasta_from_name_seq_pairs(
        sort_seqs_by_clustersize(seqs, combined_mapping),
        fasta_fh)
    for singleton_file in result_singletons_files:
        write_Fasta_from_name_seq_pairs(
            parse_fasta(open(singleton_file, "r")),
            fasta_fh)
    fasta_fh.close()

    # return outdir for tests/test_denoiser
    return tmpoutdir
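
A hedged usage sketch for denoise_per_sample (file names and the primer are hypothetical; only the signature and output locations come from the code above). tmpoutdir is assumed to exist already, since the log file is opened inside it:

# Hypothetical call; input paths and the primer are made up for illustration.
outdir = denoise_per_sample(
    ["sample1.sff.txt", "sample2.sff.txt"],   # sff_fps
    "seqs.fna",                               # fasta_fp
    "/tmp/denoiser_per_sample",               # tmpoutdir, assumed to exist
    num_cpus=2,
    percent_id=0.97,
    primer="CATGCTGCCTCCCGTAGGAGT",           # hypothetical primer sequence
    titanium=False)
# combined results are written to <tmpoutdir>/denoised.fasta and, via
# store_mapping, <tmpoutdir>/denoiser_mapping.txt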
Example #6
def denoise_seqs(
        sff_fps, fasta_fp, tmpoutdir, preprocess_fp=None, cluster=False,
        num_cpus=1, squeeze=True, percent_id=0.97, bail=1, primer="",
        low_cutoff=3.75, high_cutoff=4.5, log_fp="denoiser.log",
        low_memory=False, verbose=False,
        error_profile=DENOISER_DATA_DIR + 'FLX_error_profile.dat',
        max_num_rounds=None, titanium=False, checkpoint_fp=None):
    """The main routine to denoise flowgrams"""

    # abort if binary is missing
    check_flowgram_ali_exe()

    if verbose:
        # switch off buffering for the log file
        log_fh = open(tmpoutdir + "/" + log_fp, "w", 0)
    else:
        log_fh = None

    # overwrite settings if titanium is set
    # This flag is only used from qiime. Remove after qiime integration
    if titanium:
        error_profile = DENOISER_DATA_DIR + "Titanium_error_profile.dat"
        low_cutoff = 4
        high_cutoff = 5

    if verbose:
        log_fh.write("Denoiser version: %s\n" % __version__)
        log_fh.write("SFF files: %s\n" % ', '.join(sff_fps))
        log_fh.write("Fasta file: %s\n" % fasta_fp)
        log_fh.write("Preprocess dir: %s\n" % preprocess_fp)
        if checkpoint_fp:
            log_fh.write("Resuming denoiser from %s\n" % checkpoint_fp)
        log_fh.write("Primer sequence: %s\n" % primer)
        log_fh.write("Running on cluster: %s\n" % cluster)
        log_fh.write("Num CPUs: %d\n" % num_cpus)
        log_fh.write("Squeeze Seqs: %s\n" % squeeze)
        log_fh.write("tmpdir: %s\n" % tmpoutdir)
        log_fh.write("percent_id threshold: %.2f\n" % percent_id)
        log_fh.write("Minimal sequence coverage for first phase: %d\n" % bail)
        log_fh.write("Low cut-off: %.2f\n" % low_cutoff)
        log_fh.write("High cut-off: %.2f\n" % high_cutoff)
        log_fh.write("Error profile: %s\n" % error_profile)
        log_fh.write("Maximal number of iteration: %s\n\n" % max_num_rounds)

    # here we go ...
    # Phase I - clean up and truncate input sff
    if(checkpoint_fp):
        if (preprocess_fp):
            # resuming from a checkpoint: reuse the existing preprocessed data
            (deprefixed_sff_fp, l, mapping,
             seqs) = read_preprocessed_data(preprocess_fp)
        else:
            raise ApplicationError(
                "Resuming from checkpoint requires --preprocess option")

    else:
        if(preprocess_fp):
            # we already have preprocessed data, so use it
            (deprefixed_sff_fp, l, mapping,
             seqs) = read_preprocessed_data(preprocess_fp)
        elif(cluster):
            preprocess_on_cluster(sff_fps, log_fp, fasta_fp=fasta_fp,
                                  out_fp=tmpoutdir, verbose=verbose,
                                  squeeze=squeeze, primer=primer)
            (deprefixed_sff_fp, l, mapping,
             seqs) = read_preprocessed_data(tmpoutdir)
        else:
            (deprefixed_sff_fp, l, mapping, seqs) = \
                preprocess(
                    sff_fps, log_fh, fasta_fp=fasta_fp, out_fp=tmpoutdir,
                    verbose=verbose, squeeze=squeeze, primer=primer)

        # the preprocessor writes to the same log file, so reopen it in append mode
        if verbose:
            log_fh.close()
            log_fh = open(tmpoutdir + "/" + log_fp, "a", 0)

    # phase II:
    # use prefix map based clustering as initial centroids and greedily
    # add flowgrams to clusters with a low threshold

    (new_sff_file, bestscores, mapping) = \
        greedy_clustering(deprefixed_sff_fp, seqs, mapping, tmpoutdir, l,
                          log_fh, num_cpus=num_cpus, on_cluster=cluster,
                          bail_out=bail, pair_id_thresh=percent_id,
                          threshold=low_cutoff, verbose=verbose,
                          fast_method=not low_memory,
                          error_profile=error_profile,
                          max_num_rounds=max_num_rounds,
                          checkpoint_fp=checkpoint_fp)

    # phase III:
    # Assign seqs to nearest existing centroid with high threshold
    secondary_clustering(new_sff_file, mapping, bestscores, log_fh,
                         verbose=verbose, threshold=high_cutoff)
    remove(new_sff_file)
    if (verbose):
        log_fh.write("Finished clustering\n")
        log_fh.write("Writing Clusters\n")
        log_fh.write(make_stats(mapping) + "\n")
    store_clusters(mapping, deprefixed_sff_fp, tmpoutdir)
    store_mapping(mapping, tmpoutdir, "denoiser")
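
A hedged usage sketch for resuming denoise_seqs from a checkpoint (all paths are hypothetical). As the code above enforces, a preprocess directory must be supplied together with the checkpoint, otherwise an ApplicationError is raised:

# Hypothetical resume call; every path below is made up for illustration.
denoise_seqs(
    ["run1.sff.txt"],
    "seqs.fna",
    "/tmp/denoiser_run",
    preprocess_fp="/tmp/denoiser_run",                    # reuse previously preprocessed data
    checkpoint_fp="/tmp/denoiser_run/checkpoint.pickle",  # hypothetical checkpoint file
    verbose=True)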
Example #7
def get_flowgram_distances_on_cluster(
        id, flowgram, flowgrams, fc, ids, num_cores,
        num_flows, spread, client_sockets=[]):
    """Computes distance scores of flowgram to all flowgrams in parser.

    id: The flowgram identifier, also used to name intermediate files

    flowgram: This flowgram is used to filter all the other flowgrams

    flowgrams: iterable filehandle of flowgram file

    fc: a sink of flowgrams, which serves as source in the next round

    ids: list of flowgram ids that should be used from flowgrams

    num_cores: number of cpus

    num_flows: Number of flows in parser

    client_sockets: A list of open sockets for client-server communication

    spread: historical distribution of processing runtimes

    """
    epoch = time()

    check_flowgram_ali_exe()

    qiime_config = load_qiime_config()
    min_per_core = int(qiime_config['denoiser_min_per_core'])
    # if using from future import division this has to be checked,
    # as we want true integer division here

    per_core = max(min_per_core, (num_flows / num_cores) + 1)
    names = []
    scores = []

    # Need to call this here, since we iterate over the same iterator repeatedly.
    # Otherwise the call in ifilter will reset the iterator by implicitly calling __iter__.
    # test if iter does the same
    flowgrams_iter = flowgrams.__iter__()
    # prepare input files and commands
    # synchronous client-server communication

    workload = compute_workload(num_cores, num_flows, spread)

    debug_count = 0
    for i in range(num_cores):
        socket = client_sockets[i]
        # send master flowgram to file first
        send_flowgram_to_socket(id, flowgram, socket)

        if(workload[i] < 1):
            # no data left for this poor guy
            save_send(socket, "--END--")
            continue
        else:
            # Then add all others which are still valid, i.e. in ids
            for (k, f) in (izip(range(workload[i]),
                                ifilter(lambda f: f.Name in ids, flowgrams_iter))):
                fc.add(f)
                send_flowgram_to_socket(k, f, socket, trim=False)
                names.append(f.Name)
                debug_count += 1
            # send the termination signal
            save_send(socket, "--END--")

    # asynchronous client-server communication
    # ClientHandlers write data in results
    results = [None] * num_cores
    timing = [0.0 for x in xrange(num_cores)]
    for i in range(num_cores):
        socket = client_sockets[i]
        ClientHandler(socket, i, results, timing)
    loop()
    # end asynchronous loop

    spread = adjust_processing_time(num_cores, workload, timing, epoch)

    # flatten list
    scores = [item for sublist in results for item in sublist]

    if (debug_count != len(scores)):
        raise RuntimeError("Something bad has happened! I received less " +
                           "alignment scores %d than there are flowgrams %d. Most likely this "
                           % (len(scores), debug_count) +
                           "means that the alignment program is not setup correctly or corrupted. " +
                           "Please run the test scripts to figure out the cause of the error.")

    return (scores, names, fc)
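
compute_workload splits the num_flows flowgrams across the cores using the historical spread of per-core processing times, and adjust_processing_time then refreshes that spread from the timings of this round. The QIIME implementation is not shown in this listing; a minimal sketch of a proportional split of this kind (the weighting scheme is an assumption):

# Hedged sketch, not QIIME's compute_workload: split num_flows across
# num_cores in proportion to the per-core weights in spread.
def compute_workload_sketch(num_cores, num_flows, spread):
    total = float(sum(spread))
    workload = [int(num_flows * spread[i] / total) for i in range(num_cores)]
    workload[0] += num_flows - sum(workload)   # give any rounding remainder to core 0
    return workload

print(compute_workload_sketch(4, 1000, [1.0, 1.0, 2.0, 4.0]))   # -> [125, 125, 250, 500]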
Example #8
def denoise_per_sample(sff_fps,
                       fasta_fp,
                       tmpoutdir,
                       cluster=False,
                       num_cpus=1,
                       squeeze=True,
                       percent_id=0.97,
                       bail=1,
                       primer="",
                       low_cutoff=3.75,
                       high_cutoff=4.5,
                       log_fp="denoiser.log",
                       low_memory=False,
                       verbose=False,
                       error_profile=DENOISER_DATA_DIR +
                       'FLX_error_profile.dat',
                       max_num_rounds=None,
                       titanium=False):
    """Denoise each sample separately"""

    # abort early if binary is missing
    check_flowgram_ali_exe()

    log_fh = None
    if log_fp:
        # switch off buffering for the global log file
        log_fh = open(tmpoutdir + "/" + log_fp, "w", 0)

    # overwrite settings if titanium is set
    # This flag is only used from qiime. Remove after qiime integration
    if titanium:
        error_profile = DENOISER_DATA_DIR + "Titanium_error_profile.dat"
        low_cutoff = 4
        high_cutoff = 5

    if verbose:
        log_fh.write("Denoiser version: %s\n" % __version__)
        log_fh.write("SFF files: %s\n" % ', '.join(sff_fps))
        log_fh.write("Fasta file: %s\n" % fasta_fp)
        log_fh.write("Cluster: %s\n" % cluster)
        log_fh.write("Num CPUs: %d\n" % num_cpus)
        log_fh.write("Squeeze Seqs: %s\n" % squeeze)
        log_fh.write("tmpdir: %s\n\n" % tmpoutdir)
        log_fh.write("percent_id threshold: %.2f\n" % percent_id)
        log_fh.write("Minimal sequence coverage for first phase: %d\n" % bail)
        log_fh.write("Error profile: %s\n" % error_profile)
        log_fh.write("Maximal number of iteration: %s\n\n" % max_num_rounds)

    # here we go ...
    sff_files = split_sff(map(open, sff_fps), open(fasta_fp), tmpoutdir)
    combined_mapping = {}
    result_centroids = []
    result_singletons_files = []
    # denoise each sample separately
    for i, sff_file in enumerate(sff_files):
        if not exists(tmpoutdir + ("/%d" % i)):
            makedirs(tmpoutdir + ("/%d" % i))
        out_fp = tmpoutdir + ("/%d/" % i)
        denoise_seqs([sff_file], fasta_fp, out_fp, None, cluster, num_cpus,
                     squeeze, percent_id, bail, primer, low_cutoff,
                     high_cutoff, log_fp, low_memory, verbose, error_profile,
                     max_num_rounds)

        # collect partial results
        this_rounds_mapping = read_denoiser_mapping(
            open(out_fp + "/denoiser_mapping.txt"))
        combined_mapping.update(this_rounds_mapping)
        result_centroids.append(parse_fasta(open(out_fp + "/centroids.fasta")))
        result_singletons_files.append(out_fp + "/singletons.fasta")

    # write the combined files
    store_mapping(combined_mapping, tmpoutdir, "denoiser")
    seqs = chain(*result_centroids)
    fasta_fh = open(tmpoutdir + "/denoised.fasta", "w")
    # write centroids sorted by clustersize
    write_Fasta_from_name_seq_pairs(
        sort_seqs_by_clustersize(seqs, combined_mapping), fasta_fh)
    for singleton_file in result_singletons_files:
        write_Fasta_from_name_seq_pairs(parse_fasta(open(singleton_file, "r")),
                                        fasta_fh)
    fasta_fh.close()

    # return outdir for tests/test_denoiser
    return tmpoutdir
Example #9
def denoise_seqs(sff_fps,
                 fasta_fp,
                 tmpoutdir,
                 preprocess_fp=None,
                 cluster=False,
                 num_cpus=1,
                 squeeze=True,
                 percent_id=0.97,
                 bail=1,
                 primer="",
                 low_cutoff=3.75,
                 high_cutoff=4.5,
                 log_fp="denoiser.log",
                 low_memory=False,
                 verbose=False,
                 error_profile=DENOISER_DATA_DIR + 'FLX_error_profile.dat',
                 max_num_rounds=None,
                 titanium=False,
                 checkpoint_fp=None):
    """The main routine to denoise flowgrams"""

    # abort if binary is missing
    check_flowgram_ali_exe()

    if verbose:
        # switch off buffering for the log file
        log_fh = open(tmpoutdir + "/" + log_fp, "w", 0)
    else:
        log_fh = None

    # overwrite settings if titanium is set
    # This flag is only used from qiime. Remove after qiime integration
    if titanium:
        error_profile = DENOISER_DATA_DIR + "Titanium_error_profile.dat"
        low_cutoff = 4
        high_cutoff = 5

    if verbose:
        log_fh.write("Denoiser version: %s\n" % __version__)
        log_fh.write("SFF files: %s\n" % ', '.join(sff_fps))
        log_fh.write("Fasta file: %s\n" % fasta_fp)
        log_fh.write("Preprocess dir: %s\n" % preprocess_fp)
        if checkpoint_fp:
            log_fh.write("Resuming denoiser from %s\n" % checkpoint_fp)
        log_fh.write("Primer sequence: %s\n" % primer)
        log_fh.write("Running on cluster: %s\n" % cluster)
        log_fh.write("Num CPUs: %d\n" % num_cpus)
        log_fh.write("Squeeze Seqs: %s\n" % squeeze)
        log_fh.write("tmpdir: %s\n" % tmpoutdir)
        log_fh.write("percent_id threshold: %.2f\n" % percent_id)
        log_fh.write("Minimal sequence coverage for first phase: %d\n" % bail)
        log_fh.write("Low cut-off: %.2f\n" % low_cutoff)
        log_fh.write("High cut-off: %.2f\n" % high_cutoff)
        log_fh.write("Error profile: %s\n" % error_profile)
        log_fh.write("Maximal number of iteration: %s\n\n" % max_num_rounds)

    # here we go ...
    # Phase I - clean up and truncate input sff
    if (checkpoint_fp):
        if (preprocess_fp):
            # resuming from a checkpoint: reuse the existing preprocessed data
            (deprefixed_sff_fp, l, mapping,
             seqs) = read_preprocessed_data(preprocess_fp)
        else:
            raise ApplicationError(
                "Resuming from checkpoint requires --preprocess option")

    else:
        if (preprocess_fp):
            # we already have preprocessed data, so use it
            (deprefixed_sff_fp, l, mapping,
             seqs) = read_preprocessed_data(preprocess_fp)
        elif (cluster):
            preprocess_on_cluster(sff_fps,
                                  log_fp,
                                  fasta_fp=fasta_fp,
                                  out_fp=tmpoutdir,
                                  verbose=verbose,
                                  squeeze=squeeze,
                                  primer=primer)
            (deprefixed_sff_fp, l, mapping,
             seqs) = read_preprocessed_data(tmpoutdir)
        else:
            (deprefixed_sff_fp, l, mapping, seqs) = \
                preprocess(
                    sff_fps, log_fh, fasta_fp=fasta_fp, out_fp=tmpoutdir,
                    verbose=verbose, squeeze=squeeze, primer=primer)

        # the preprocessor writes to the same log file, so reopen it in append mode
        if verbose:
            log_fh.close()
            log_fh = open(tmpoutdir + "/" + log_fp, "a", 0)

    # phase II:
    # use prefix map based clustering as initial centroids and greedily
    # add flowgrams to clusters with a low threshold

    (new_sff_file, bestscores, mapping) = \
        greedy_clustering(deprefixed_sff_fp, seqs, mapping, tmpoutdir, l,
                          log_fh, num_cpus=num_cpus, on_cluster=cluster,
                          bail_out=bail, pair_id_thresh=percent_id,
                          threshold=low_cutoff, verbose=verbose,
                          fast_method=not low_memory,
                          error_profile=error_profile,
                          max_num_rounds=max_num_rounds,
                          checkpoint_fp=checkpoint_fp)

    # phase III:
    # Assign seqs to nearest existing centroid with high threshold
    secondary_clustering(new_sff_file,
                         mapping,
                         bestscores,
                         log_fh,
                         verbose=verbose,
                         threshold=high_cutoff)
    remove(new_sff_file)
    if (verbose):
        log_fh.write("Finished clustering\n")
        log_fh.write("Writing Clusters\n")
        log_fh.write(make_stats(mapping) + "\n")
    store_clusters(mapping, deprefixed_sff_fp, tmpoutdir)
    store_mapping(mapping, tmpoutdir, "denoiser")
Example #10
def get_flowgram_distances_on_cluster(id,
                                      flowgram,
                                      flowgrams,
                                      fc,
                                      ids,
                                      num_cores,
                                      num_flows,
                                      spread,
                                      client_sockets=[]):
    """Computes distance scores of flowgram to all flowgrams in parser.

    id: The flowgram identifier, also used to name intermediate files

    flowgram: This flowgram is used to filter all the other flowgrams

    flowgrams: iterable filehandle of flowgram file

    fc: a sink of flowgrams, which serves as source in the next round

    ids: list of flowgram ids that should be used from flowgrams

    num_cores: number of cpus

    num_flows: Number of flows in parser

    client_sockets: A list of open sockets for client-server communication

    spread: historical distribution of processing runtimes

    """
    epoch = time()

    check_flowgram_ali_exe()

    qiime_config = load_qiime_config()
    min_per_core = int(qiime_config['denoiser_min_per_core'])
    # if using from future import division this has to be checked,
    # as we want true integer division here

    per_core = max(min_per_core, (num_flows / num_cores) + 1)
    names = []
    scores = []

    # Need to call this here, since we iterate over the same iterator repeatedly.
    # Otherwise the call in ifilter will reset the iterator by implicitly calling __iter__.
    # test if iter does the same
    flowgrams_iter = flowgrams.__iter__()
    # prepare input files and commands
    # synchronous client-server communication

    workload = compute_workload(num_cores, num_flows, spread)

    debug_count = 0
    for i in range(num_cores):
        socket = client_sockets[i]
        # send master flowgram to file first
        send_flowgram_to_socket(id, flowgram, socket)

        if (workload[i] < 1):
            # no data left for this poor guy
            save_send(socket, "--END--")
            continue
        else:
            # Then add all others which are still valid, i.e. in ids
            for (k,
                 f) in (izip(range(workload[i]),
                             ifilter(lambda f: f.Name in ids,
                                     flowgrams_iter))):
                fc.add(f)
                send_flowgram_to_socket(k, f, socket, trim=False)
                names.append(f.Name)
                debug_count += 1
            # send the termination signal
            save_send(socket, "--END--")

    # asynchronous client-server communication
    # ClientHandlers write data in results
    results = [None] * num_cores
    timing = [0.0 for x in xrange(num_cores)]
    for i in range(num_cores):
        socket = client_sockets[i]
        ClientHandler(socket, i, results, timing)
    loop()
    # end asynchronous loop

    spread = adjust_processing_time(num_cores, workload, timing, epoch)

    # flatten list
    scores = [item for sublist in results for item in sublist]

    if (debug_count != len(scores)):
        raise RuntimeError(
            "Something bad has happened! I received fewer " +
            "alignment scores (%d) than there are flowgrams (%d). Most likely this "
            % (len(scores), debug_count) +
            "means that the alignment program is not set up correctly or is corrupted. " +
            "Please run the test scripts to figure out the cause of the error."
        )

    return (scores, names, fc)