Example #1
from itertools import ifilter, izip
from time import time

# The QIIME helpers used below (check_flowgram_ali_exe, load_qiime_config,
# compute_workload, send_flowgram_to_socket, save_send, ClientHandler, loop,
# adjust_processing_time) are imported at the module level and are not shown here.

def get_flowgram_distances_on_cluster(
        id, flowgram, flowgrams, fc, ids, num_cores,
        num_flows, spread, client_sockets=[]):
    """Computes distance scores of flowgram to all flowgrams in parser.

    id: The flowgram identifier, also used to name intermediate files

    flowgram: This flowgram is used to filter all the other flowgrams

    flowgrams: iterable filehandle of flowgram file

    fc: a sink of flowgrams, which serves as source in the next round

    ids: list of flowgram ids that should be used from flowgrams

    num_cores: number of cpus

    num_flows: Number of flows in parser

    spread: historical distribution of processing runtimes

    client_sockets: A list of open sockets for client-server communication

    """
    epoch = time()

    check_flowgram_ali_exe()

    qiime_config = load_qiime_config()
    min_per_core = int(qiime_config['denoiser_min_per_core'])
    # Use floor division explicitly: we want an integer batch size here even
    # if true division (from __future__ import division) is in effect.
    per_core = max(min_per_core, (num_flows // num_cores) + 1)
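    # per_core is not used further in this function; the actual per-core split
    # is taken from compute_workload() below.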
    names = []
    scores = []

    # Grab the iterator once here, since we draw from it repeatedly below.
    # Passing the parser itself to ifilter each round would restart iteration
    # by implicitly calling __iter__ on it.
    flowgrams_iter = iter(flowgrams)
    # prepare input files and commands
    # synchronous client-server communication

    workload = compute_workload(num_cores, num_flows, spread)
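    # workload[i] is the number of flowgrams to hand to core i, presumably
    # weighted by the historical runtime spread so that the batches finish at
    # roughly the same time.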

    debug_count = 0
    for i in range(num_cores):
        socket = client_sockets[i]
        # send master flowgram to file first
        send_flowgram_to_socket(id, flowgram, socket)
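        # Every flowgram sent to this worker below will be scored against this
        # master flowgram.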

        if workload[i] < 1:
            # no data left for this poor guy
            save_send(socket, "--END--")
            continue
        else:
            # Then add all others which are still valid, i.e. in ids
            for (k, f) in (izip(range(workload[i]),
                                ifilter(lambda f: f.Name in ids, flowgrams_iter))):
                fc.add(f)
                send_flowgram_to_socket(k, f, socket, trim=False)
                names.append(f.Name)
                debug_count += 1
            # send the termination signal
            save_send(socket, "--END--")

    # asynchronous client-server communication
    # ClientHandlers write data in results
    results = [None] * num_cores
    timing = [0.0 for x in xrange(num_cores)]
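    # One ClientHandler is attached to each worker socket; each handler stores
    # the worker's score list in results[i] and its elapsed time in timing[i].
    # loop() drives the asynchronous exchange and returns once every handler
    # has finished.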
    for i in range(num_cores):
        socket = client_sockets[i]
        ClientHandler(socket, i, results, timing)
    loop()
    # end asynchronous loop

    spread = adjust_processing_time(num_cores, workload, timing, epoch)

    # flatten the per-core result lists into a single score list
    scores = [score for result in results for score in result]

    if debug_count != len(scores):
        raise RuntimeError(
            "Something bad has happened! I received fewer alignment scores "
            "(%d) than there are flowgrams (%d). Most likely this means that "
            "the alignment program is not set up correctly or is corrupted. "
            "Please run the test scripts to figure out the cause of the error."
            % (len(scores), debug_count))

    return (scores, names, fc)
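
For context, here is a rough calling sketch. It assumes the parallel alignment
workers are already running and that client_sockets holds one open socket per
core. The names start_denoiser_workers, parse_flowgram_file, FlowgramCollection,
active_flowgram_ids and cluster_centroid are illustrative placeholders, not the
actual QIIME denoiser API.

# Hypothetical usage sketch; the helper names below are placeholders.
num_cores = 4
client_sockets = start_denoiser_workers(num_cores)        # placeholder: one open socket per worker

flowgrams, num_flows = parse_flowgram_file("reads.sff.txt")   # placeholder flowgram parser
fc = FlowgramCollection()                                     # placeholder sink for the next round
ids = active_flowgram_ids                                     # flowgram ids still in play this round
spread = [1.0] * num_cores                                    # uniform initial runtime estimate

master = cluster_centroid                                     # chosen by the surrounding greedy loop
scores, names, fc = get_flowgram_distances_on_cluster(
    master.Name, master, flowgrams, fc, ids, num_cores,
    num_flows, spread, client_sockets=client_sockets)

# One score per flowgram that was sent out; names lists their ids in send order.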