Example #1
0
    def test_container(self):
        """FlowgramContainerFile works as expected"""

        flows = {"a": "1.0 0.0 0.0 1.0 1.0 1.2 1.2 0.8",
                 "b": "1.2 1.0 0.0 0.8 1.2 2.4 1.0 0.0"}
        collection = FlowgramCollection(flows)

        container = FlowgramContainerFile(header)
        for flowgram in collection:
            container.add(flowgram)

        # each stored flowgram must read back identical to its source
        for observed, expected in zip(container, collection):
            self.assertEqual(str(observed), str(expected))

        # adding once iteration has begun must raise an error
        self.assertRaises(ValueError, container.add, observed)
Example #2
0
   def test_container(self):
      """FlowgramContainerFile works as expected"""

      fc = FlowgramCollection(
         {'a': '1.0 0.0 0.0 1.0 1.0 1.2 1.2 0.8',
          'b': '1.2 1.0 0.0 0.8 1.2 2.4 1.0 0.0'})

      f_container = FlowgramContainerFile(header)
      for flow in fc:
         f_container.add(flow)

      # stored flowgrams round-trip unchanged through the container
      for obs, exp in zip(f_container, fc):
         self.assertEqual(str(obs), str(exp))

      # adding after iteration has started must raise an error
      self.assertRaises(ValueError, f_container.add, obs)
Example #3
0
def filter_with_flowgram(id,
                         flowgram,
                         flowgrams,
                         header,
                         ids,
                         num_flows,
                         bestscores,
                         log_fh,
                         outdir="/tmp/",
                         threshold=3.75,
                         num_cpus=32,
                         fast_method=True,
                         on_cluster=False,
                         mapping=None,
                         spread=None,
                         verbose=False,
                         pair_id_thresh=0.97,
                         client_sockets=None,
                         error_profile=DENOISER_DATA_DIR +
                         'FLX_error_profile.dat'):
    """Filter all files in flows_filename with flowgram and split according to threshold.

    id: The flowgram identifier of the master flowgram of this round

    flowgram: This flowgram is used to filter all the other flowgrams

    flowgrams: iterator containing the flowgrams to be filtered

    header: a valid sff.txt header

    ids: this list marks the active flowgrams, i.e. flowgrams that are unclustered

    num_flows: Number of flows remaining in the current round

    bestscores: dictionary that stores for each unclustered flowgram the best
                score it has to one of the centroids previously seen
                and the id of the centroid. Used in the second denoising phase.

    log_fh: open file handle used for progress messages when verbose is True

    outdir: directory where intermediate and result files go

    threshold: Filtering threshold

    num_cpus: number of cpus to run on, if on_cluster == True

    fast_method: Boolean value for fast denoising with lots of memory

    on_cluster: Boolean flag for local vs cluster

    mapping: the current cluster mapping

    spread: worker processing throughput (defaults to an empty list)

    verbose: if True, write progress information to log_fh

    pair_id_thresh: pairwise sequence identity threshold for clustering

    client_sockets: sockets of cluster workers (defaults to an empty list)

    error_profile: Path to error profile *.dat file


    Implementation detail:
    The iterator behind 'flowgrams' is big and thus we want to keep its traversals
    at a minimum. The naive implementation of this filter function would traverse the
    iterator once to create the input file for the alignment routine, then a second
    time to do the actual filtering. To get rid of the second run through the iterator,
    we keep a list (in fact a dict) of active 'ids' and do the filtering only in the next
    round. A cleaner but still fast solution would be great, as this definitly poses a
    pitfall for future modifications.

    Returns filename of file containing all non-filtered flows and the number of flows
    """
    # Avoid the shared-mutable-default pitfall: a list default is created once
    # at function definition time and silently shared across all calls.
    if spread is None:
        spread = []
    if client_sockets is None:
        client_sockets = []

    if verbose:
        log_fh.write("Filtering with %s: %d flowgrams\n" % (id, num_flows))

    # set up the flowgram storage: file-backed (low memory) or in-memory (fast)
    if not fast_method:
        fc = FlowgramContainerFile(header, outdir)
    else:
        fc = FlowgramContainerArray()

    # calculate distance scores
    if on_cluster:
        (scores, names, flowgrams) =\
            get_flowgram_distances_on_cluster(
                id, flowgram, flowgrams, fc, ids, num_cpus,
                num_flows, spread=spread, client_sockets=client_sockets)
    else:
        (scores, names, flowgrams) =\
            get_flowgram_distances(
                id, flowgram, flowgrams, fc, ids, outdir=outdir,
                error_profile=error_profile)

    # Shortcut for non-matching flowgrams.
    # NOTE: a list comprehension, not filter() — on Python 3 filter() returns
    # a lazy iterator, which would make the emptiness test below raise.
    survivors = [(score, pair_id) for (score, pair_id) in scores
                 if score < threshold or pair_id >= pair_id_thresh]
    if not survivors:
        # nothing clusters with this centroid: put it in its own cluster
        # and remove it from any further searches
        if id in bestscores:
            del bestscores[id]
        del ids[id]
        return (flowgrams, num_flows - 1)

    # Do the filtering
    non_clustered_ctr = 0
    for ((score, pair_id), name) in zip(scores, names):
        if score < threshold or name == id or pair_id >= pair_id_thresh:
            # make sure the original flowgram gets into this cluster
            del ids[name]
            if name in bestscores:
                del bestscores[name]
            if id != name:
                # update the mapping information
                mapping[id].extend(mapping[name])
                mapping[id].append(name)
                # delete the old cluster from the mapping
                del mapping[name]
        else:
            non_clustered_ctr += 1
            # keep track of the best match of this guy to any centroid
            # (bestscores holds (centroid_id, score) tuples)
            if name not in bestscores or score < bestscores[name][1]:
                bestscores[name] = (id, score)

    # Some extra safety that we are not missing anything
    if len(ids) != non_clustered_ctr or len(bestscores) != non_clustered_ctr:
        raise ApplicationError("filterWithFlowgram failed")

    return (flowgrams, non_clustered_ctr)