def test_container(self):
    """FlowgramContainerFile works as expected"""
    collection = FlowgramCollection(
        {"a": "1.0 0.0 0.0 1.0 1.0 1.2 1.2 0.8",
         "b": "1.2 1.0 0.0 0.8 1.2 2.4 1.0 0.0"})

    container = FlowgramContainerFile(header)
    for flowgram in collection:
        container.add(flowgram)

    # round-tripping through the container preserves each flowgram
    for observed, expected in zip(container, collection):
        self.assertEqual(str(observed), str(expected))

    # adding after iteration has started raises an error
    self.assertRaises(ValueError, container.add, observed)
# NOTE(review): this method is a byte-for-byte duplicate of the preceding
# test_container definition; within one class body the later definition
# shadows the earlier one, so only this copy actually runs. One of the two
# should be removed (or renamed if both variants are intentional).
def test_container(self):
    """FlowgramContainerFile works as expected"""
    # `header` is a module-level fixture — presumably a valid sff.txt
    # header; verify against the test module's setup.
    fc = FlowgramCollection({'a':'1.0 0.0 0.0 1.0 1.0 1.2 1.2 0.8',
                             'b':'1.2 1.0 0.0 0.8 1.2 2.4 1.0 0.0'})
    f_container = FlowgramContainerFile(header)
    for f in fc:
        f_container.add(f)
    # container round-trip must preserve flowgram string representation
    for f_obs, f_exp in zip(f_container,fc):
        self.assertEqual(str(f_obs), str(f_exp))
    # adding after iter started raises error
    self.assertRaises(ValueError, f_container.add,f_obs)
def filter_with_flowgram(id, flowgram, flowgrams, header, ids, num_flows,
                         bestscores, log_fh, outdir="/tmp/", threshold=3.75,
                         num_cpus=32, fast_method=True, on_cluster=False,
                         mapping=None, spread=None, verbose=False,
                         pair_id_thresh=0.97, client_sockets=None,
                         error_profile=DENOISER_DATA_DIR +
                         'FLX_error_profile.dat'):
    """Filter all files in flows_filename with flowgram and split according to
    threshold.

    id: The flowgram identifier of the master flowgram of this round

    flowgram: This flowgram is used to filter all the other flowgrams

    flowgrams: iterator containing the flowgrams to be filtered

    header: a valid sff.txt header

    ids: this dict marks the active flowgrams, i.e. flowgrams that are
         unclustered

    num_flows: Number of flows remaining in the current round

    bestscores: dictionary that stores, for each unclustered flowgram, the
                best score it has to one of the centroids previously seen
                and the id of that centroid. Used in the second denoising
                phase.

    outdir: directory where intermediate and result files go

    threshold: Filtering threshold

    num_cpus: number of cpus to run on, if on_cluster == True

    fast_method: Boolean value for fast denoising with lots of memory

    on_cluster: Boolean flag for local vs cluster

    mapping: the current cluster mapping

    spread: worker processing throughput (default: fresh empty list)

    error_profile: Path to error profile *.dat file

    Implementation detail:
    The iterator behind 'flowgrams' is big and thus we want to keep its
    traversals at a minimum. The naive implementation of this filter function
    would traverse the iterator once to create the input file for the
    alignment routine, then a second time to do the actual filtering. To get
    rid of the second run through the iterator, we keep a list (in fact a
    dict) of active 'ids' and do the filtering only in the next round. A
    cleaner but still fast solution would be great, as this definitely poses
    a pitfall for future modifications.

    Returns filename of file containing all non-filtered flows and the
    number of flows.
    """
    # Avoid shared mutable default arguments: the original defaults of []
    # would be reused (and mutated) across calls.
    if spread is None:
        spread = []
    if client_sockets is None:
        client_sockets = []

    if verbose:
        log_fh.write("Filtering with %s: %d flowgrams\n" % (id, num_flows))

    # set up the flowgram storage
    if not fast_method:
        fc = FlowgramContainerFile(header, outdir)
    else:
        fc = FlowgramContainerArray()

    # calculate distance scores
    if on_cluster:
        (scores, names, flowgrams) = get_flowgram_distances_on_cluster(
            id, flowgram, flowgrams, fc, ids, num_cpus,
            num_flows, spread=spread, client_sockets=client_sockets)
    else:
        (scores, names, flowgrams) = get_flowgram_distances(
            id, flowgram, flowgrams, fc, ids, outdir=outdir,
            error_profile=error_profile)

    # Shortcut for non-matching flowgrams.
    # BUGFIX: the original used filter(), which on Python 3 returns an
    # iterator, so the subsequent len() call raised TypeError. A list
    # comprehension materializes the survivors so len() works.
    survivors = [(score, pair_id) for (score, pair_id) in scores
                 if score < threshold or pair_id >= pair_id_thresh]
    if len(survivors) == 0:
        # put it in its own cluster
        # and remove it from any further searches
        if id in bestscores:
            del bestscores[id]
        del ids[id]
        return (flowgrams, num_flows - 1)

    # Do the filtering
    non_clustered_ctr = 0
    for ((score, pair_id), name) in zip(scores, names):
        if score < threshold or name == id or pair_id >= pair_id_thresh:
            # make sure the original flowgram gets into this cluster
            del ids[name]
            if name in bestscores:
                del bestscores[name]
            if id != name:
                # update the mapping information
                mapping[id].extend(mapping[name])
                mapping[id].append(name)
                # delete the old cluster from the mapping
                del mapping[name]
        else:
            non_clustered_ctr += 1
            # keep track of the best match of this guy to any centroid
            if name not in bestscores or score < bestscores[name][1]:
                bestscores[name] = (id, score)

    # Some extra safety that we are not missing anything
    if (len(ids) != non_clustered_ctr or
            len(bestscores) != non_clustered_ctr):
        raise ApplicationError("filterWithFlowgram failed")

    return (flowgrams, non_clustered_ctr)