def build_kmer_distribution(datafile, kmer_patterns, sampling_proportion, num_processes, builddir, reverse_complement, pattern_window_length, input_driver_config):
    """Build and save the k-mer distribution for one input file, or reuse it.

    If the save file for ``datafile`` already exists it is loaded and
    summarised, and nothing is rebuilt. Otherwise a ``Distribution`` is
    configured, built with a process pool and saved.

    Args:
        datafile: Path of the input file to process.
        kmer_patterns: List of patterns appended to the extra arguments of
            the weight-value provider (must be a list, as it is
            concatenated onto one).
        sampling_proportion: Passed to the file-to-stream function as an
            extra argument.
        num_processes: Passed to the ``Distribution`` constructor and used
            as the ``proc_pool_size`` for ``build``.
        builddir: Passed to ``get_save_filename`` to locate the save file.
        reverse_complement: Passed to the weight-value provider as an extra
            argument.
        pattern_window_length: Passed to the weight-value provider as an
            extra argument.
        input_driver_config: Only used for ``.cnt`` inputs, where it is
            passed to the tag-count stream function as an extra argument.

    Returns:
        The save filename, whether the distribution was built or reused.
    """
    # Computed once and reused below.
    # NOTE(review): assumes get_save_filename is a pure path computation.
    save_filename = get_save_filename(datafile, builddir)

    if os.path.exists(save_filename):
        print("build_kmer_distribution- skipping %s as already done" % datafile)
        distob = Distribution.load(save_filename)
        distob.summary()
        return save_filename

    filetype = get_file_type(datafile)

    distob = Distribution([datafile], num_processes)
    distob.interval_locator_parameters = (None,)
    distob.interval_locator_funcs = (bin_discrete_value,)
    distob.assignments_files = ("kmer_binning.txt",)

    # Default configuration: treat the input as a sequence file.
    distob.file_to_stream_func = seq_from_sequence_file
    distob.file_to_stream_func_xargs = [filetype, sampling_proportion]
    distob.weight_value_provider_func = kmer_count_from_sequence
    # NOTE(review): the meaning of the literal 1 is defined by the weight
    # provider, which is not visible here - confirm before changing it.
    distob.weight_value_provider_func_xargs = [reverse_complement, pattern_window_length, 1] + kmer_patterns

    if filetype == ".cnt":
        # Count files replace the stream and weight functions; the
        # weight-provider extra arguments set above are left unchanged.
        print("DEBUG setting methods for count file")
        distob.file_to_stream_func = tag_count_from_tag_count_file
        distob.file_to_stream_func_xargs = [input_driver_config, sampling_proportion]
        distob.weight_value_provider_func = kmer_count_from_tag_count

    # A single-threaded alternative left by the original author, presumably
    # for debugging: build(distob, use="singlethread")
    distdata = build(distob, proc_pool_size=num_processes)
    distob.save(save_filename)

    print(
        "Distribution %s has %d points distributed over %d intervals, stored in %d parts"
        % (save_filename, distob.point_weight, len(distdata), len(distob.part_dict))
    )

    return save_filename
def build_tax_distribution(datafile):
    """Build the distribution for ``datafile`` and pickle it beside the input.

    A one-process ``Distribution`` is configured with the top-hit provider
    as its stream function and two discrete-value binning functions. It is
    built in "singlethread" mode and saved to ``"<datafile>.pickle"``.

    Args:
        datafile: Path of the input file to process.

    Returns:
        Whatever ``build`` returns for the configured distribution.
    """
    tax_distribution = Distribution([datafile], 1)

    tax_distribution.file_to_stream_func = my_top_hit_provider
    # i.e. pick out first field, then kingdom, comnames
    tax_distribution.file_to_stream_func_xargs = [0, 7, 6]
    tax_distribution.interval_locator_funcs = [
        bin_discrete_value,
        bin_discrete_value,
    ]

    built_data = build(tax_distribution, "singlethread")

    pickle_name = "%s.pickle" % datafile
    tax_distribution.save(pickle_name)

    return built_data