import gc
from itertools import repeat
from multiprocessing import Pool

# project-local helper modules; `args` and `output_dir` are assumed to be
# module-level globals set elsewhere in this file
import IO_utils
import Plot_utils


def get_kmer_index(barcodes_unzipped):
    """
    Args:
        barcodes_unzipped (str): filename for unzipped barcodes fq

    Returns:
        kmer_idx (dict): map of kmer to list of line offsets for reads
            that contain that kmer
        kmer_counts (dict): map of kmer to absolute counts

    This method returns a kmer index and counts dict for a random subset of
    the dataset. The subset aims to be the minimal number of reads whose
    kmer spectrum is representative of the full dataset.

    General approach:
        initialize:
            get a random chunk of reads based on line offsets
            compute kmer counts
        loop:
            get a new chunk of reads and combine with previous chunks
            compute kmer counts for the new chunk
            compare kmer counts with the previous iteration
        terminate when:
            pearsonr >= some cutoff value
    """
    PEARSONR_CUTOFF = 0.999
    MIN_ITERS = 10
    BUFFER_SIZE = 10000
    length = args['barcode_end'] - args['barcode_start']
    pool = Pool(processes=args['threads'])

    read_count = 0
    kmer_idx = {}
    new_kmer_counts = {}  # initialized in case the file yields no chunks
    counts_corr_coefs = []
    num_reads = []

    bc_file = open(barcodes_unzipped, 'rb')
    read_chunks_iter = IO_utils.get_read_chunks(
        bc_file, random=True, BUFFER_SIZE=BUFFER_SIZE)
    chunk_num = 0
    while True:
        try:
            reads_chunk = next(read_chunks_iter)
            chunk_num += 1
        except StopIteration:
            break
        read_count += len(reads_chunk)
        num_reads.append(read_count)

        # chunk_kmer_indices is a list of dicts, one per read
        chunk_kmer_indices = pool.map(index_read, reads_chunk)

        # kmer counts before updating with this chunk's indices
        old_kmer_counts = get_kmer_counts(kmer_idx)
        for element in chunk_kmer_indices:
            for (key, read_offsets) in element.items():
                # read_offsets: [offset1, offset2, offset3 ...]
                if key not in kmer_idx:
                    kmer_idx[key] = []
                kmer_idx[key] += read_offsets
        del chunk_kmer_indices
        _ = gc.collect()
        new_kmer_counts = get_kmer_counts(kmer_idx)

        # check kmer count correlation against the previous iteration
        counts_corr_coef = get_kmer_count_correlation(
            old_kmer_counts, new_kmer_counts)
        counts_corr_coefs.append(counts_corr_coef)
        print('\t%i reads indexed. Running pearsonr is %f' %
              (read_count, counts_corr_coef))
        if (len(counts_corr_coefs) >= MIN_ITERS) and \
                (counts_corr_coef > PEARSONR_CUTOFF):
            break

    bc_file.close()
    pool.close()
    return (
        kmer_idx,
        new_kmer_counts,
        Plot_utils.plot_kmer_subsamp_pearson(
            output_dir, counts_corr_coefs, num_reads))
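
# `get_kmer_counts` and `get_kmer_count_correlation` are called above but
# not defined in this section. Below is a minimal sketch of both under the
# assumptions implied by their call sites: kmer_idx maps each kmer to a
# list of read offsets, and the running correlation is the pearsonr of the
# counts over kmers present in both iterations. Illustrative only, not
# necessarily the project's actual implementations.

from scipy.stats import pearsonr


def get_kmer_counts(kmer_idx):
    # collapse the index (kmer -> list of read offsets) to absolute counts
    return {kmer: len(offsets) for (kmer, offsets) in kmer_idx.items()}


def get_kmer_count_correlation(old_kmer_counts, new_kmer_counts):
    # correlate counts over kmers present in both dicts; on the first
    # iteration old_kmer_counts is empty, so fall back to 0.0 (assumption)
    shared = [kmer for kmer in old_kmer_counts if kmer in new_kmer_counts]
    if len(shared) < 2:
        return 0.0
    old = [old_kmer_counts[kmer] for kmer in shared]
    new = [new_kmer_counts[kmer] for kmer in shared]
    corr_coef, _pval = pearsonr(old, new)
    return corr_coef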
def assign_all_reads(params):
    (consensus_bcs, reads_unzipped, barcodes_unzipped) = params
    BUFFER_SIZE = 100000
    MAX_KMER_SIZE = args['barcode_end'] - args['barcode_start']
    MIN_KMER_SIZE = 6

    reads_assigned_db, reads_assigned_pipe = \
        IO_utils.initialize_redis_pipeline()
    pool = Pool(processes=args['threads'])

    if args['split_levenshtein']:
        print('\tAssigning reads to consensus barcodes using '
              'Levenshtein distance')
    else:
        print('\tAssigning reads to consensus barcodes using '
              'kmer compatibility')
    kmer_map = map_kmers_to_bcs(consensus_bcs, MIN_KMER_SIZE, MAX_KMER_SIZE)

    read_count = 0
    num_unassigned = 0
    reads_f = open(reads_unzipped, 'rb')
    barcodes_f = open(barcodes_unzipped, 'rb')

    encode = lambda i: str(i).encode('utf-8')
    encode_tup = lambda i, j: encode(i) + b',' + encode(j)

    for reads_chunk, barcodes_chunk in zip(
            IO_utils.get_read_chunks(
                reads_f, random=False, BUFFER_SIZE=BUFFER_SIZE),
            IO_utils.get_read_chunks(
                barcodes_f, random=False, BUFFER_SIZE=BUFFER_SIZE)):
        read_count += len(reads_chunk)
        if args['split_levenshtein']:
            assignments = pool.map(assign_read_levenshtein, zip(
                repeat(args),
                repeat(consensus_bcs),
                reads_chunk,
                barcodes_chunk))
        else:
            assignments = pool.map(assign_read_kmers, zip(
                repeat(kmer_map),
                repeat(MIN_KMER_SIZE),
                repeat(MAX_KMER_SIZE),
                reads_chunk,
                barcodes_chunk))
        for (assignment, offset1, offset2) in assignments:
            if assignment == 'unassigned':
                num_unassigned += 1
            # queue the (read offset, barcode offset) pair under the
            # assigned barcode's key in redis
            reads_assigned_pipe.rpush(
                assignment.encode('utf-8'), encode_tup(offset1, offset2))
        reads_assigned_pipe.execute()
        print('\tProcessed %i reads' % read_count)

    reads_f.close()
    barcodes_f.close()
    pool.close()
    print('\t%i reads could not be assigned' % num_unassigned)
    return reads_assigned_db, reads_assigned_pipe
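
# `map_kmers_to_bcs` is called above but not shown in this section. A
# minimal sketch under the assumptions implied by its arguments: index
# every kmer of length MIN_KMER_SIZE..MAX_KMER_SIZE drawn from each
# consensus barcode, so that assign_read_kmers can look up which barcodes
# are compatible with the kmers observed in a read. Illustrative only.

from collections import defaultdict


def map_kmers_to_bcs(consensus_bcs, min_kmer_size, max_kmer_size):
    kmer_map = defaultdict(set)
    for bc in consensus_bcs:
        for k in range(min_kmer_size, max_kmer_size + 1):
            for start in range(len(bc) - k + 1):
                # record that this barcode contains this kmer
                kmer_map[bc[start:start + k]].add(bc)
    return kmer_map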
def assign_all_reads(params):
    """
    Alternative implementation of assign_all_reads that buffers read
    assignments in memory and periodically spills them to pickle files
    instead of pushing them to a redis pipeline.
    """
    (consensus_bcs, reads_unzipped, barcodes_unzipped) = params
    BUFFER_SIZE = 10000
    PICKLE_SIZE = 1000000
    MAX_KMER_SIZE = args['barcode_end'] - args['barcode_start']
    MIN_KMER_SIZE = 7

    pool = Pool(processes=args['threads'])
    print('\tMapping kmers to consensus barcodes')
    kmer_map = map_kmers_to_bcs(consensus_bcs, MIN_KMER_SIZE, MAX_KMER_SIZE)
    reads_assigned = initialize_reads_assigned(consensus_bcs)

    print('\tAssigning reads to consensus barcodes')
    read_count = 0
    num_unassigned = 0
    reads_f = open(reads_unzipped, 'rb')
    barcodes_f = open(barcodes_unzipped, 'rb')
    pickle_files = []

    for reads_chunk, barcodes_chunk in zip(
            IO_utils.get_read_chunks(
                reads_f, random=False, BUFFER_SIZE=BUFFER_SIZE),
            IO_utils.get_read_chunks(
                barcodes_f, random=False, BUFFER_SIZE=BUFFER_SIZE)):
        read_count += len(reads_chunk)
        if not args['split_levenshtein']:
            assignments = pool.map(assign_read_kmers, zip(
                repeat(kmer_map),
                repeat(MIN_KMER_SIZE),
                repeat(MAX_KMER_SIZE),
                reads_chunk,
                barcodes_chunk))
        else:
            # this is a pipeline for reviewer experiments only;
            # it works quite poorly, see simulation results
            assignments = pool.map(assign_read_levenshtein, zip(
                repeat(consensus_bcs),
                reads_chunk,
                barcodes_chunk))
        for (assignment, offset1, offset2) in assignments:
            if assignment == 'unassigned':
                num_unassigned += 1
            reads_assigned[assignment].append((offset1, offset2))
        print('\tProcessed %i reads' % read_count)

        # spill read assignments to a pickle file every PICKLE_SIZE reads;
        # this only fires on exact multiples, which holds as long as every
        # non-final chunk contains exactly BUFFER_SIZE reads
        if read_count % PICKLE_SIZE == 0:
            pickle_files.append(IO_utils.write_to_pickle(reads_assigned))
            reads_assigned = initialize_reads_assigned(consensus_bcs)

    # dump whatever remains after the final (possibly partial) chunk
    pickle_files.append(IO_utils.write_to_pickle(reads_assigned))
    reads_f.close()
    barcodes_f.close()
    pool.close()
    print('\t%i reads could not be assigned' % num_unassigned)
    return pickle_files
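
# `initialize_reads_assigned` is called above but not defined in this
# section. A minimal sketch consistent with its usage: one empty list of
# (read_offset, barcode_offset) tuples per consensus barcode, plus an
# 'unassigned' bucket, since the loop above also appends to
# reads_assigned['unassigned']. Illustrative only.

def initialize_reads_assigned(consensus_bcs):
    reads_assigned = {bc: [] for bc in consensus_bcs}
    reads_assigned['unassigned'] = []
    return reads_assigned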