def align_and_pickle_mappings():
    """Align every paired read of the configured dataset and pickle the result.

    Loads the reference-hash pickle, the reference and the reads for
    ``c.DATASET``, runs ``align_paired_read`` on each read pair, drops the
    falsy results, and pickles the remaining alignments to
    ``alignments_<DATASET>_<KEY_SIZE>.pkl`` (a relative path). Progress and
    timing messages are printed to stdout.

    Single-argument ``print(...)`` calls are used so the messages are emitted
    identically under Python 2 and Python 3.
    """
    print('loading ref_hash pickle...')
    pickle_filename = 'ref_hash_{}_{}.pkl'.format(c.DATASET, c.KEY_SIZE)
    ref_hash = load_hash_pickle(pickle_filename)

    print('loading ref...')
    ref_file = c.REF_FILE
    ref = utils.read_reference('{}/{}/{}'.format(c.DATA_PATH, c.DATASET, ref_file))

    print('loading reads...')
    reads_file = c.READS_FILE
    reads = utils.read_reads('{}/{}/{}'.format(c.DATA_PATH, c.DATASET, reads_file))
    print('loaded reads')

    print('aligning {}...'.format(c.DATASET))
    start = time.time()
    alignments = [align_paired_read(pr, ref, ref_hash) for pr in reads]
    # Keep only the truthy alignment results.
    alignments = [x for x in alignments if x]
    print('alignment complete, elapsed: {}'.format(time.time() - start))

    directory = '{}/{}/{}'.format(c.DATA_PATH, c.DATASET, 'alignments/')
    if not os.path.exists(directory):
        print('creating folder {}'.format(directory))
        os.makedirs(directory)

    start = time.time()
    # NOTE(review): the pickle path below does not include `directory`, so the
    # file is not written into the folder created above -- confirm whether
    # that is intended before changing the output location.
    alignments_pickle_path = 'alignments_{}_{}.pkl'.format(c.DATASET, c.KEY_SIZE)
    # Use a context manager so the handle is flushed and closed even if
    # pickling raises.
    with open(alignments_pickle_path, 'wb') as pickle_file:
        pickle.dump(alignments, pickle_file)
    print('{} alignment pickled in {}'.format(reads_file, time.time() - start))
# Tail of the read-splitting routine followed by the script entry point.
# (The routine's header and enclosing loop are outside this view.)
#
# Visible splitting logic: while `first_line` is set the current line is
# skipped and the flag is cleared; every other line is stripped, terminated
# with '\n' and appended to `write_buffer`. Once `line_count` reaches
# `max_num_lines` the buffer is written to
# data/<DATASET>/reads_split/part_<file_count>.txt, `file_count` is incremented
# and the buffer and line counter are reset. A non-empty buffer left at the
# end is written out as one more part.
#
# The __main__ block creates the reads_split directory if it does not exist,
# loads the reference, and calls split_reads on the dataset's reads file.
#
# NOTE(review): the directory is created under c.DATA_PATH while the part
# files are written under the literal 'data/' prefix -- confirm these agree.
# NOTE(review): `ref` is not used in the visible part of this script --
# confirm whether loading the reference here is needed.
if first_line: first_line = False continue write_buffer += line.strip() + '\n' line_count += 1 if line_count >= max_num_lines: with open('data/{}/reads_split/part_{}.txt'.format(c.DATASET, file_count),'w') as w: w.write(write_buffer) print 'wrote part {}'.format(file_count) file_count += 1 write_buffer = '' line_count = 0 if len(write_buffer) > 0: with open('data/{}/reads_split/part_{}.txt'.format(c.DATASET, file_count),'w') as w: w.write(write_buffer) print 'wrote part {}'.format(file_count) if __name__ == '__main__': print 'splitting reads...' directory = '{}/{}/{}'.format(c.DATA_PATH, c.DATASET, 'reads_split/') if not os.path.exists(directory): print 'creating folder {}'.format(directory) os.makedirs(directory) ref = utils.read_reference('data/{}/{}'.format(c.DATASET,c.REF_FILE)) split_reads('data/{}/{}'.format(c.DATASET, c.READS_FILE)) print 'reads split'
#
# l = SortedListWithKey(key=lambda val:val[0])
# for fn in fns:
#     alignments = pickle.load(open('{}{}'.format(directory,fn), 'rb'))
#     alignments = [item for sublist in alignments for item in sublist]
#     l.update(alignments)
#     print 'done {}'.format(fn)
# return l


if __name__ == '__main__':
    # Script entry point: compute the non-perfect stretches of the reference,
    # print them, and pickle them to stretches_<DATASET>.pkl.
    ref = utils.read_reference()
    stretches = get_nonperfect_stretches(ref)
    # Single-argument print(...) behaves the same under Python 2 and 3.
    print(stretches)
    #print 'sorting...'
    #sl = sorted(nprt_list)
    #print 'sorted: {}'.format(sl)
    # open() inside a context manager replaces the Python-2-only file()
    # builtin and guarantees the pickle is flushed and the handle closed.
    with open('stretches_{}.pkl'.format(c.DATASET), 'wb') as stretches_file:
        pickle.dump(stretches, stretches_file)
    #msgpack.dump(stretches, file('stretches_{}.msg'.format(c.DATASET), 'wb'))
    #pickle.dump(sl, file('sorted_nprt_{}.pkl'.format(c.DATASET), 'wb'))
    print('DONE')
# start = time.time()
alignments_pickle_path = 'alignments_{}_{}.pkl'.format(c.DATASET, c.KEY_SIZE) pickle.dump(alignments, open(alignments_pickle_path, 'wb')) print '{} alignment pickled in {}'.format(reads_file, time.time() - start) if __name__ == '__main__': args = sys.argv[1:] if len(args) == 2: FILE_INDEX_BEGIN = int(args[0]) FILE_INDEX_END = int(args[1]) else: FILE_INDEX_BEGIN = 0 FILE_INDEX_END = len(os.listdir('data/{}/reads_split/'.format(c.DATASET))) print 'Processing files {} through {}'.format(FILE_INDEX_BEGIN, FILE_INDEX_END) print 'loading ref_hash pickle...' pickle_filename = 'ref_hash_{}_{}.pkl'.format(c.DATASET, c.KEY_SIZE) ref_hash = load_hash_pickle(pickle_filename) print 'loading ref...' ref_file = c.REF_FILE ref = utils.read_reference('{}/{}/{}'.format(c.DATA_PATH, c.DATASET, ref_file)) #align_and_pickle_mappings() for file_index in xrange(FILE_INDEX_BEGIN, FILE_INDEX_END): align_and_pickle_mappings_split(file_index, ref, ref_hash) progress = (file_index - FILE_INDEX_BEGIN)*100.0/(FILE_INDEX_END - FILE_INDEX_BEGIN) print 'STATUS: {0:.2f}% complete \n'.format(progress)