# Whitespace-collapsed fragment. The first three statements (remaining-time
# estimate, progress print, return) close a routine whose header is outside
# this view. The rest is the script entry point: it builds the reads_/ref_ file
# names for 'practice_W_1' chromosome 1 under './practice_W_1', loads the reads,
# builds the genome hash table with key_length 7, runs hashing_algorithm and
# prints the first 5000 characters of the pretty-printed alignment.
# NOTE(review): the visible return is (alignments, genome_aligned_reads) while
# the call site unpacks (genome_aligned_reads, alignments) -- presumably the
# same routine; confirm the order is intended.
remaining_time = time_passed / count * (len(paired_end_reads) - count) print 'Approximately {:.3} minutes remaining'.format( remaining_time) return alignments, genome_aligned_reads if __name__ == "__main__": genome_name = 'practice_W_1' input_folder = './{}'.format(genome_name) chr_name = '{}_chr_1'.format(genome_name) reads_fn_end = 'reads_{}.txt'.format(chr_name) reads_fn = join(input_folder, reads_fn_end) ref_fn_end = 'ref_{}.txt'.format(chr_name) ref_fn = join(input_folder, ref_fn_end) key_length = 7 start = time.clock() reads = read_reads(reads_fn) # If you want to speed it up, cut down the number of reads by # changing the line to reads = read_reads(reads_fn)[:<x>] where <x> # is the number of reads you want to work with. genome_hash_table = build_hash_and_pickle(ref_fn, key_length) ref = read_reference(ref_fn) genome_aligned_reads, alignments = hashing_algorithm( reads, genome_hash_table) # print genome_aligned_reads # print alignments output_str = pretty_print_aligned_reads_with_ref(genome_aligned_reads, alignments, ref) print output_str[:5000]
# Whitespace-collapsed fragment. The leading appends and the return of
# (all_read_alignment_locations, output_read_pairs) close a routine whose
# header is outside this view. The rest is the script entry point: it reads
# reads_hw1_W_2_chr_1.txt and ref_hw1_W_2_chr_1.txt from '../data/hw1_W_2',
# runs trivial_algorithm, prints both results and writes the pretty-printed
# alignment to aligned_hw1_W_2_chr_1.txt in the same folder.
# NOTE(review): the embedded hint slices `reads`, which is only assigned by the
# later trivial_algorithm call -- presumably `input_reads[:300]` is meant.
read_alignment_locations.append(min_mismatch_location) output_read_pair.append(read) # # Note that there are some huge potential problems here. all_read_alignment_locations.append(read_alignment_locations) output_read_pairs.append(output_read_pair) return all_read_alignment_locations, output_read_pairs if __name__ == "__main__": data_folder = 'hw1_W_2' input_folder = join('../data/', data_folder) f_base = '{}_chr_1'.format(data_folder) reads_fn = join(input_folder, 'reads_{}.txt'.format(f_base)) start = time.clock() input_reads = read_reads(reads_fn) # This will take a while; you can use an array slice for example: # # input_reads = reads[:300] # # to generate some data quickly. reference_fn = join(input_folder, 'ref_{}.txt'.format(f_base)) reference = read_reference(reference_fn) alignments, reads = trivial_algorithm(input_reads, reference) print alignments print reads output_str = pretty_print_aligned_reads_with_ref(reads, alignments, reference) output_fn = join(input_folder, 'aligned_{}.txt'.format(f_base)) with(open(output_fn, 'w')) as output_file: output_file.write(output_str)
# Whitespace-collapsed fragment. The leading statements (mismatch count,
# threshold test against max_mismatches, appends, return of
# (all_read_alignment_locations, output_read_pairs)) close a routine whose
# header is outside this view; their nesting cannot be recovered from here.
# The rest is the script entry point: it reads reads_practice_W_1_chr_1.txt and
# ref_practice_W_1_chr_1.txt from './practice_W_1', runs faster_algorithm,
# prints both results and writes the pretty-printed alignment to
# aligned_practice_W_1_chr_1.txt in the same folder.
n_mismatches = sum(mismatches) if n_mismatches < max_mismatches: min_mismatch_location = i - part read = rev_read read_alignment_locations.append(min_mismatch_location) output_read_pair.append(read) all_read_alignment_locations.append(read_alignment_locations) output_read_pairs.append(output_read_pair) return all_read_alignment_locations, output_read_pairs if __name__ == "__main__": data_folder = 'practice_W_1' input_folder = join('./', data_folder) f_base = '{}_chr_1'.format(data_folder) reads_fn = join(input_folder, 'reads_{}.txt'.format(f_base)) start = time.clock() input_reads = read_reads(reads_fn) reference_fn = join(input_folder, 'ref_{}.txt'.format(f_base)) reference = read_reference(reference_fn) alignments, reads = faster_algorithm(input_reads, reference) print alignments print reads output_str = pretty_print_aligned_reads_with_ref(reads, alignments, reference) output_fn = join(input_folder, 'aligned_{}.txt'.format(f_base)) with (open(output_fn, 'w')) as output_file: output_file.write(output_str)
consensus_string += consensus_base return consensus_string if __name__ == "__main__": genome_name = 'hw2undergrad_E_2' #genome_name = 'practice_E_1' input_folder = '../data/{}'.format(genome_name) chr_name = '{}_chr_1'.format(genome_name) reads_fn_end = 'reads_{}.txt'.format(chr_name) reads_fn = join(input_folder, reads_fn_end) ref_fn_end = 'ref_{}.txt'.format(chr_name) ref_fn = join(input_folder, ref_fn_end) key_length = 8 start = time.clock() reads = read_reads(reads_fn[:1000]) # If you want to speed it up, cut down the number of reads by # changing the line to reads = read_reads(reads_fn)[:<x>] where <x> # is the number of reads you want to work with. #genome_hash_table = build_hash_and_pickle(ref_fn, key_length) #reference = read_reference(ref_fn) #genome_aligned_reads, alignments = hashing_algorithm(reads, genome_hash_table) #g_aligned_reads = file('g_aligned_reads.txt', 'w'), 'aligned_reads' #print genome_aligned_reads >> g_aligned_reads #alignments_file = file('alignments.txt', 'w'), 'alignments' #print alignments >> alignments_file # print genome_aligned_reads # print alignments #output_str = pretty_print_aligned_reads_with_ref(genome_aligned_reads, alignments, reference) # print output_str[:5000]
def read_assembly_reads(read_fn):
    """Return element 0 of every item that ``read_reads(read_fn)`` yields.

    :param read_fn: reads file name, passed straight through to ``read_reads``.
    :return: list holding the first element of each item read.
    """
    first_ends = []
    for read_pair in read_reads(read_fn):
        # Only taking one end of the read works okay, but
        # this is an obvious area for improvement.
        first_ends.append(read_pair[0])
    return first_ends
def read_assembly_reads(read_fn):
    """Load the reads for assembly by delegating to ``read_reads``.

    :param read_fn: reads file name, passed straight through to ``read_reads``.
    :return: whatever ``read_reads`` returns, unmodified.
    """
    assembly_reads = read_reads(read_fn)
    return assembly_reads
# Whitespace-collapsed fragment. The leading statements close a routine whose
# header is outside this view: it appends the current aligned read, bumps
# count, and when count is a multiple of 100 prints the elapsed minutes and a
# remaining-time estimate (elapsed / count * reads still to go), then returns
# (alignments, genome_aligned_reads). The rest is the script entry point: it
# builds the reads_/ref_ file names for 'practice_W_1' chromosome 1 under
# './practice_W_1', loads the reads, builds the genome hash table with
# key_length 7, runs hashing_algorithm and prints the first 5000 characters of
# the pretty-printed alignment.
# NOTE(review): the visible return is (alignments, genome_aligned_reads) while
# the call site unpacks (genome_aligned_reads, alignments) -- presumably the
# same routine; confirm the order is intended.
genome_aligned_reads.append(genome_aligned_read) count += 1 if count % 100 == 0: time_passed = (time.clock()-start)/60 print '{} reads aligned'.format(count), 'in {:.3} minutes'.format(time_passed) remaining_time = time_passed/count*(len(paired_end_reads)-count) print 'Approximately {:.3} minutes remaining'.format(remaining_time) return alignments, genome_aligned_reads if __name__ == "__main__": genome_name = 'practice_W_1' input_folder = './{}'.format(genome_name) chr_name = '{}_chr_1'.format(genome_name) reads_fn_end = 'reads_{}.txt'.format(chr_name) reads_fn = join(input_folder, reads_fn_end) ref_fn_end = 'ref_{}.txt'.format(chr_name) ref_fn = join(input_folder, ref_fn_end) key_length = 7 start = time.clock() reads = read_reads(reads_fn) # If you want to speed it up, cut down the number of reads by # changing the line to reads = read_reads(reads_fn)[:<x>] where <x> # is the number of reads you want to work with. genome_hash_table = build_hash_and_pickle(ref_fn, key_length) ref = read_reference(ref_fn) genome_aligned_reads, alignments = hashing_algorithm(reads, genome_hash_table) # print genome_aligned_reads # print alignments output_str = pretty_print_aligned_reads_with_ref(genome_aligned_reads, alignments, ref) print output_str[:5000]