remaining_time = time_passed / count * (len(paired_end_reads) -
                                                    count)
            print 'Approximately {:.3} minutes remaining'.format(
                remaining_time)
    return alignments, genome_aligned_reads


if __name__ == "__main__":
    # Driver script: align the reads of one practice chromosome against its
    # reference with the hash-based aligner and print a preview of the result.
    key_length = 7
    genome_name = 'practice_W_1'
    input_folder = './{}'.format(genome_name)
    chr_name = '{}_chr_1'.format(genome_name)

    # Input file names are derived from the chromosome name.
    reads_fn_end = 'reads_{}.txt'.format(chr_name)
    ref_fn_end = 'ref_{}.txt'.format(chr_name)
    reads_fn = join(input_folder, reads_fn_end)
    ref_fn = join(input_folder, ref_fn_end)

    start = time.clock()
    # To speed things up, work with fewer reads by slicing the result,
    # i.e. read_reads(reads_fn)[:<x>] where <x> is the number of reads wanted.
    reads = read_reads(reads_fn)
    genome_hash_table = build_hash_and_pickle(ref_fn, key_length)
    ref = read_reference(ref_fn)

    # NOTE(review): a function body elsewhere in this file ends with
    # `return alignments, genome_aligned_reads`; if that function is
    # hashing_algorithm, the two names below are bound in the opposite
    # order -- confirm before relying on them.
    hashing_result = hashing_algorithm(reads, genome_hash_table)
    genome_aligned_reads, alignments = hashing_result

    output_str = pretty_print_aligned_reads_with_ref(
        genome_aligned_reads, alignments, ref)
    # Only the first 5000 characters are shown.
    print(output_str[:5000])
            read_alignment_locations.append(min_mismatch_location)
            output_read_pair.append(read)
            # # Note that there are some huge potential problems here.

        all_read_alignment_locations.append(read_alignment_locations)
        output_read_pairs.append(output_read_pair)
    return all_read_alignment_locations, output_read_pairs


if __name__ == "__main__":
    # Driver script: run trivial_algorithm on the hw1_W_2 data set, print the
    # raw results, and write the pretty-printed alignment next to the inputs.
    data_folder = 'hw1_W_2'
    f_base = '{}_chr_1'.format(data_folder)
    input_folder = join('../data/', data_folder)

    # All file names share the same base and live in the input folder.
    reads_fn = join(input_folder, 'reads_{}.txt'.format(f_base))
    reference_fn = join(input_folder, 'ref_{}.txt'.format(f_base))
    output_fn = join(input_folder, 'aligned_{}.txt'.format(f_base))

    start = time.clock()
    # This will take a while; slicing the loaded reads (for example keeping
    # only the first 300) is a quick way to generate some data.
    input_reads = read_reads(reads_fn)
    reference = read_reference(reference_fn)

    alignments, reads = trivial_algorithm(input_reads, reference)
    print(alignments)
    print(reads)

    output_str = pretty_print_aligned_reads_with_ref(
        reads, alignments, reference)
    with open(output_fn, 'w') as output_file:
        output_file.write(output_str)
示例#3
0
                        n_mismatches = sum(mismatches)
                        if n_mismatches < max_mismatches:
                            min_mismatch_location = i - part
                            read = rev_read

            read_alignment_locations.append(min_mismatch_location)
            output_read_pair.append(read)
        all_read_alignment_locations.append(read_alignment_locations)
        output_read_pairs.append(output_read_pair)
    return all_read_alignment_locations, output_read_pairs


if __name__ == "__main__":
    # Driver script: run faster_algorithm on the practice_W_1 data set, print
    # the raw results, and write the pretty-printed alignment to a file.
    data_folder = 'practice_W_1'
    f_base = '{}_chr_1'.format(data_folder)
    input_folder = join('./', data_folder)

    # All file names share the same base and live in the input folder.
    reads_fn = join(input_folder, 'reads_{}.txt'.format(f_base))
    reference_fn = join(input_folder, 'ref_{}.txt'.format(f_base))
    output_fn = join(input_folder, 'aligned_{}.txt'.format(f_base))

    start = time.clock()
    input_reads = read_reads(reads_fn)
    reference = read_reference(reference_fn)

    alignments, reads = faster_algorithm(input_reads, reference)
    print(alignments)
    print(reads)

    output_str = pretty_print_aligned_reads_with_ref(
        reads, alignments, reference)
    with open(output_fn, 'w') as output_file:
        output_file.write(output_str)
示例#4
0
        consensus_string += consensus_base
    return consensus_string


if __name__ == "__main__":
    # Driver script: build the input file names for one chromosome of the
    # selected genome and load a capped number of its reads. The hashing and
    # pretty-printing steps further down are currently commented out.
    genome_name = 'hw2undergrad_E_2'
    #genome_name = 'practice_E_1'
    input_folder = '../data/{}'.format(genome_name)
    chr_name = '{}_chr_1'.format(genome_name)
    reads_fn_end = 'reads_{}.txt'.format(chr_name)
    reads_fn = join(input_folder, reads_fn_end)
    ref_fn_end = 'ref_{}.txt'.format(chr_name)
    ref_fn = join(input_folder, ref_fn_end)
    key_length = 8
    start = time.clock()
    # Keep only the first 1000 reads to speed things up. The slice has to be
    # applied to the result of read_reads; slicing the file name instead
    # (read_reads(reads_fn[:1000])) truncates the path string, which leaves
    # every read in place for any path shorter than 1000 characters.
    reads = read_reads(reads_fn)[:1000]
    # To change the cap, edit the slice: read_reads(reads_fn)[:<x>] where <x>
    # is the number of reads you want to work with.
    #genome_hash_table = build_hash_and_pickle(ref_fn, key_length)
    #reference = read_reference(ref_fn)

    #genome_aligned_reads, alignments = hashing_algorithm(reads, genome_hash_table)
    #g_aligned_reads = file('g_aligned_reads.txt', 'w'), 'aligned_reads'
    #print genome_aligned_reads >> g_aligned_reads
    #alignments_file = file('alignments.txt', 'w'), 'alignments'
    #print alignments >> alignments_file
    # print genome_aligned_reads
    # print alignments
    #output_str = pretty_print_aligned_reads_with_ref(genome_aligned_reads, alignments, reference)
    # print output_str[:5000]
def read_assembly_reads(read_fn):
    """Load reads from ``read_fn`` and keep element 0 of each one.

    :param read_fn: file name handed straight to ``read_reads``.
    :return: list holding the first element of every loaded read.
    """
    # Only one end of each read is kept; this works okay, but it is an
    # obvious area for improvement.
    first_ends = []
    for read in read_reads(read_fn):
        first_ends.append(read[0])
    return first_ends
def read_assembly_reads(read_fn):
    """Return element 0 of every read that ``read_reads`` loads.

    :param read_fn: file name handed straight to ``read_reads``.
    :return: list with one entry (the first element) per loaded read.
    """
    loaded_reads = read_reads(read_fn)
    # Taking just one end of each read works okay, but it is an obvious
    # area for improvement.
    single_ends = [read_pair[0] for read_pair in loaded_reads]
    return single_ends
def read_assembly_reads(read_fn):
    """Return whatever ``read_reads`` loads from ``read_fn``, unchanged.

    :param read_fn: file name handed straight to ``read_reads``.
    :return: the value returned by ``read_reads(read_fn)``.
    """
    loaded_reads = read_reads(read_fn)
    return loaded_reads
        genome_aligned_reads.append(genome_aligned_read)
        count += 1
        if count % 100 == 0:
            time_passed = (time.clock()-start)/60
            print '{} reads aligned'.format(count), 'in {:.3} minutes'.format(time_passed)
            remaining_time = time_passed/count*(len(paired_end_reads)-count)
            print 'Approximately {:.3} minutes remaining'.format(remaining_time)
    return alignments, genome_aligned_reads

if __name__ == "__main__":
    # Driver script: align the reads of one practice chromosome against its
    # reference with the hash-based aligner and print a preview of the result.
    key_length = 7
    genome_name = 'practice_W_1'
    input_folder = './{}'.format(genome_name)
    chr_name = '{}_chr_1'.format(genome_name)

    # Input file names are derived from the chromosome name.
    reads_fn_end = 'reads_{}.txt'.format(chr_name)
    ref_fn_end = 'ref_{}.txt'.format(chr_name)
    reads_fn = join(input_folder, reads_fn_end)
    ref_fn = join(input_folder, ref_fn_end)

    start = time.clock()
    # To speed things up, work with fewer reads by slicing the result,
    # i.e. read_reads(reads_fn)[:<x>] where <x> is the number of reads wanted.
    reads = read_reads(reads_fn)
    genome_hash_table = build_hash_and_pickle(ref_fn, key_length)
    ref = read_reference(ref_fn)

    # NOTE(review): the function body directly above this block ends with
    # `return alignments, genome_aligned_reads`; if that function is
    # hashing_algorithm, the two names below are bound in the opposite
    # order -- confirm before relying on them.
    hashing_result = hashing_algorithm(reads, genome_hash_table)
    genome_aligned_reads, alignments = hashing_result

    output_str = pretty_print_aligned_reads_with_ref(
        genome_aligned_reads, alignments, ref)
    # Only the first 5000 characters are shown.
    print(output_str[:5000])