def generate_runlength_frequency_matrix(runlength_ref_sequence_path, assembly_vs_ref_bam_path,
                                        runlength_ref_sequences, runlength_read_data):
    """
    Build one run-length frequency matrix per reference contig from an alignment of RLE sequences.

    For every contig name in runlength_ref_sequences, a zeroed float64 matrix of shape
    [2, 4, MAX_RUNLENGTH + 1, MAX_RUNLENGTH + 1] is handed to parse_reads() together with the reads aligned to
    that contig. The matrix is passed by reference, so parse_reads() is expected to fill it in place (the meaning
    of each axis is defined there, not here).

    :param runlength_ref_sequence_path: path of the run-length encoded reference FASTA
    :param assembly_vs_ref_bam_path: path of the BAM holding RLE sequences aligned to the RLE reference
    :param runlength_ref_sequences: mapping of contig name -> record; record[LENGTHS] holds the reference run lengths
    :param runlength_read_data: per-read run-length data, forwarded to parse_reads() unchanged
    :return: generator of (chromosome_name, matrix) for each contig where parse_reads() reported more than 0 reads;
             contigs without reads are reported on stderr and skipped
    """
    for chromosome_name in runlength_ref_sequences:
        frequency_matrix = numpy.zeros([2, 4, MAX_RUNLENGTH + 1, MAX_RUNLENGTH + 1], dtype=numpy.float64)

        # Fresh handlers are opened for every contig, as in the original implementation
        alignment_source = BamHandler(assembly_vs_ref_bam_path)
        reference_source = FastaHandler(runlength_ref_sequence_path)

        contig_length = reference_source.get_chr_sequence_length(chromosome_name)
        contig_reads = alignment_source.get_reads(chromosome_name=chromosome_name, start=0, stop=contig_length)

        n_reads = parse_reads(chromosome_name=chromosome_name,
                              fasta_handler=reference_source,
                              reads=contig_reads,
                              complete_ref_runlengths=runlength_ref_sequences[chromosome_name][LENGTHS],
                              runlength_read_data=runlength_read_data,
                              matrix=frequency_matrix)

        if not n_reads > 0:
            sys.stderr.write("No reads found for chromosome: %s\n" % chromosome_name)
            continue

        yield (chromosome_name, frequency_matrix)
def get_aligned_contig_lengths(bam_path, aligned_assembly_contigs):
    """
    Collect the reference span of every confidently aligned, non-secondary read in a BAM.

    Each kept read contributes a [read_id, ref_length] pair, where ref_length is the distance between the read's
    reference start and the stop position given by get_read_stop_position(). The pairs are printed as they are
    found, sorted by the LENGTH column in descending order, and stored in aligned_assembly_contigs[bam_path].

    :param bam_path: path of the BAM to scan (all contigs are requested)
    :param aligned_assembly_contigs: dict-like container that receives the sorted list under the key bam_path
    :return: None; the result is written into aligned_assembly_contigs
    """
    handler = BamHandler(bam_file_path=bam_path)
    alignments = handler.get_reads(chromosome_name=None, start=None, stop=None)

    contig_lengths = []
    n_secondary = 0     # counted but currently not reported anywhere

    for alignment in alignments:
        if alignment.is_secondary:
            n_secondary += 1
            continue

        # Only alignments with mapping quality above 5 are kept
        if alignment.mapping_quality <= 5:
            continue

        read_id = alignment.query_name
        ref_length = get_read_stop_position(alignment) - alignment.reference_start

        contig_lengths.append([read_id, ref_length])
        print(read_id, ref_length)

    contig_lengths.sort(key=lambda entry: entry[LENGTH], reverse=True)

    aligned_assembly_contigs[bam_path] = contig_lengths
def process_bam(bam_path, reference_path):
    """
    Find useful summary data from a bam that can be represented as a table of identities, and a plot of alignments

    For each chromosome in the (currently hard-coded) chromosome list, the aligned reads are parsed with
    parse_reads(), the per-read statistics are printed, an alignment-length-weighted total identity is computed,
    and plot_contigs() is called to write the plot into "plots/".

    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :return: None
    """
    print("\n" + bam_path + "\n")

    output_dir = "plots/"
    FileManager.ensure_directory_exists(output_dir)

    bam_handler = BamHandler(bam_file_path=bam_path)
    fasta_handler = FastaHandler(reference_path)

    chromosome_names = ["gi"]

    for chromosome_name in chromosome_names:
        chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

        start = 0
        stop = chromosome_length

        reads = bam_handler.get_reads(chromosome_name=chromosome_name, start=start, stop=stop)

        read_data = parse_reads(reads=reads, fasta_handler=fasta_handler, chromosome_name=chromosome_name)

        print("chromosome_name:\t", chromosome_name)
        print("chromosome_length:\t", chromosome_length)

        for data in read_data:
            # read_length and contig_length are part of each record but are not printed
            (read_id, reversal_status, ref_alignment_start, alignment_length, read_length, contig_length,
             n_initial_clipped_bases, n_total_mismatches, n_total_deletes, n_total_inserts, identity) = data

            print()
            print(read_id)
            print("reversed:\t", reversal_status)
            print("alignment_start:\t", ref_alignment_start)
            print("alignment_length:\t", alignment_length)
            print("n_initial_clipped_bases:", n_initial_clipped_bases)
            print("n_total_mismatches:\t", n_total_mismatches)
            print("n_total_deletes:\t", n_total_deletes)
            print("n_total_inserts:\t", n_total_inserts)
            print("identity:\t", identity)

        total_weighted_identity = sum(x[ALIGNMENT_LENGTH] * x[IDENTITY] for x in read_data)
        total_alignment_bases = sum(x[ALIGNMENT_LENGTH] for x in read_data)

        # Approximate 0 instead of raising ZeroDivisionError when no alignment bases were found
        total_identity = total_weighted_identity / max(1e-9, total_alignment_bases)

        print("\nTOTAL IDENTITY:\t", total_identity)

        plot_contigs(output_dir=output_dir,
                     read_data=read_data,
                     chromosome_name=chromosome_name,
                     chromosome_length=chromosome_length,
                     total_identity=total_identity,
                     bam_path=bam_path,
                     y_min=-1,
                     y_max=4,
                     show=False)
def generate_runlength_frequency_matrix(runlength_ref_sequence_path, read_vs_ref_bam_path, runlength_ref_sequences,
                                        runlength_read_sequences):
    """
    Build one run-length frequency matrix per reference contig from reads aligned as RLE sequences.

    For every contig name in runlength_ref_sequences, all reads aligned to that contig (no coordinate limits) are
    passed to parse_reads() along with a zeroed float64 matrix of shape
    [2, 4, MAX_RUNLENGTH + 1, MAX_RUNLENGTH + 1]. The matrix is passed by reference, so parse_reads() is expected
    to fill it in place.

    :param runlength_ref_sequence_path: path of the run-length encoded reference FASTA
    :param read_vs_ref_bam_path: path of the BAM holding RLE reads aligned to the RLE reference
    :param runlength_ref_sequences: mapping of contig name -> record; record[LENGTHS] holds the reference run lengths
    :param runlength_read_sequences: per-read run-length data, forwarded to parse_reads() unchanged
    :return: generator of (chromosome_name, matrix) for each contig where parse_reads() reported more than 0 reads;
             contigs without reads are reported on stderr and skipped
    """
    for chromosome_name in runlength_ref_sequences:
        frequency_matrix = numpy.zeros([2, 4, MAX_RUNLENGTH + 1, MAX_RUNLENGTH + 1], dtype=numpy.float64)

        alignment_source = BamHandler(read_vs_ref_bam_path)
        reference_source = FastaHandler(runlength_ref_sequence_path)

        # start/stop of None: request every read on this contig
        contig_reads = alignment_source.get_reads(chromosome_name=chromosome_name, start=None, stop=None)

        n_reads = parse_reads(chromosome_name=chromosome_name,
                              fasta_handler=reference_source,
                              reads=contig_reads,
                              complete_ref_runlengths=runlength_ref_sequences[chromosome_name][LENGTHS],
                              runlength_read_sequences=runlength_read_sequences,
                              matrix=frequency_matrix)

        if not n_reads > 0:
            sys.stderr.write("No reads found for chromosome: %s\n" % chromosome_name)
            continue

        yield (chromosome_name, frequency_matrix)
def process_bam(bam_path, reference_path, output_dir=None):
    """
    Extract inserts, deletes and mismatches per reference contig from a BAM and export them to CSV.

    A timestamped subdirectory ("variants_<datetime>") is created below output_dir and one export is written per
    reference contig, in the order returned by sort_chromosome_names().

    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :param output_dir: parent directory for the timestamped output subdirectory; defaults to "variants/"
    :return: None
    """
    print("\n" + bam_path)

    parent_dir = "variants/" if output_dir is None else output_dir

    # NOTE(review): the timestamped subdirectory is assumed to be created for explicit output_dir values as well
    # as for the default - confirm against the original layout
    output_dir = os.path.join(parent_dir, "variants_" + FileManager.get_datetime_string())
    FileManager.ensure_directory_exists(output_dir)

    bam_handler = BamHandler(bam_file_path=bam_path)
    fasta_handler = FastaHandler(reference_path)

    chromosome_names = sort_chromosome_names(names=fasta_handler.get_contig_names(), prefix="chr")

    print("ref contig names:", chromosome_names)

    for chromosome_name in chromosome_names:
        print("Parsing alignments for ref contig:", chromosome_name)

        contig_length = fasta_handler.get_chr_sequence_length(chromosome_name)
        contig_reads = bam_handler.get_reads(chromosome_name=chromosome_name, start=0, stop=contig_length)

        inserts, deletes, mismatches = parse_reads(reads=contig_reads,
                                                   fasta_handler=fasta_handler,
                                                   chromosome_name=chromosome_name)

        export_variants_to_csv(output_dir=output_dir,
                               chromosome_name=chromosome_name,
                               mismatches=mismatches,
                               inserts=inserts,
                               deletes=deletes,
                               merge=True)
def process_bam(bam_path, reference_path, bac_path, output_dir=None):
    """
    Summarise the alignments in a BAM per aligned sequence and export the aggregated table to CSV.

    The reads of every reference contig are parsed with parse_reads(). Each resulting record is grouped under its
    first field (data[0]) with the contig name prepended. The grouped records are then reduced with
    aggregate_bac_data() and written with export_bac_data_to_csv().

    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :param bac_path: FASTA whose contig names are printed next to the reference contig names
    :param output_dir: where to save stats; defaults to "stats/"
    :return: None
    """
    if output_dir is None:
        output_dir = "stats/"

    FileManager.ensure_directory_exists(output_dir)

    ref_fasta_handler = FastaHandler(reference_path)
    bac_fasta_handler = FastaHandler(bac_path)

    chromosome_names = ref_fasta_handler.get_contig_names()

    print(chromosome_names)
    print(bac_fasta_handler.get_contig_names())

    data_per_bac = defaultdict(list)

    for chromosome_name in chromosome_names:
        contig_length = ref_fasta_handler.get_chr_sequence_length(chromosome_name)

        # Handlers are re-opened for every contig, as in the original implementation
        ref_fasta_handler = FastaHandler(reference_file_path=reference_path)
        bam_handler = BamHandler(bam_file_path=bam_path)

        contig_reads = bam_handler.get_reads(chromosome_name=chromosome_name, start=0, stop=contig_length)

        contig_records = parse_reads(reads=contig_reads,
                                     fasta_handler=ref_fasta_handler,
                                     chromosome_name=chromosome_name)

        for record in contig_records:
            data_per_bac[record[0]].append([chromosome_name] + record)

    export_bac_data_to_csv(read_data=aggregate_bac_data(data_per_bac),
                           output_dir=output_dir,
                           bam_path=bam_path)
def generate_window_fasta(bam_file_path, reference_file_path, chromosome_name, window, output_dir,
                          exclude_loose_ends=True):
    """
    Write the read segments and the reference segment of a single window to two FASTA files.

    The files are named "<chromosome>_<start>_<stop>.fasta" (reads) and "<chromosome>_<start>_<stop>_ref.fasta"
    (reference) inside output_dir. Sequence lengths are printed before writing.

    :param bam_file_path: BAM containing the aligned reads
    :param reference_file_path: reference FASTA the reads were aligned to
    :param chromosome_name: contig that the window lies on
    :param window: indexable pair; window[0] is the pileup start and window[1] the pileup end
    :param output_dir: directory that receives the FASTA files (created if needed)
    :param exclude_loose_ends: forwarded to get_aligned_segments()
    :return: True if get_aligned_segments() returned sequences and the files were written, otherwise False
    """
    bam_handler = BamHandler(bam_file_path)
    fasta_handler = FastaHandler(reference_file_path)

    pileup_start = window[0]
    pileup_end = window[1]

    ref_sequence, read_ids, sequences, reversal_statuses = get_aligned_segments(
        fasta_handler=fasta_handler,
        bam_handler=bam_handler,
        chromosome_name=chromosome_name,
        pileup_start=pileup_start,
        pileup_end=pileup_end,
        include_ref=True,
        exclude_loose_ends=exclude_loose_ends)

    if sequences is None:
        return False

    for read_sequence in sequences:
        print(len(read_sequence))

    # NOTE(review): assumed to be printed once per window (after the loop), not once per read - confirm
    print(len(ref_sequence))

    FileManager.ensure_directory_exists(output_dir)

    window_label = "_".join([chromosome_name, str(pileup_start), str(pileup_end)])

    reads_path = os.path.join(output_dir, window_label + ".fasta")
    FastaWriter(reads_path).write_sequences(sequences)

    ref_path = os.path.join(output_dir, window_label + "_ref" + ".fasta")
    FastaWriter(ref_path).write_sequences([ref_sequence])

    return True
def main(bam_file_path, cutoff, contig_name):
    """
    Tally run lengths per character over the read sequences of a BAM and print the distribution.

    Reads with mapping quality of 5 or less, as well as secondary, unmapped and QC-failed reads, are skipped.
    Processing stops as soon as the number of accepted reads exceeds cutoff. The output is one ">character" header
    per character followed by "length count" lines in ascending length order.

    :param bam_file_path: BAM to read
    :param cutoff: maximum number of accepted reads to process
    :param contig_name: contig to fetch reads from (forwarded to BamHandler.get_reads)
    :return: None
    """
    reads = BamHandler(bam_file_path).get_reads(chromosome_name=contig_name, start=None, stop=None)

    all_counts = defaultdict(Counter)

    sys.stderr.write("reading file...\n")
    sys.stderr.flush()

    n_accepted = 0
    for read in reads:
        is_usable = read.mapping_quality > 5 and not (read.is_secondary or read.is_unmapped or read.is_qcfail)
        if not is_usable:
            continue

        n_accepted += 1

        if n_accepted % 100 == 0:
            sys.stderr.write("\rParsed %d reads" % n_accepted)

        if n_accepted > cutoff:
            break

        character_counts = count_runlength_per_character(read.query_sequence)

        for character, length_counts in character_counts.items():
            all_counts[character] += length_counts

    sys.stderr.write("\n")

    for character in sorted(all_counts):
        print(">%s" % character)

        for length, count in sorted(all_counts[character].items()):
            print(length, count)
def generate_runlength_frequency_matrix(runlength_ref_sequence_path, assembly_vs_ref_bam_path,
                                        runlength_ref_sequences, runlength_assembly_sequences):
    """
    Build one run-length frequency matrix per reference contig from an assembly aligned as RLE sequences.

    For every contig name in runlength_ref_sequences, a zeroed float64 matrix of shape
    [2, 4, MAX_RUNLENGTH + 1, MAX_RUNLENGTH + 1] is handed to parse_reads() together with the alignments on that
    contig. The matrix is passed by reference, so parse_reads() is expected to fill it in place. A matrix is
    yielded for every contig, whether or not any alignment was found.

    :param runlength_ref_sequence_path: path of the run-length encoded reference FASTA
    :param assembly_vs_ref_bam_path: path of the BAM holding the RLE assembly aligned to the RLE reference
    :param runlength_ref_sequences: mapping of contig name -> record; record[LENGTHS] holds the reference run lengths
    :param runlength_assembly_sequences: run-length data of the assembly, forwarded to parse_reads() unchanged
    :return: generator of matrices, one per contig in runlength_ref_sequences
    """
    for chromosome_name in runlength_ref_sequences:
        frequency_matrix = numpy.zeros([2, 4, MAX_RUNLENGTH + 1, MAX_RUNLENGTH + 1], dtype=numpy.float64)

        alignment_source = BamHandler(assembly_vs_ref_bam_path)
        reference_source = FastaHandler(runlength_ref_sequence_path)

        contig_length = reference_source.get_chr_sequence_length(chromosome_name)
        contig_reads = alignment_source.get_reads(chromosome_name=chromosome_name, start=0, stop=contig_length)

        parse_reads(chromosome_name=chromosome_name,
                    fasta_handler=reference_source,
                    reads=contig_reads,
                    complete_ref_runlengths=runlength_ref_sequences[chromosome_name][LENGTHS],
                    runlength_assembly_sequences=runlength_assembly_sequences,
                    matrix=frequency_matrix)

        yield frequency_matrix
def parse_bam(bam_path, reference_path):
    """
    Iterate a BAM file contig by contig and accumulate summary statistics.

    The running totals are threaded through count_cigar_operations(): each call receives the current totals and
    returns the updated ones, which then feed the call for the next contig.

    :param bam_path: BAM file to summarise
    :param reference_path: reference FASTA; supplies the contig names and lengths used to fetch reads
    :return: tuple of (chromosomal_cigar_counts, n_alignments, n_primary, n_supplementary, n_secondary,
             map_qualities)
    """
    fasta_handler = FastaHandler(reference_path)
    contig_names = fasta_handler.get_contig_names()

    # Initial (empty) totals
    cigar_counts = defaultdict(lambda: defaultdict(int))
    alignment_total = 0
    primary_total = 0
    supplementary_total = 0
    secondary_total = 0
    quality_histogram = IterativeHistogram(start=0, stop=60, n_bins=6)

    for contig_name in contig_names:
        bam_handler = BamHandler(bam_path)
        contig_length = fasta_handler.get_chr_sequence_length(contig_name)
        contig_reads = bam_handler.get_reads(chromosome_name=contig_name, start=0, stop=contig_length)

        (cigar_counts,
         alignment_total,
         primary_total,
         supplementary_total,
         secondary_total,
         quality_histogram) = count_cigar_operations(reads=contig_reads,
                                                     chromosome_name=contig_name,
                                                     chromosomal_cigar_counts=cigar_counts,
                                                     n_alignments=alignment_total,
                                                     n_primary=primary_total,
                                                     n_supplementary=supplementary_total,
                                                     n_secondary=secondary_total,
                                                     map_qualities=quality_histogram)

    return (cigar_counts, alignment_total, primary_total, supplementary_total, secondary_total,
            quality_histogram)
def generate_collapsed_data(bam_file_path, reference_file_path, vcf_path, bed_path, chromosome_name, start_position,
                            end_position, generate_from_vcf=False):
    """
    Generate pileups from BAM data, and collapse sequences to have no explicitly repeated characters. Additionally
    encode a repeat channel that describes the number of repeats observed per base.

    NOTE: the process is terminated (SystemExit) right after the first window has been printed.

    :param bam_file_path: BAM containing the aligned reads
    :param reference_file_path: reference FASTA the reads were aligned to
    :param vcf_path: VCF used to choose the windows
    :param bed_path: BED file, only used when selecting non-variant windows
    :param chromosome_name: contig to select windows on
    :param start_position: start of the region to select windows from
    :param end_position: end of the region to select windows from
    :param generate_from_vcf: if True use get_variant_windows(), otherwise get_non_variant_windows()
    :return: None (does not return normally if at least one window exists)
    """
    # Function-scope import so this block does not depend on the module's import list
    import sys

    bam_handler = BamHandler(bam_file_path)
    fasta_handler = FastaHandler(reference_file_path)

    if generate_from_vcf:
        chromosomal_windows = get_variant_windows(vcf_path=vcf_path,
                                                  chromosome_name=chromosome_name,
                                                  start_position=start_position,
                                                  end_position=end_position)
    else:
        chromosomal_windows = get_non_variant_windows(vcf_path=vcf_path,
                                                      bed_path=bed_path,
                                                      chromosome_name=chromosome_name,
                                                      start_position=start_position,
                                                      end_position=end_position)

    for chromosome_name in chromosomal_windows:
        for w, window in enumerate(chromosomal_windows[chromosome_name]):
            pileup_start = window[0]
            pileup_end = window[1]

            print(pileup_start, pileup_end)

            ref_sequence, read_ids, sequences = get_aligned_segments(fasta_handler=fasta_handler,
                                                                     bam_handler=bam_handler,
                                                                     chromosome_name=chromosome_name,
                                                                     pileup_start=pileup_start,
                                                                     pileup_end=pileup_end)

            character_sequences, character_counts = collapse_repeats(sequences)

            print_collapsed_segments(character_sequences, character_counts)

            if w == 0:
                # sys.exit() rather than the site-provided exit(), which is meant for interactive use only
                sys.exit()
def generate_data(bam_file_path, reference_file_path, vcf_path, bed_path, chromosome_name, start_position,
                  end_position, generate_from_vcf=False):
    """
    Generate pileup for read segments aligned between two genomic coordinates

    NOTE: the process is terminated (SystemExit) once the window with index 10 of a contig has been processed.

    :param bam_file_path: BAM containing the aligned reads
    :param reference_file_path: reference FASTA the reads were aligned to
    :param vcf_path: VCF used to choose the windows
    :param bed_path: BED file, only used when selecting non-variant windows
    :param chromosome_name: contig to select windows on
    :param start_position: start of the region to select windows from
    :param end_position: end of the region to select windows from
    :param generate_from_vcf: if True use get_variant_windows(), otherwise get_non_variant_windows()
    :return: None
    """
    # Function-scope import so this block does not depend on the module's import list
    import sys

    bam_handler = BamHandler(bam_file_path)
    fasta_handler = FastaHandler(reference_file_path)

    if generate_from_vcf:
        chromosomal_windows = get_variant_windows(vcf_path=vcf_path,
                                                  chromosome_name=chromosome_name,
                                                  start_position=start_position,
                                                  end_position=end_position)
    else:
        chromosomal_windows = get_non_variant_windows(vcf_path=vcf_path,
                                                      bed_path=bed_path,
                                                      chromosome_name=chromosome_name,
                                                      start_position=start_position,
                                                      end_position=end_position)

    for chromosome_name in chromosomal_windows:
        for w, window in enumerate(chromosomal_windows[chromosome_name]):
            pileup_start = window[0]
            pileup_end = window[1]

            print(pileup_start, pileup_end)

            # The segments are fetched but not used further here
            ref_sequence, read_ids, sequences = get_aligned_segments(fasta_handler=fasta_handler,
                                                                     bam_handler=bam_handler,
                                                                     chromosome_name=chromosome_name,
                                                                     pileup_start=pileup_start,
                                                                     pileup_end=pileup_end)

            if w == 10:
                # sys.exit() rather than the site-provided exit(), which is meant for interactive use only
                sys.exit()
def get_chromosome_stats(genome_data, reference_path, chromosome_name, start, stop, output_dir, bam_path):
    """
    Parse the reads of one contig region, record the contig summary and export it to CSV.

    :param genome_data: list-like accumulator; the chromosome summary returned by parse_reads() is appended to it
    :param reference_path: reference FASTA the reads were aligned to
    :param chromosome_name: contig to process
    :param start: start coordinate passed to BamHandler.get_reads
    :param stop: stop coordinate passed to BamHandler.get_reads
    :param output_dir: directory forwarded to export_chromosome_summary_to_csv()
    :param bam_path: BAM containing the alignments
    :return: None
    """
    reference = FastaHandler(reference_file_path=reference_path)
    alignments = BamHandler(bam_file_path=bam_path)

    contig_length = reference.get_chr_sequence_length(chromosome_name)
    region_reads = alignments.get_reads(chromosome_name=chromosome_name, start=start, stop=stop)

    read_data, chromosome_data = parse_reads(reads=region_reads,
                                             chromosome_name=chromosome_name,
                                             chromosome_length=contig_length,
                                             fasta_handler=reference)

    genome_data.append(chromosome_data)

    export_chromosome_summary_to_csv(read_data=read_data,
                                     chromosome_data=chromosome_data,
                                     output_dir=output_dir,
                                     bam_path=bam_path,
                                     chromosome_name=chromosome_name)
def test_window(bam_file_path, reference_file_path, chromosome_name, window, output_dir, save_data=True,
                print_results=False):
    """
    Run the pileup generator for a single specified window, optionally printing and/or saving the segments.

    :param bam_file_path: BAM containing the aligned reads
    :param reference_file_path: reference FASTA the reads were aligned to
    :param chromosome_name: contig that the window lies on
    :param window: indexable pair; window[0] is the pileup start and window[1] the pileup end
    :param output_dir: directory that receives "test_<start>.fasta" when save_data is set
    :param save_data: write the read segments to a FASTA file
    :param print_results: print the reference and read segments with print_segments()
    :return: None
    """
    pileup_start = window[0]
    pileup_end = window[1]

    ref_sequence, read_ids, sequences = get_aligned_segments(fasta_handler=FastaHandler(reference_file_path),
                                                             bam_handler=BamHandler(bam_file_path),
                                                             chromosome_name=chromosome_name,
                                                             pileup_start=pileup_start,
                                                             pileup_end=pileup_end)

    if print_results:
        print_segments(ref_sequence, sequences)

    if not save_data:
        return

    output_path = os.path.join(output_dir, "test_" + str(pileup_start) + ".fasta")

    if not os.path.exists(output_dir):
        FileManager.ensure_directory_exists(output_dir)

    FastaWriter(output_path).write_sequences(sequences)
def main():
    """
    Show the first 1000 reference positions of an RLE alignment pileup as an image.

    The aligned segments (reference included) of the first contig are fetched for the window [0, 1000), every
    character is mapped through get_encoding(), and the negated encoding matrix is displayed with pyplot.

    :return: None
    """
    # bam_file_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_from_sequence_2019_3_27_14_59_24_409353/sequence_subset_test_60x_10kb_rle_VS_refEcoli_rle.sorted.bam"
    # ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"

    bam_file_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_from_runnie_output_2019_4_8_17_33_14_191911/runnie_subset_test_60x_10kb_rle_VS_refEcoli_rle.sorted.bam"
    ref_fasta_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_from_runnie_output_2019_4_8_17_33_14_191911/refEcoli_rle.fasta"

    # -------------------------------------------------------------------------

    fasta_handler = FastaHandler(ref_fasta_path)
    contig_names = fasta_handler.get_contig_names()
    chromosome_name = contig_names[0]
    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    bam_handler = BamHandler(bam_file_path)
    fasta_handler = FastaHandler(ref_fasta_path)

    pileup_start = 0
    pileup_end = pileup_start + 1000

    aligned_segments = get_aligned_segments(fasta_handler=fasta_handler,
                                            bam_handler=bam_handler,
                                            chromosome_name=chromosome_name,
                                            pileup_start=pileup_start,
                                            pileup_end=pileup_end,
                                            include_ref=True)

    encoding = [list(map(get_encoding, alignment)) for alignment in aligned_segments.values()]

    # The numpy.float alias was removed in NumPy 1.24; the builtin float is the exact equivalent dtype
    encoding = -numpy.array(encoding, dtype=float)

    pyplot.imshow(encoding)
    pyplot.show()
    pyplot.close()
def get_chromosome_data(bam_path, reference_path, chromosome_name, output_dir, centromere_table_path,
                        gap_table_path, segdup_table_path, genome_data):
    """
    Summarise and plot the alignments of one whole contig.

    The reads over the full contig are parsed, the contig summary is appended to genome_data and exported to CSV,
    and a contig plot is drawn (then closed) with optional centromere, gap and segmental-duplication annotations.

    :param bam_path: BAM containing the alignments
    :param reference_path: reference FASTA the reads were aligned to
    :param chromosome_name: contig to process
    :param output_dir: directory forwarded to the CSV export and to plot_contigs()
    :param centromere_table_path: table read with read_centromere_table(), or None to skip
    :param gap_table_path: table read with read_gap_table(), or None to skip
    :param segdup_table_path: table read with read_gap_table(size_cutoff=10000), or None to skip
    :param genome_data: list-like accumulator that receives the contig summary
    :return: None
    """
    reference = FastaHandler(reference_path)
    alignments = BamHandler(bam_file_path=bam_path)

    chromosome_length = reference.get_chr_sequence_length(chromosome_name)
    contig_reads = alignments.get_reads(chromosome_name=chromosome_name, start=0, stop=chromosome_length)

    read_data, chromosome_data = parse_reads(reads=contig_reads,
                                             fasta_handler=reference,
                                             chromosome_name=chromosome_name,
                                             chromosome_length=chromosome_length)

    genome_data.append(chromosome_data)

    # Alignment-length-weighted identity; a tiny denominator floor makes the result 0 when nothing aligned
    weighted_identity_sum = sum(record[ALIGNMENT_LENGTH] * record[SEQUENCE_IDENTITY] for record in read_data)
    aligned_base_sum = sum(record[ALIGNMENT_LENGTH] for record in read_data)
    total_identity = round(weighted_identity_sum / max(1e-9, aligned_base_sum), 6)

    export_chromosome_summary_to_csv(read_data=read_data,
                                     chromosome_data=chromosome_data,
                                     output_dir=output_dir,
                                     bam_path=bam_path,
                                     chromosome_name=chromosome_name)

    centromere_coordinates = None
    if centromere_table_path is not None:
        centromere_coordinates = read_centromere_table(centromere_table_path=centromere_table_path,
                                                       target_chromosome_name=chromosome_name)

    gap_coordinates = None
    if gap_table_path is not None:
        gap_coordinates = read_gap_table(table_path=gap_table_path,
                                         target_chromosome_name=chromosome_name)

    segdup_coordinates = None
    if segdup_table_path is not None:
        segdup_coordinates = read_gap_table(table_path=segdup_table_path,
                                            target_chromosome_name=chromosome_name,
                                            size_cutoff=10000)

    figure, axes = plot_contigs(output_dir=output_dir,
                                read_data=read_data,
                                chromosome_name=chromosome_name,
                                chromosome_length=chromosome_length,
                                total_identity=total_identity,
                                bam_path=bam_path,
                                centromere_coordinates=centromere_coordinates,
                                gap_coordinates=gap_coordinates,
                                segdup_coordinates=segdup_coordinates,
                                show=False)

    pyplot.close(figure)
def generate_window_run_length_encoding(bam_file_path, reference_file_path, chromosome_name, window, output_dir,
                                        sort_sequences_by_length=False, reverse_sort=False, two_pass=False,
                                        save_data=True, print_results=False, plot_results=False, counter=None,
                                        n_chunks=None):
    """
    Run the pileup generator for a single specified window and encode it as run-length training data.

    The read and reference segments of the window are collapsed with collapse_repeats(), aligned with
    get_spoa_alignment(), and converted into pileup/repeat/reversal matrices. The matrices are saved only when the
    aligned reference (gaps removed) reproduces the collapsed reference; otherwise a diagnostic is printed.

    :param bam_file_path: BAM containing the aligned reads
    :param reference_file_path: reference FASTA the reads were aligned to
    :param chromosome_name: contig that the window lies on
    :param window: indexable pair; window[0] is the pileup start and window[1] the pileup end
    :param output_dir: directory forwarded to save_run_length_training_data()
    :param sort_sequences_by_length: sort the read segments by length before alignment (prints them before/after)
    :param reverse_sort: sort in descending order when sort_sequences_by_length is set
    :param two_pass: forwarded to get_spoa_alignment()
    :param save_data: save the matrices when the reference alignment check passes
    :param print_results: print the segments and alignment strings
    :param plot_results: plot the encoded pileup
    :param counter: optional shared counter (object with a .value attribute) incremented once per finished window
    :param n_chunks: total number of windows, used for the progress percentage; no percentage is written if unset
    :return: None
    """
    bam_handler = BamHandler(bam_file_path)
    fasta_handler = FastaHandler(reference_file_path)

    pileup_start = window[0]
    pileup_end = window[1]

    ref_sequence, read_ids, sequences, reversal_statuses = get_aligned_segments(fasta_handler=fasta_handler,
                                                                                bam_handler=bam_handler,
                                                                                chromosome_name=chromosome_name,
                                                                                pileup_start=pileup_start,
                                                                                pileup_end=pileup_end,
                                                                                include_ref=True)

    if sequences is None:
        return

    if sort_sequences_by_length:
        for sequence in sequences:
            print(sequence)
        print()

        sequences = sorted(sequences, key=len, reverse=reverse_sort)

        for sequence in sequences:
            print(sequence)

        # NOTE(review): assumed to belong to the debug output of the sorting branch - confirm
        print("ref", ref_sequence)

    sequences, repeats = collapse_repeats(sequences)
    ref_sequence, ref_repeats = collapse_repeats([ref_sequence])
    ref_sequence = ref_sequence[0]

    alignments, ref_alignment = get_spoa_alignment(sequences=sequences, ref_sequence=ref_sequence,
                                                   two_pass=two_pass)

    pileup_matrix, pileup_repeat_matrix = convert_collapsed_alignments_to_one_hot_tensor(alignments,
                                                                                         repeats,
                                                                                         fixed_coverage=False)

    reference_matrix, reference_repeat_matrix = convert_collapsed_alignments_to_one_hot_tensor(
        ref_alignment,
        ref_repeats,
        fixed_coverage=False)

    reversal_matrix = convert_reversal_statuses_to_integer_matrix(reverse_statuses=reversal_statuses,
                                                                  pileup_matrix=pileup_matrix)

    if plot_results:
        n_channels, height, width = pileup_matrix.shape

        x_pileup = pileup_matrix.reshape([n_channels, height, width])
        y_pileup = reference_matrix.reshape([5, 1, width])
        x_repeat = pileup_repeat_matrix.reshape([1, height, width])
        y_repeat = reference_repeat_matrix.reshape([1, width])
        reversal = reversal_matrix.reshape([1, height, width])

        x_pileup_flat = flatten_one_hot_tensor(x_pileup)
        y_pileup_flat = flatten_one_hot_tensor(y_pileup)

        plot_runlength_prediction_stranded(x_pileup=x_pileup_flat,
                                           x_repeat=x_repeat.squeeze(),
                                           y_pileup=y_pileup_flat,
                                           y_repeat=y_repeat,
                                           reversal=reversal.squeeze(),
                                           show_reversal=False,
                                           label=True)

    if print_results:
        print_segments(ref_sequence, sequences)

        for a, alignstring in enumerate(alignments):
            print("{0:15s} {1:s}".format(str(a), alignstring))

        for alignstring in ref_alignment:
            print("{0:15s} {1:s}".format("ref", alignstring))

    aligned_ref_sequence = ref_alignment[0].replace("-", '')

    if aligned_ref_sequence != ref_sequence:
        print("Aligned reference does not match true reference at [%d,%d]" % (pileup_start, pileup_end))
        print("unaligned:\t", ref_sequence)
        # Print the whole de-gapped aligned reference (previously only its second character was shown)
        print("aligned:\t", aligned_ref_sequence)

    elif save_data:
        save_run_length_training_data(output_dir=output_dir,
                                      pileup_matrix=pileup_matrix,
                                      reference_matrix=reference_matrix,
                                      pileup_repeat_matrix=pileup_repeat_matrix,
                                      reference_repeat_matrix=reference_repeat_matrix,
                                      reversal_matrix=reversal_matrix,
                                      chromosome_name=chromosome_name,
                                      start=pileup_start)

    if counter is not None:
        counter.value += 1

        # A percentage can only be reported when the total number of windows is known and non-zero
        if n_chunks:
            sys.stdout.write('\r' + "%.2f%% Completed" % (100 * counter.value / n_chunks))
def main():
    """
    Run-length encode a reference and a read set, align them as RLE, and print a sample of aligned segments.

    The RLE FASTA files and the BAM are written into a timestamped directory below "output/". For the window
    [100000, 100100) on the first reference contig, the first 10 characters and lengths of every returned segment
    are printed.

    :return: None
    """
    def strip_extension(path):
        # File name without its last dot-separated suffix
        return ".".join(os.path.basename(path).split(".")[:-1])

    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    read_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/guppy/subsampled/11-29/r94_ec_rad2.30x-30kb.fasta"

    output_dir = os.path.join("output/", "runlength_matrix_from_sequence_" + FileManager.get_datetime_string())
    FileManager.ensure_directory_exists(output_dir)

    runlength_ref_fasta_path = os.path.join(output_dir, strip_extension(ref_fasta_path) + "_rle.fasta")
    runlength_read_fasta_path = os.path.join(output_dir, strip_extension(read_fasta_path) + "_rle.fasta")

    sys.stderr.write("RL encoding fasta...\n")

    runlength_ref_sequences = runlength_encode_fasta(fasta_sequence_path=ref_fasta_path)
    runlength_read_sequences = runlength_encode_fasta(fasta_sequence_path=read_fasta_path)

    sys.stderr.write("Aligning RLE fasta...\n")

    read_vs_ref_bam_path = align_as_RLE(runlength_reference_path=runlength_ref_fasta_path,
                                        runlength_ref_sequences=runlength_ref_sequences,
                                        runlength_read_path=runlength_read_fasta_path,
                                        runlength_read_sequences=runlength_read_sequences,
                                        output_dir=output_dir)

    bam_handler = BamHandler(read_vs_ref_bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    chromosome_name = fasta_handler.get_contig_names()[0]

    print(fasta_handler.get_chr_sequence_length(chromosome_name))

    window_start = 100000
    sequences, lengths = get_read_segments(fasta_handler=fasta_handler,
                                           bam_handler=bam_handler,
                                           chromosome_name=chromosome_name,
                                           pileup_start=window_start,
                                           pileup_end=window_start + 100,
                                           runlength_ref_sequences=runlength_ref_sequences,
                                           read_data=runlength_read_sequences)

    for key in sequences:
        print(key)
        print(sequences[key][:10])
        print(lengths[key][:10])
def main():
    """
    Compare Bayesian and modal run-length consensus against the reference over the first 10 windows.

    The reference and reads are run-length encoded and aligned as RLE; for each 1000 bp window of the first
    contig a consensus is called twice with get_consensus_from_runlength_pileup_encoding() (default settings and
    bayesian=False). The true-vs-predicted confusion matrices are accumulated, their accuracies printed, and both
    matrices are shown and saved as log10 images in a timestamped directory below "output/".

    :return: None
    """
    def strip_extension(path):
        # File name without its last dot-separated suffix
        return ".".join(os.path.basename(path).split(".")[:-1])

    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    read_fasta_path = "/home/ryan/code/runlength_analysis/data/sequence_subset_ecoli_guppy-runnie_60x_test.fastq"
    matrix_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_from_sequence_2019_4_5_15_29_28_403950/probability_matrices_2019_4_5_15_35_57_920301.csv"

    output_parent_dir = "output/"
    output_dir = "runlength_matrix_from_sequence_" + FileManager.get_datetime_string()
    output_dir = os.path.join(output_parent_dir, output_dir)
    FileManager.ensure_directory_exists(output_dir)

    runlength_ref_fasta_path = os.path.join(output_dir, strip_extension(ref_fasta_path) + "_rle.fasta")
    runlength_read_fasta_path = os.path.join(output_dir, strip_extension(read_fasta_path) + "_rle.fasta")

    runlength_ref_sequences = runlength_encode_fasta(fasta_sequence_path=ref_fasta_path)
    runlength_read_sequences = runlength_encode_fasta(fasta_sequence_path=read_fasta_path)

    read_vs_ref_bam_path = align_as_RLE(runlength_reference_path=runlength_ref_fasta_path,
                                        runlength_ref_sequences=runlength_ref_sequences,
                                        runlength_read_path=runlength_read_fasta_path,
                                        runlength_read_sequences=runlength_read_sequences,
                                        output_dir=output_dir)

    bam_handler = BamHandler(read_vs_ref_bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    contig_names = fasta_handler.get_contig_names()
    chromosome_name = contig_names[0]
    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    windows = chunk_chromosome_coordinates(chromosome_length=chromosome_length, chunk_size=1000)

    # Initialize empty confusion matrices
    total_confusion = get_runlength_confusion([], [], 10)
    total_modal_confusion = get_runlength_confusion([], [], 10)

    length_classifier = RunlengthClassifier(matrix_path)

    print("reading BAM")
    for pileup_start, pileup_end in windows[:10]:
        print("window", pileup_start, pileup_end)
        sys.stderr.write("\r%s" % pileup_start)

        aligned_ref_sequence, aligned_ref_lengths, aligned_sequences, aligned_lengths, reversal_statuses = \
            get_aligned_segments(fasta_handler=fasta_handler,
                                 bam_handler=bam_handler,
                                 chromosome_name=chromosome_name,
                                 pileup_start=pileup_start,
                                 pileup_end=pileup_end,
                                 runlength_ref_sequences=runlength_ref_sequences,
                                 read_data=runlength_read_sequences)

        # No reads here?
        if len(aligned_sequences) == 0:
            continue

        sequence_encoding = list()
        length_encoding = list()
        reversal_encoding = list()

        for read_id, read_sequence in aligned_sequences.items():
            sequence_encoding.append(list(map(get_encoding, read_sequence)))
            length_encoding.append(aligned_lengths[read_id])
            reversal_encoding.append(reversal_statuses[read_id])

        ref_sequence_encoding = [list(map(get_encoding, aligned_ref_sequence))]
        ref_lengths_encoding = [aligned_ref_lengths]

        # The numpy.int / numpy.float / numpy.bool aliases were removed in NumPy 1.24; the builtins they aliased
        # select exactly the same dtypes
        ref_sequence_encoding = numpy.array(ref_sequence_encoding, dtype=int)
        ref_length_encoding = numpy.array(ref_lengths_encoding, dtype=int)
        sequence_encoding = numpy.array(sequence_encoding, dtype=int)
        length_encoding = numpy.array(length_encoding, dtype=float)
        reversal_encoding = numpy.array(reversal_encoding, dtype=bool)

        ref_sequence_encoding = numpy.atleast_2d(ref_sequence_encoding)
        ref_length_encoding = numpy.atleast_2d(ref_length_encoding)
        sequence_encoding = numpy.atleast_2d(sequence_encoding)
        length_encoding = numpy.atleast_2d(length_encoding)

        # The reference encodings are only consumed by this optional (disabled) pileup plot:
        # plot_runlength_pileup(sequences=-sequence_encoding,
        #                       lengths=length_encoding,
        #                       ref_sequence=-ref_sequence_encoding,
        #                       ref_lengths=ref_length_encoding)

        consensus_sequence, consensus_lengths = \
            get_consensus_from_runlength_pileup_encoding(length_classifier=length_classifier,
                                                         sequence_encoding=sequence_encoding,
                                                         length_encoding=length_encoding,
                                                         reversal_encoding=reversal_encoding)

        modal_consensus_sequence, modal_consensus_lengths = \
            get_consensus_from_runlength_pileup_encoding(length_classifier=length_classifier,
                                                         sequence_encoding=sequence_encoding,
                                                         length_encoding=length_encoding,
                                                         reversal_encoding=reversal_encoding,
                                                         bayesian=False)

        print()
        print("PREDICTED\t", consensus_lengths[:10])
        print("TRUE\t\t", aligned_ref_lengths[:10])

        total_confusion += get_runlength_confusion(true_lengths=aligned_ref_lengths,
                                                   predicted_lengths=consensus_lengths,
                                                   max_length=10)

        total_modal_confusion += get_runlength_confusion(true_lengths=aligned_ref_lengths,
                                                         predicted_lengths=modal_consensus_lengths,
                                                         max_length=10)

    print()

    accuracy = get_accuracy_from_confusion_matrix(total_confusion)
    print("Bayes:", accuracy)

    accuracy = get_accuracy_from_confusion_matrix(total_modal_confusion)
    print("No Bayes", accuracy)

    # Show and save both confusion matrices (log10 scale) with identical layout
    for plot_filename, confusion_matrix in (("confusion.png", total_confusion),
                                            ("modal_confusion.png", total_modal_confusion)):
        plot_path = os.path.join(output_dir, plot_filename)

        figure = pyplot.figure()
        axes = pyplot.axes()
        axes.set_xlabel("Predicted")
        axes.set_ylabel("True")

        pyplot.imshow(numpy.log10(confusion_matrix))
        pyplot.show()

        figure.savefig(plot_path)
        pyplot.close()
def main():
    """
    Compare modal vs. Weibull run-length consensus against the reference over a handful of pileup windows.

    Steps performed here:
      1. Load every read yielded by RunlengthHandler for `runlength_path` into a dict keyed by read id.
      2. Run-length encode the reference and align the reads to it with `align_as_RLE`.
      3. For windows[10:20] of the first contig (chunk_size=1000), fetch the aligned segments, build the
         pileup encodings, and compute a consensus with both the modal classifier and the Weibull classifier.
      4. Accumulate one confusion matrix per method, print the accuracy of each, and save both matrices as
         log10-scaled images in a timestamped output directory.

    All input paths are hard-coded below; the function takes no arguments and returns nothing.
    """
    # ref_fasta_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_ref.fasta"
    # runlength_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_runnie.out"

    # ref_fasta_path = "/home/ryan/code/runlength_analysis/data/synthetic_runnie_test_2019_4_8_14_33_30_333396_ref.fasta"
    # runlength_path = "/home/ryan/code/runlength_analysis/data/synthetic_runnie_test_2019_4_8_14_33_30_333396_runnie.out"

    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    runlength_path = "/home/ryan/code/runlength_analysis/data/runnie_subset_test_flipflop_regional_0to10k.out"

    # WG ecoli 60x
    matrix_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_from_runnie_WG_train_60x_guppy_2019_4_23/probability_matrices_2019_4_23_15_9_14_837893.csv"
    raw_matrix_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_from_runnie_WG_train_60x_guppy_2019_4_23/frequency_matrices_2019_4_23_15_9_14_833128.csv"

    output_parent_dir = "output/"
    output_dir = "runlength_prediction_from_runnie_output_" + FileManager.get_datetime_string()
    output_dir = os.path.join(output_parent_dir, output_dir)
    FileManager.ensure_directory_exists(output_dir)

    def save_confusion_plot(confusion_matrix, plot_filename):
        """Show a log10-scaled image of `confusion_matrix`, then save it into `output_dir`."""
        plot_path = os.path.join(output_dir, plot_filename)

        figure = pyplot.figure()
        axes = pyplot.axes()
        axes.set_xlabel("Predicted")
        axes.set_ylabel("True")

        pyplot.imshow(numpy.log10(confusion_matrix))
        pyplot.show()
        figure.savefig(plot_path)
        pyplot.close()

    # Output FASTA names are the input basenames with the last extension swapped for "_rle.fasta"
    ref_fasta_filename_prefix = ".".join(os.path.basename(ref_fasta_path).split(".")[:-1])
    runlength_ref_fasta_filename = ref_fasta_filename_prefix + "_rle.fasta"
    runlength_ref_fasta_path = os.path.join(output_dir, runlength_ref_fasta_filename)

    assembly_fasta_filename_prefix = ".".join(os.path.basename(runlength_path).split(".")[:-1])
    runlength_assembly_fasta_filename = assembly_fasta_filename_prefix + "_rle.fasta"
    runlength_assembly_fasta_path = os.path.join(output_dir, runlength_assembly_fasta_filename)

    handler = RunlengthHandler(runlength_path)
    reads = handler.iterate_file(sequence_cutoff=sys.maxsize, print_status=True)

    # Index the reads by id so aligned segments can be looked up later
    read_data = {read.id: read for read in reads}

    print("\nRLE encoding reference sequence...")

    runlength_ref_sequences = runlength_encode_fasta(fasta_sequence_path=ref_fasta_path)

    assembly_vs_ref_bam_path = align_as_RLE(runlength_reference_path=runlength_ref_fasta_path,
                                            runlength_ref_sequences=runlength_ref_sequences,
                                            runlength_read_path=runlength_assembly_fasta_path,
                                            runlength_read_sequences=read_data,
                                            output_dir=output_dir)

    bam_handler = BamHandler(assembly_vs_ref_bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    # Only the first contig of the RLE reference is evaluated
    contig_names = fasta_handler.get_contig_names()
    chromosome_name = contig_names[0]
    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    windows = chunk_chromosome_coordinates(chromosome_length=chromosome_length, chunk_size=1000)

    # Start both accumulators from the confusion matrix of empty inputs
    total_confusion = get_runlength_confusion([], [], 10)
    total_confusion_weibull = get_runlength_confusion([], [], 10)

    length_classifier = RunlengthClassifier(matrix_path)
    # length_classifier_weibull = WeibullRunlengthClassifier(matrix_path)
    length_classifier_weibull = WeibullRunlengthClassifier(raw_matrix_path,
                                                           normalize_matrix=True,
                                                           pseudocount=0.05)

    print("reading BAM")
    for pileup_start, pileup_end in windows[10:20]:
        sys.stderr.write("\r%s" % pileup_start)

        aligned_ref_sequence, aligned_ref_lengths, aligned_sequences, aligned_scales, aligned_shapes, reversal_statuses = \
            get_aligned_segments(fasta_handler=fasta_handler,
                                 bam_handler=bam_handler,
                                 chromosome_name=chromosome_name,
                                 pileup_start=pileup_start,
                                 pileup_end=pileup_end,
                                 runlength_ref_sequences=runlength_ref_sequences,
                                 read_data=read_data)

        # No reads here?
        if len(aligned_sequences) == 0:
            continue

        sequence_encoding = list()
        scale_encoding = list()
        shape_encoding = list()
        modes_encoding = list()
        reversal_encoding = list()

        try:
            # print("REF\t", "".join(aligned_ref_sequence))
            for read_id in aligned_sequences:
                # print("READ\t%s\t%s" % (read_id,"".join(aligned_sequences[read_id])))
                sequence_encoding.append([get_encoding(base) for base in aligned_sequences[read_id]])
                scale_encoding.append(aligned_scales[read_id])
                shape_encoding.append(aligned_shapes[read_id])
                # map_parameters_to_mode receives each (scale, shape) pair as a single tuple argument
                modes_encoding.append([map_parameters_to_mode(parameters)
                                       for parameters in zip(aligned_scales[read_id], aligned_shapes[read_id])])
                reversal_encoding.append(reversal_statuses[read_id])

            ref_sequence_encoding = [[get_encoding(base) for base in aligned_ref_sequence]]
            ref_lengths_encoding = [aligned_ref_lengths]

            # Builtin int/float/bool replace the numpy.int/numpy.float/numpy.bool aliases, which were
            # removed in NumPy 1.24 (they were plain aliases of the builtins, so dtypes are unchanged).
            ref_sequence_encoding = numpy.atleast_2d(numpy.array(ref_sequence_encoding, dtype=int))
            ref_length_encoding = numpy.atleast_2d(numpy.array(ref_lengths_encoding, dtype=int))
            sequence_encoding = numpy.atleast_2d(numpy.array(sequence_encoding, dtype=int))
            scale_encoding = numpy.atleast_2d(numpy.array(scale_encoding, dtype=float))
            shape_encoding = numpy.atleast_2d(numpy.array(shape_encoding, dtype=float))
            modes_encoding = numpy.atleast_2d(numpy.array(modes_encoding, dtype=int))
            reversal_encoding = numpy.array(reversal_encoding, dtype=bool)

            consensus_sequence, consensus_lengths = \
                get_consensus_from_modal_pileup_encoding(length_classifier=length_classifier,
                                                         sequence_encoding=sequence_encoding,
                                                         length_encoding=modes_encoding,
                                                         reversal_encoding=reversal_encoding)

            weibull_consensus_sequence, weibull_consensus_lengths = \
                get_consensus_from_weibull_pileup_encoding(length_classifier=length_classifier_weibull,
                                                           sequence_encoding=sequence_encoding,
                                                           scale_encoding=scale_encoding,
                                                           shape_encoding=shape_encoding,
                                                           reversal_encoding=reversal_encoding)

            plot_runlength_pileup(
                sequences=-sequence_encoding,
                scales=scale_encoding,
                shapes=shape_encoding,
                modes=modes_encoding,
                ref_sequence=-ref_sequence_encoding,
                ref_lengths=ref_length_encoding,
                predicted_sequence=-numpy.atleast_2d(numpy.array(weibull_consensus_sequence, dtype=int)),
                predicted_lengths=numpy.atleast_2d(numpy.array(weibull_consensus_lengths, dtype=int)))

            print()
            print("PREDICTED\t", weibull_consensus_lengths[:10])
            print("TRUE\t\t", aligned_ref_lengths[:10])

            confusion = get_runlength_confusion(true_lengths=aligned_ref_lengths,
                                                predicted_lengths=consensus_lengths,
                                                max_length=10)

            confusion_weibull = get_runlength_confusion(true_lengths=aligned_ref_lengths,
                                                        predicted_lengths=weibull_consensus_lengths,
                                                        max_length=10)

            total_confusion += confusion
            total_confusion_weibull += confusion_weibull

        except Exception as e:
            # Best-effort: a failing window is skipped rather than aborting the run, but the message names
            # the window and the exception type so that systematic failures are not mistaken for noise.
            sys.stderr.write("\nSkipping window %s-%s: %s: %s\n"
                             % (pileup_start, pileup_end, type(e).__name__, e))
            continue

    print()

    accuracy = get_accuracy_from_confusion_matrix(total_confusion)
    print("Modal: ", accuracy)

    accuracy = get_accuracy_from_confusion_matrix(total_confusion_weibull)
    print("Full: ", accuracy)

    save_confusion_plot(total_confusion, "confusion.png")
    save_confusion_plot(total_confusion_weibull, "confusion_weibull.png")
def main():
    """
    Plot the run-length pileup for a single hard-coded reference window (6000-6050).

    Loads every read yielded by RunlengthHandler for `runlength_path`, run-length encodes the reference,
    aligns the reads with `align_as_RLE`, fetches the segments aligned to the window on the first contig,
    prints them, and passes the encoded sequences, Weibull scales/shapes and derived modes to
    `plot_runlength_pileup`.

    All input paths are hard-coded below; the function takes no arguments and returns nothing.
    """
    # ref_fasta_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_ref.fasta"
    # runlength_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_runnie.out"

    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    runlength_path = "/home/ryan/code/runlength_analysis/data/runnie_subset_test_flipflop_regional_0to10k.out"

    pileup_start = 6000
    pileup_end = 6050

    output_parent_dir = "output/"
    output_dir = "runlength_pileup_test_" + FileManager.get_datetime_string()
    output_dir = os.path.join(output_parent_dir, output_dir)
    FileManager.ensure_directory_exists(output_dir)

    # Output FASTA names are the input basenames with the last extension swapped for "_rle.fasta"
    ref_fasta_filename_prefix = ".".join(os.path.basename(ref_fasta_path).split(".")[:-1])
    runlength_ref_fasta_filename = ref_fasta_filename_prefix + "_rle.fasta"
    runlength_ref_fasta_path = os.path.join(output_dir, runlength_ref_fasta_filename)

    assembly_fasta_filename_prefix = ".".join(os.path.basename(runlength_path).split(".")[:-1])
    runlength_assembly_fasta_filename = assembly_fasta_filename_prefix + "_rle.fasta"
    runlength_assembly_fasta_path = os.path.join(output_dir, runlength_assembly_fasta_filename)

    handler = RunlengthHandler(runlength_path)
    reads = handler.iterate_file(sequence_cutoff=sys.maxsize, print_status=True)

    # Index the reads by id so aligned segments can be looked up later
    read_data = {read.id: read for read in reads}

    print("\nRLE encoding reference sequence...")

    runlength_ref_sequences = runlength_encode_fasta(fasta_sequence_path=ref_fasta_path)

    assembly_vs_ref_bam_path = align_as_RLE(runlength_reference_path=runlength_ref_fasta_path,
                                            runlength_ref_sequences=runlength_ref_sequences,
                                            runlength_read_path=runlength_assembly_fasta_path,
                                            runlength_read_sequences=read_data,
                                            output_dir=output_dir)

    bam_handler = BamHandler(assembly_vs_ref_bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    # Only the first contig of the RLE reference is inspected
    contig_names = fasta_handler.get_contig_names()
    chromosome_name = contig_names[0]

    aligned_ref_sequence, aligned_ref_lengths, aligned_sequences, aligned_scales, aligned_shapes, reversal_statuses = \
        get_aligned_segments(fasta_handler=fasta_handler,
                             bam_handler=bam_handler,
                             chromosome_name=chromosome_name,
                             pileup_start=pileup_start,
                             pileup_end=pileup_end,
                             runlength_ref_sequences=runlength_ref_sequences,
                             read_data=read_data)

    sequence_encoding = list()
    scale_encoding = list()
    shape_encoding = list()
    modes_encoding = list()

    print(len(aligned_sequences.keys()))
    print("REF\t", "".join(aligned_ref_sequence))
    for read_id in aligned_sequences:
        print("READ\t%s\t%s" % (read_id, "".join(aligned_sequences[read_id])))
        sequence_encoding.append([get_encoding(base) for base in aligned_sequences[read_id]])
        scale_encoding.append(aligned_scales[read_id])
        shape_encoding.append(aligned_shapes[read_id])
        # map_parameters_to_mode receives each (scale, shape) pair as a single tuple argument
        modes_encoding.append([map_parameters_to_mode(parameters)
                               for parameters in zip(aligned_scales[read_id], aligned_shapes[read_id])])

    # Builtin float replaces the numpy.float alias, which was removed in NumPy 1.24 (same dtype: float64).
    # The sequence encoding is negated before plotting, as in the original call.
    sequence_encoding = -numpy.array(sequence_encoding, dtype=float)
    scale_encoding = numpy.array(scale_encoding, dtype=float)
    shape_encoding = numpy.array(shape_encoding, dtype=float)
    modes_encoding = numpy.array(modes_encoding, dtype=float)

    plot_runlength_pileup(sequences=sequence_encoding,
                          scales=scale_encoding,
                          shapes=shape_encoding,
                          modes=modes_encoding)
def main():
    """
    Print the first few aligned read-segment values for one hard-coded reference window.

    Loads every read yielded by RunlengthHandler for `runlength_path`, run-length encodes the reference,
    aligns the reads with `align_as_RLE`, then calls `get_read_segments` for positions 100000-100100 of the
    first contig and prints, per key, the first 10 entries of the returned sequences, scales and shapes.

    All input paths are hard-coded below; the function takes no arguments and returns nothing.
    """
    # ref_fasta_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_ref.fasta"
    # runlength_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_runnie.out"

    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    runlength_path = "/home/ryan/data/Nanopore/ecoli/runnie/out/test/rad2_pass_runnie_4_5_6_7.out"

    # Timestamped output directory under "output/"
    run_directory_name = "runlength_matrix_from_runnie_output_" + FileManager.get_datetime_string()
    output_dir = os.path.join("output/", run_directory_name)
    FileManager.ensure_directory_exists(output_dir)

    # Everything before the final "." of each input basename, with "_rle.fasta" appended
    ref_prefix = os.path.basename(ref_fasta_path).rpartition(".")[0]
    runlength_ref_fasta_path = os.path.join(output_dir, ref_prefix + "_rle.fasta")

    read_prefix = os.path.basename(runlength_path).rpartition(".")[0]
    runlength_read_fasta_path = os.path.join(output_dir, read_prefix + "_rle.fasta")

    # Index every read by its id
    read_iterator = RunlengthHandler(runlength_path).iterate_file(sequence_cutoff=sys.maxsize,
                                                                  print_status=True)
    read_data = {read.id: read for read in read_iterator}

    print("\nRLE encoding reference sequence...")

    runlength_ref_sequences = runlength_encode_fasta(fasta_sequence_path=ref_fasta_path)

    assembly_vs_ref_bam_path = align_as_RLE(runlength_reference_path=runlength_ref_fasta_path,
                                            runlength_ref_sequences=runlength_ref_sequences,
                                            runlength_read_path=runlength_read_fasta_path,
                                            runlength_read_sequences=read_data,
                                            output_dir=output_dir)

    bam_handler = BamHandler(assembly_vs_ref_bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    # Only the first contig is inspected; its length is looked up but not used further here
    chromosome_name = fasta_handler.get_contig_names()[0]
    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    window_start = 100000
    sequences, scales, shapes = get_read_segments(fasta_handler=fasta_handler,
                                                  bam_handler=bam_handler,
                                                  chromosome_name=chromosome_name,
                                                  pileup_start=window_start,
                                                  pileup_end=window_start + 100,
                                                  runlength_ref_sequences=runlength_ref_sequences,
                                                  read_data=read_data)

    for key in sequences:
        print(key)
        for per_read_values in (sequences, scales, shapes):
            print(per_read_values[key][:10])