Exemplo n.º 1
0
def plot_kernel_distribution(pdf,
                             cdf,
                             bins,
                             save=False,
                             output_dir=None,
                             filename=None):
    """
    Draw a two-panel figure (line plot of cdf above a bar chart of pdf), then either save it or show it
    :param pdf: bar heights for the lower panel
    :param cdf: values drawn as a line in the upper panel
    :param bins: presumably bin edges; must support slicing and elementwise arithmetic
    :param save: if True, write "<filename>_distributions.png" into output_dir; otherwise show the figure
    :param output_dir: passed to FileManager.ensure_directory_exists, only used when save is True
    :param filename: filename prefix, only used when save is True
    :return:
    """
    n_steps = 100
    bar_width = float(1.0 / n_steps)

    # midpoint of each adjacent pair of entries in bins, shifted left by half a bar width
    lower_edges = bins[:-1]
    upper_edges = bins[1:]
    bar_positions = (lower_edges + upper_edges) / 2 - bar_width / 2

    figure, (cdf_axis, pdf_axis) = pyplot.subplots(nrows=2)
    cdf_axis.plot(cdf)
    pdf_axis.bar(bar_positions, pdf, width=bar_width, align="center")
    pdf_axis.set_ylabel("kernel sum")

    if save:
        FileManager.ensure_directory_exists(output_dir)
        pyplot.savefig(os.path.join(output_dir, filename + "_distributions.png"))
    else:
        pyplot.show()

    pyplot.close()
Exemplo n.º 2
0
def main(max_threads=None):
    """
    Map arg_unpacker over the jobs produced by arg_iterator in a process pool, then concatenate every
    .fasta file found in the output directory into a single file and delete the individual files
    :param max_threads: pool size; when None, cpu count minus 2 is used (never less than 1)
    :return:
    """
    # alternative input: "/home/ryan/data/Nanopore/ecoli/runnie/out/rad2_pass_runnie_0.out"
    runlength_path = "/home/ryan/data/Nanopore/ecoli/runnie/out/rad2_pass_runnie_0_1_10_11_12_13.out"

    run_directory_name = "runlength_matrix_from_assembly_contigs_" + FileManager.get_datetime_string()
    output_dir = os.path.join("output/version_comparison/mode/", run_directory_name)
    FileManager.ensure_directory_exists(output_dir)

    handler = RunlengthHandler(runlength_path)

    if max_threads is None:
        max_threads = max(1, multiprocessing.cpu_count() - 2)

    with multiprocessing.Pool(processes=max_threads) as pool:
        jobs = arg_iterator(handler=handler, output_dir=output_dir)

        # the results themselves are not used, only counted for the progress display
        for n_completed, _ in enumerate(pool.imap(arg_unpacker, jobs)):
            sys.stdout.write("\r%d" % n_completed)
    print()

    print("Concatenating files...")
    fasta_paths = FileManager.get_all_file_paths_by_type(parent_directory_path=output_dir, file_extension=".fasta")

    # name the merged file after the input, using everything before its first "."
    merged_filename = os.path.basename(runlength_path).split(".")[0] + ".fasta"
    merged_path = os.path.join(output_dir, merged_filename)

    print("Saving to file: %s" % merged_path)

    FileManager.concatenate_files(file_paths=fasta_paths, output_file_path=merged_path)
    FileManager.delete_files(fasta_paths)
Exemplo n.º 3
0
def save_run_length_training_data(output_dir, pileup_matrix, reference_matrix,
                                  pileup_repeat_matrix,
                                  reference_repeat_matrix, reversal_matrix,
                                  chromosome_name, start):
    """
    Write five arrays to one compressed archive at
    <output_dir>/<chromosome_name>/<chromosome_name>_<start>_matrix.npz
    :param output_dir: parent directory of the per-chromosome directories
    :param pileup_matrix: stored under key "x_pileup"
    :param reference_matrix: stored under key "y_pileup"
    :param pileup_repeat_matrix: stored under key "x_repeat"
    :param reference_repeat_matrix: stored under key "y_repeat"
    :param reversal_matrix: stored under key "reversal"
    :param chromosome_name: used for both the subdirectory and the filename prefix
    :param start: appended to the filename after str() conversion
    :return:
    """
    chromosome_dir = os.path.join(output_dir, chromosome_name)

    # FileManager is only consulted when the chromosomal directory is missing
    if not os.path.exists(chromosome_dir):
        FileManager.ensure_directory_exists(chromosome_dir)

    archive_name = "_".join([chromosome_name, str(start)]) + "_matrix" + ".npz"
    archive_path = os.path.join(chromosome_dir, archive_name)

    arrays = {
        "x_pileup": pileup_matrix,
        "y_pileup": reference_matrix,
        "x_repeat": pileup_repeat_matrix,
        "y_repeat": reference_repeat_matrix,
        "reversal": reversal_matrix,
    }

    numpy.savez_compressed(archive_path, **arrays)
Exemplo n.º 4
0
def main():
    """
    Make a synthetic reference and a set of observations, and save them under data/ as
    synthetic_coverage_data_marginpolish_<datetime>.tsv (one tab separated row per observation) and
    synthetic_coverage_data_marginpolish_<datetime>_ref.fasta (the reference, under the header ">ref_0")
    :return:
    """
    output_dir = "data/"
    FileManager.ensure_directory_exists(output_dir)

    n_coverage = 2

    ref_max_runlength = 50
    read_max_runlength = 50

    ref_sequence, observations = generate_sequences(
        ref_max_runlength=ref_max_runlength,
        read_max_runlength=read_max_runlength,
        n_coverage=n_coverage,
        scale_coverage=True)

    # both output files share one timestamp so they can be paired up later
    datetime_string = FileManager.get_datetime_string()
    filename_prefix = "synthetic_coverage_data_marginpolish_" + datetime_string

    observations_path = os.path.join(output_dir, filename_prefix + ".tsv")

    # the context manager closes the handle even if writing a row raises
    with open(observations_path, "w") as observations_file:
        writer = csv.writer(observations_file, delimiter="\t")
        writer.writerows(observations)

    ref_path = os.path.join(output_dir, filename_prefix + "_ref.fasta")

    with open(ref_path, "w") as ref_file:
        ref_file.write(">ref_0\n")
        ref_file.write(ref_sequence)
Exemplo n.º 5
0
def main(summary_glob, output_dir, filter_decoys, args):
    """
    Aggregate the summary files matched by a glob, call mmm on each file's identities and on the pooled
    identities, and optionally draw identity violin plots
    :param summary_glob: glob pattern selecting the summary files; the process exits with status 1 if nothing matches
    :param output_dir: directory used as the base location for plot output
    :param filter_decoys: if True, the matched paths are passed through filter_decoys_from_paths
    :param args: options object; sample, plot, comparison_glob and min_read_length are read here, and it is
                 also forwarded to aggregate_summary_data
    :return:
    """
    FileManager.ensure_directory_exists(output_dir)

    summary_file_paths = glob.glob(summary_glob)
    if not summary_file_paths:
        print("No files matched '{}'".format(summary_glob))
        sys.exit(1)

    if filter_decoys:
        print("Filtering decoy chromosomes")
        summary_file_paths = filter_decoys_from_paths(summary_file_paths)

    aggregated = aggregate_summary_data(summary_file_paths, args)
    _, _, identities, identities_per_file, read_lengths_per_file, _ = aggregated

    for file_name, file_identities in identities_per_file.items():
        mmm(file_identities, file_name)
    mmm(identities, "All Data")

    sample_name = args.sample
    if sample_name is None:
        # derived from the glob itself; replace this with sample name extractor function?
        sample_name = summary_glob.rstrip('/').replace('/', "_").replace('*', "_")

    # plots
    if not args.plot:
        return

    # NOTE: the identity histogram, read length vs identity and per file identity curve plots are disabled;
    # only the violin plots below are drawn
    if args.comparison_glob is None:
        plot_per_file_identity_violin(identities_per_file,
                                      title=sample_name,
                                      output_base=os.path.join(output_dir, sample_name))
        return

    comparison_paths = glob.glob(args.comparison_glob)
    if not comparison_paths:
        raise Exception("No comparison files found for '{}'".format(args.comparison_glob))

    # TODO only for rle experiment
    # this scales the threshold on the shared args object before the comparison data is aggregated
    args.min_read_length *= 0.7
    comparison = aggregate_summary_data(comparison_paths, args)
    _, _, _, comparison_identities_per_file, comparison_lengths_per_file, _ = comparison

    plot_identity_comparison_violin(identities_per_file,
                                    comparison_identities_per_file,
                                    read_lengths_per_file,
                                    comparison_lengths_per_file,
                                    title=sample_name,
                                    output_base=os.path.join(output_dir, sample_name))
Exemplo n.º 6
0
def write_windows_to_file(windows, output_dir, filename):
    """
    Pickle windows to <output_dir>/<filename>_windows.pkl using the highest available pickle protocol
    :param windows: object to serialize
    :param output_dir: destination directory, passed to FileManager.ensure_directory_exists first
    :param filename: filename prefix; "_windows.pkl" is appended
    :return:
    """
    FileManager.ensure_directory_exists(output_dir)

    pickle_path = os.path.join(output_dir, filename + "_windows.pkl")

    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(windows, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)
Exemplo n.º 7
0
def save_model(output_directory, model):
    """
    Save the state_dict of a model to <output_directory>/model_<timestamp> and print the destination
    :param output_directory: destination directory, passed to FileManager.ensure_directory_exists first
    :param model: object exposing state_dict(); the result is handed to torch.save
    :return:
    """
    FileManager.ensure_directory_exists(output_directory)

    path = os.path.join(output_directory, "model_" + get_timestamp_string())

    print("SAVING MODEL:", path)
    torch.save(model.state_dict(), path)
Exemplo n.º 8
0
def process_bam(bam_path, reference_path):
    """
    Find useful summary data from a bam that can be represented as a table of identities, and a plot of alignments.
    Per-read statistics are printed, followed by the alignment-length-weighted identity of the contig, and
    plot_contigs is called with output directory "plots/". A contig with no aligned bases is reported and skipped.
    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :return:
    """
    print("\n" + bam_path + "\n")

    output_dir = "plots/"
    FileManager.ensure_directory_exists(output_dir)

    bam_handler = BamHandler(bam_file_path=bam_path)
    fasta_handler = FastaHandler(reference_path)

    # only this one reference contig name is examined
    chromosome_names = ["gi"]

    for chromosome_name in chromosome_names:
        chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

        # fetch reads over the whole contig
        start = 0
        stop = chromosome_length

        reads = bam_handler.get_reads(chromosome_name=chromosome_name, start=start, stop=stop)

        read_data = parse_reads(reads=reads, fasta_handler=fasta_handler, chromosome_name=chromosome_name)

        print("chromosome_name:\t", chromosome_name)
        print("chromosome_length:\t", chromosome_length)
        for data in read_data:
            read_id, reversal_status, ref_alignment_start, alignment_length, read_length, contig_length, n_initial_clipped_bases, n_total_mismatches, n_total_deletes, n_total_inserts, identity = data
            print()
            print(read_id)
            print("reversed:\t", reversal_status)
            print("alignment_start:\t", ref_alignment_start)
            print("alignment_length:\t", alignment_length)
            print("n_initial_clipped_bases:", n_initial_clipped_bases)
            print("n_total_mismatches:\t", n_total_mismatches)
            print("n_total_deletes:\t", n_total_deletes)
            print("n_total_inserts:\t", n_total_inserts)
            print("identity:\t", identity)

        # identity of each alignment weighted by its length
        total_weighted_identity = sum(x[ALIGNMENT_LENGTH] * x[IDENTITY] for x in read_data)
        total_alignment_bases = sum(x[ALIGNMENT_LENGTH] for x in read_data)

        # without aligned bases the weighted mean is undefined; skip instead of dividing by zero
        if total_alignment_bases == 0:
            print("\nNo aligned bases for " + str(chromosome_name) + ", skipping")
            continue

        total_identity = total_weighted_identity/total_alignment_bases

        print("\nTOTAL IDENTITY:\t", total_identity)

        plot_contigs(output_dir=output_dir,
                     read_data=read_data,
                     chromosome_name=chromosome_name,
                     chromosome_length=chromosome_length,
                     total_identity=total_identity,
                     bam_path=bam_path,
                     y_min=-1,
                     y_max=4,
                     show=False)
Exemplo n.º 9
0
def extract_runnie_reads_by_name(runnie_path, output_dir, output_filename_suffix, names):
    """
    Have a RunlengthHandler extract the reads whose ids are in names, writing to
    <output_dir>/runnie_subset_<output_filename_suffix>.out
    :param runnie_path: file opened with RunlengthHandler
    :param output_dir: destination directory, passed to FileManager.ensure_directory_exists
    :param output_filename_suffix: string inserted into the output filename
    :param names: ids passed as id_set to extract_reads_by_id
    :return: path of the output file
    """
    subset_filename = "".join(("runnie_subset_", output_filename_suffix, ".out"))
    subset_path = os.path.join(output_dir, subset_filename)
    FileManager.ensure_directory_exists(output_dir)

    RunlengthHandler(runnie_path).extract_reads_by_id(id_set=names,
                                                      output_path=subset_path,
                                                      print_status=True)

    return subset_path
Exemplo n.º 10
0
def extract_fastq_reads_by_name(fastq_path, output_dir, output_filename_suffix, names):
    """
    Have a FastqHandler extract the reads whose ids are in names, writing to
    <output_dir>/sequence_subset_<output_filename_suffix>.fastq
    :param fastq_path: file opened with FastqHandler
    :param output_dir: destination directory, passed to FileManager.ensure_directory_exists
    :param output_filename_suffix: string inserted into the output filename
    :param names: ids passed as id_set to extract_reads_by_id
    :return: path of the output file
    """
    subset_filename = "".join(("sequence_subset_", output_filename_suffix, ".fastq"))
    subset_path = os.path.join(output_dir, subset_filename)
    FileManager.ensure_directory_exists(output_dir)

    FastqHandler(fastq_path).extract_reads_by_id(id_set=names,
                                                 output_path=subset_path,
                                                 print_status=True)

    return subset_path
Exemplo n.º 11
0
def main(reads_file_path,
         true_ref_sequence_path=None,
         output_dir=None,
         n_passes=False):
    """
    Assemble reads with assemble_wtdbg2, align the reads back to the assembly, then run n_passes rounds of
    align_minimap + polish_racon where each round polishes the output of the previous round (the first round
    polishes the assembly). If a true reference is given, the assembly and each polished sequence are also
    aligned to it.
    :param reads_file_path: reads used for assembly, alignment and polishing
    :param true_ref_sequence_path: optional reference to align the assembly and polished sequences against
    :param output_dir: where outputs go; "./" when None, otherwise passed to FileManager.ensure_directory_exists
    :param n_passes: number of polishing rounds, used directly as the argument of range()
    :return:
    """
    if output_dir is not None:
        FileManager.ensure_directory_exists(output_dir)
    else:
        output_dir = "./"

    assembly_sequence_path = assemble_wtdbg2(output_dir=output_dir,
                                             input_file_path=reads_file_path)

    reads_vs_assembly_sam_path, reads_vs_assembly_bam_path = align_minimap(
        output_dir=output_dir,
        ref_sequence_path=assembly_sequence_path,
        reads_sequence_path=reads_file_path)

    if true_ref_sequence_path is not None:
        assembly_vs_truth_sam_path, assembly_vs_truth_bam_path = align_minimap(
            output_dir=output_dir,
            ref_sequence_path=true_ref_sequence_path,
            reads_sequence_path=assembly_sequence_path)

    polished_ref_paths = list()

    # sequence to polish in the upcoming round: the assembly first, then the latest polished output
    current_ref_path = assembly_sequence_path

    for pass_index in range(n_passes):
        # each round gets its own subdirectory named "1x", "2x", ...
        suffix = str(pass_index + 1) + "x"
        polish_output_dir = join(output_dir, suffix)
        FileManager.ensure_directory_exists(polish_output_dir)

        round_sam_path, round_bam_path = align_minimap(
            output_dir=polish_output_dir,
            ref_sequence_path=current_ref_path,
            reads_sequence_path=reads_file_path)

        polished_path = polish_racon(
            output_dir=polish_output_dir,
            reads_file_path=reads_file_path,
            reads_vs_ref_sam_path=round_sam_path,
            ref_sequence_path=current_ref_path,
            suffix=suffix)

        polished_ref_paths.append(polished_path)

        if true_ref_sequence_path is not None:
            polished_vs_truth_sam_path, polished_vs_truth_bam_path = \
                align_minimap(output_dir=polish_output_dir,
                              ref_sequence_path=true_ref_sequence_path,
                              reads_sequence_path=polished_path)

        current_ref_path = polished_path
Exemplo n.º 12
0
def main(ref_sequence_path, reads_sequence_path, minimap_preset, output_dir=None):
    """
    Align reads to a reference with align_minimap, then run process_bam on the resulting bam
    :param ref_sequence_path: reference sequence to align against
    :param reads_sequence_path: sequences to align
    :param minimap_preset: forwarded to align_minimap as preset
    :param output_dir: where outputs go; "./" when None, otherwise passed to FileManager.ensure_directory_exists
    :return:
    """
    if output_dir is not None:
        FileManager.ensure_directory_exists(output_dir)
    else:
        output_dir = "./"

    sam_path, bam_path = align_minimap(output_dir=output_dir,
                                       ref_sequence_path=ref_sequence_path,
                                       reads_sequence_path=reads_sequence_path,
                                       preset=minimap_preset)

    process_bam(bam_path=bam_path, reference_path=ref_sequence_path, output_dir=output_dir)
Exemplo n.º 13
0
def process_bam(bam_path, reference_path, output_dir=None):
    """
    For every contig of the reference, parse the alignments in a bam into inserts, deletes and mismatches
    and export them with export_variants_to_csv into a new timestamped subdirectory
    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :param output_dir: parent of the "variants_<datetime>" subdirectory; "variants/" when None
    :return:
    """
    print("\n" + bam_path)

    if output_dir is None:
        output_dir = "variants/"

    # Make a subdirectory to contain everything
    output_dir = os.path.join(output_dir, "variants_" + FileManager.get_datetime_string())
    FileManager.ensure_directory_exists(output_dir)

    bam_handler = BamHandler(bam_file_path=bam_path)
    fasta_handler = FastaHandler(reference_path)

    contig_names = sort_chromosome_names(names=fasta_handler.get_contig_names(),
                                         prefix="chr")

    print("ref contig names:", contig_names)

    for contig_name in contig_names:
        print("Parsing alignments for ref contig:", contig_name)

        contig_length = fasta_handler.get_chr_sequence_length(contig_name)

        # fetch reads over the whole contig
        reads = bam_handler.get_reads(chromosome_name=contig_name,
                                      start=0,
                                      stop=contig_length)

        inserts, deletes, mismatches = parse_reads(reads=reads,
                                                   fasta_handler=fasta_handler,
                                                   chromosome_name=contig_name)

        export_variants_to_csv(output_dir=output_dir,
                               chromosome_name=contig_name,
                               mismatches=mismatches,
                               inserts=inserts,
                               deletes=deletes,
                               merge=True)
Exemplo n.º 14
0
def main():
    """
    Split a previously generated list of read names into train/test partitions, extract the reads of each
    partition from the runnie and fastq files, then run find_intersection_of_runnie_and_fastq on each pair
    of subset files as a check
    :return:
    """
    output_dir = "output/read_names_" + FileManager.get_datetime_string()
    output_path = os.path.join(output_dir, "read_names.txt")
    FileManager.ensure_directory_exists(output_dir)

    # STEP 1
    # Find union of read names within runnie and fastq files
    # (not recomputed here: STEP 2 reads the names from an existing file instead)
    fastq_path = "/home/ryan/data/Nanopore/ecoli/guppy/r94_ec_guppy_rad2.fastq"
    runnie_path = "/home/ryan/data/Nanopore/ecoli/runnie/out/rad2_pass_all.out"

    # STEP 2
    # Split sequence names into train/test partition
    name_intersection_path = "/home/ryan/code/runlength_analysis/output/read_names_2019_3_26_11_50_guppy_runnie_intersection/read_names.txt"
    names_train, names_test = partition_names(read_names_from_file(name_intersection_path))

    # STEP 3
    # Extract names and write to files (runnie first, then fastq, for train and then test)
    subset_path_pairs = list()
    for partition_label, partition_names in (("train", names_train), ("test", names_test)):
        runnie_subset_path = extract_runnie_reads_by_name(runnie_path=runnie_path,
                                                          output_dir=output_dir,
                                                          output_filename_suffix=partition_label,
                                                          names=partition_names)

        fastq_subset_path = extract_fastq_reads_by_name(fastq_path=fastq_path,
                                                        output_dir=output_dir,
                                                        output_filename_suffix=partition_label,
                                                        names=partition_names)

        subset_path_pairs.append((fastq_subset_path, runnie_subset_path))

    # STEP 4
    # Verify (both runs are given the same output_path)
    for fastq_subset_path, runnie_subset_path in subset_path_pairs:
        find_intersection_of_runnie_and_fastq(output_path=output_path,
                                              fastq_path=fastq_subset_path,
                                              runnie_path=runnie_subset_path)
Exemplo n.º 15
0
def save_numpy_matrix(output_dir, filename, matrix):
    """
    Save one array to the compressed archive <output_dir>/<filename>.npz, stored under the key "a"
    :param output_dir: destination directory; FileManager.ensure_directory_exists is called when it is missing
    :param filename: archive name without the ".npz" extension
    :param matrix: array to store
    :return:
    """
    if not os.path.exists(output_dir):
        FileManager.ensure_directory_exists(output_dir)

    archive_path = os.path.join(output_dir, filename) + ".npz"

    numpy.savez_compressed(archive_path, a=matrix)
Exemplo n.º 16
0
    def __init__(self):
        """
        Build the path "output/training_<datetime_string>", ask FileManager to make sure that directory exists,
        and start the checkpoint counter at 0
        """
        # every field of the current time tuple except the last one, joined with dashes
        time_fields = datetime.datetime.now().timetuple()[:-1]
        self.datetime_string = '-'.join(str(field) for field in time_fields)
        self.subdirectory_name = "training_" + self.datetime_string

        self.output_directory_name = "output/"
        self.directory = path.join(self.output_directory_name,
                                   self.subdirectory_name)

        self.n_checkpoints = 0

        FileManager.ensure_directory_exists(self.directory)
Exemplo n.º 17
0
def process_bam(bam_path, reference_path, bac_path, output_dir=None):
    """
    Parse the alignments of every reference contig, group the parsed records by their first field,
    aggregate them with aggregate_bac_data and export the result with export_bac_data_to_csv
    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :param bac_path: fasta whose contig names are printed alongside the reference contig names
    :param output_dir: where to save stats; "stats/" when None
    :return:
    """
    if output_dir is None:
        output_dir = "stats/"

    FileManager.ensure_directory_exists(output_dir)

    ref_fasta_handler = FastaHandler(reference_path)
    bac_fasta_handler = FastaHandler(bac_path)

    contig_names = ref_fasta_handler.get_contig_names()

    print(contig_names)
    print(bac_fasta_handler.get_contig_names())

    # first field of each parsed record -> list of [contig name] + record
    records_per_bac = defaultdict(list)

    for contig_name in contig_names:
        contig_length = ref_fasta_handler.get_chr_sequence_length(contig_name)

        # the handlers are re-created for each contig
        ref_fasta_handler = FastaHandler(reference_file_path=reference_path)
        bam_handler = BamHandler(bam_file_path=bam_path)

        # fetch reads over the whole contig
        reads = bam_handler.get_reads(chromosome_name=contig_name,
                                      start=0,
                                      stop=contig_length)

        parsed_records = parse_reads(reads=reads,
                                     fasta_handler=ref_fasta_handler,
                                     chromosome_name=contig_name)

        for record in parsed_records:
            records_per_bac[record[0]].append([contig_name] + record)

    # alternative that is currently disabled: filter_supplementaries_by_largest(records_per_bac)
    aggregated_records = aggregate_bac_data(records_per_bac)

    export_bac_data_to_csv(read_data=aggregated_records,
                           output_dir=output_dir,
                           bam_path=bam_path)
Exemplo n.º 18
0
def process_bam(bam_path,
                reference_path,
                output_dir=None,
                centromere_table_path=None,
                gap_table_path=None,
                segdup_table_path=None,
                max_threads=None):
    """
    Find useful summary data from a bam that can be represented as a table of identities, and a plot of alignments.
    One get_chromosome_data job is run per reference contig in a process pool; the jobs share a managed list
    which is exported afterwards with export_genome_summary_to_csv.
    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :param output_dir: where to save plots; "plots/" when None
    :param centromere_table_path: forwarded unchanged to get_chromosome_data
    :param gap_table_path: forwarded unchanged to get_chromosome_data
    :param segdup_table_path: forwarded unchanged to get_chromosome_data
    :param max_threads: upper bound on pool size; when None, cpu count minus 2 is used (never less than 1)
    :return:
    """
    print("\n" + bam_path)

    if max_threads is None:
        max_threads = max(1, cpu_count() - 2)

    if output_dir is None:
        output_dir = "plots/"

    # managed list that every worker process receives as its last argument
    process_manager = Manager()
    genome_data = process_manager.list()

    FileManager.ensure_directory_exists(output_dir)

    fasta_handler = FastaHandler(reference_path)

    chromosome_names = fasta_handler.get_contig_names()

    arguments = [
        [
            bam_path, reference_path, chromosome_name, output_dir,
            centromere_table_path, gap_table_path, segdup_table_path,
            genome_data
        ]
        for chromosome_name in chromosome_names
    ]

    # no more workers than jobs, but Pool raises ValueError for fewer than 1 process
    # (which would otherwise happen when the reference has no contigs)
    max_threads = max(1, min(max_threads, len(arguments)))

    print("Using %d threads..." % max_threads)

    with Pool(processes=max_threads) as pool:
        pool.starmap(get_chromosome_data, arguments)

    export_genome_summary_to_csv(bam_path=bam_path,
                                 output_dir=output_dir,
                                 genome_data=genome_data)
Exemplo n.º 19
0
def process_bam(bam_path, reference_path, max_threads, output_dir=None):
    """
    Find useful summary data from a bam that can be represented as a table of identities/matches/mismatches/indels.
    One get_chromosome_stats job is run per reference contig in a process pool; the jobs share a managed list
    which is printed and then exported with export_genome_summary_to_csv.
    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :param max_threads: upper bound on pool size; when None, cpu count minus 2 is used (never less than 1)
    :param output_dir: where to save stats; "stats/" when None
    :return:
    """
    if output_dir is None:
        output_dir = "stats/"

    if max_threads is None:
        max_threads = max(1, cpu_count() - 2)

    # managed list that every worker process receives as its first argument
    process_manager = Manager()
    genome_data = process_manager.list()

    FileManager.ensure_directory_exists(output_dir)

    fasta_handler = FastaHandler(reference_path)

    chromosome_names = fasta_handler.get_contig_names()

    arguments = list()

    for chromosome_name in chromosome_names:
        chromosome_length = fasta_handler.get_chr_sequence_length(
            chromosome_name)

        # each job covers the whole contig
        start = 0
        stop = chromosome_length

        arguments.append([
            genome_data, reference_path, chromosome_name, start, stop,
            output_dir, bam_path
        ])

    if len(arguments) < max_threads:
        print("Fewer jobs than threads")
        max_threads = len(arguments)

    # Pool raises ValueError for fewer than 1 process, which would otherwise happen
    # when the reference has no contigs
    max_threads = max(1, max_threads)

    print("Using %d threads..." % max_threads)

    with Pool(processes=max_threads) as pool:
        pool.starmap(get_chromosome_stats, arguments)

    print("genome_data", genome_data)

    export_genome_summary_to_csv(bam_path=bam_path,
                                 output_dir=output_dir,
                                 genome_data=genome_data)
Exemplo n.º 20
0
def main(reads_file_path, genome_size=None, output_dir=None):
    """
    Assemble reads with assemble_wtdbg2
    :param reads_file_path: reads to assemble
    :param genome_size: forwarded to assemble_wtdbg2; "3g" is used (with a printed warning) when None
    :param output_dir: where outputs go; "./" when None, otherwise passed to FileManager.ensure_directory_exists
    :return:
    """
    if output_dir is not None:
        FileManager.ensure_directory_exists(output_dir)
    else:
        output_dir = "./"

    if genome_size is None:
        genome_size = "3g"
        print("WARNING: genome size flag not specified, defaulting to human size (3g)")

    assembly_sequence_path = assemble_wtdbg2(output_dir=output_dir,
                                             input_file_path=reads_file_path,
                                             genome_size=genome_size)
Exemplo n.º 21
0
def plot_kernels_and_column_frequencies(kernel_sums,
                                        passing_indices,
                                        column_frequencies,
                                        slice_range=None,
                                        save=False,
                                        output_dir=None,
                                        filename=None):
    """
    Draw three stacked images sharing an x axis (passing_indices, kernel_sums, column_frequencies),
    then either save the figure or show it
    :param kernel_sums: 2d array shown in the middle panel ("Convolution")
    :param passing_indices: 2d array shown in the top panel ("Thresholded")
    :param column_frequencies: 2d array shown in the bottom panel ("Frequencies")
    :param slice_range: optional (start, stop) pair; when given, only those columns of each array are shown
    :param save: if True, write "<filename>_kernels.png" into output_dir; otherwise show the figure
    :param output_dir: passed to FileManager.ensure_directory_exists, only used when save is True
    :param filename: filename prefix, only used when save is True
    :return:
    """
    if slice_range is not None:
        first_column, last_column = slice_range[0], slice_range[1]

        # column slicing keeps each array two dimensional, so no reshape is needed afterwards
        kernel_sums = kernel_sums[:, first_column:last_column]
        passing_indices = passing_indices[:, first_column:last_column]
        column_frequencies = column_frequencies[:, first_column:last_column]

    fig, axes = pyplot.subplots(nrows=3, sharex=True)
    fig.set_size_inches(16, 4)
    axes[0].imshow(passing_indices)
    axes[1].imshow(kernel_sums)
    axes[2].imshow(column_frequencies)

    axes[0].set_ylabel("Thresholded")
    axes[1].set_ylabel("Convolution")
    axes[2].set_ylabel("Frequencies")

    # hide the y tick labels and then the ticks themselves on every panel
    for axis in axes:
        axis.set_yticklabels([])
    for axis in axes:
        axis.set_yticks([])

    if save:
        FileManager.ensure_directory_exists(output_dir)
        filename = filename + "_kernels.png"
        path = os.path.join(output_dir, filename)
        pyplot.savefig(path)

    else:
        pyplot.show()

    pyplot.close()
Exemplo n.º 22
0
def main(ref_sequence_path,
         reads_sequence_path,
         max_threads=None,
         output_dir=None,
         minimap_preset="map-ont",
         k=15):
    """
    Align reads to a reference with align_minimap
    :param ref_sequence_path: reference sequence to align against
    :param reads_sequence_path: sequences to align
    :param max_threads: forwarded to align_minimap
    :param output_dir: where outputs go; "./" when None, otherwise passed to FileManager.ensure_directory_exists
    :param minimap_preset: forwarded to align_minimap as preset
    :param k: forwarded to align_minimap
    :return:
    """
    if output_dir is not None:
        FileManager.ensure_directory_exists(output_dir)
    else:
        output_dir = "./"

    alignment_options = {
        "output_dir": output_dir,
        "ref_sequence_path": ref_sequence_path,
        "reads_sequence_path": reads_sequence_path,
        "preset": minimap_preset,
        "max_threads": max_threads,
        "k": k,
    }

    reads_vs_ref_bam_path = align_minimap(**alignment_options)
Exemplo n.º 23
0
def main():
    """
    Load a base length matrix from csv and save it once per pseudocount (1, 4, 8, 16) via
    save_nondirectional_frequency_matrices_as_delimited_text with log_normalize=True
    :return:
    """
    matrix_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_shasta_human_chr1_GM24143/frequency_matrices_genomic_2019_4_23_21_38_57_361118.csv"
    output_dir = "output/runlength_matrix_shasta_human_chr1_GM24143/pseudocounts/"
    output_filename_prefix = "probability_matrices_GM24143_chr1_shasta_pseudocount_"

    FileManager.ensure_directory_exists(output_dir)

    matrix = load_base_length_matrix_from_csv(path=matrix_path, max_runlength=50)

    # one output file per pseudocount, with the pseudocount in the filename
    for pseudocount in (1, 4, 8, 16):
        save_nondirectional_frequency_matrices_as_delimited_text(
            output_dir=output_dir,
            frequency_matrices=matrix,
            chromosome_name="genomic",
            log_normalize=True,
            filename=output_filename_prefix + str(pseudocount) + ".csv",
            pseudocount=pseudocount,
            plot=False)
Exemplo n.º 24
0
def write_joint_distribution_to_file(distribution, output_dir):
    """
    Write a mapping to <output_dir>/joint_distribution_<datetime>.tsv, one tab separated row per key in
    sorted key order. Each key is indexed as a pair of pairs; a row holds the four inner items followed
    by the value stored for that key.
    :param distribution: mapping from ((a, b), (c, d)) style keys to values
    :param output_dir: destination directory, passed to FileManager.ensure_directory_exists first
    :return: path of the written file
    """
    FileManager.ensure_directory_exists(output_dir)

    filename = "joint_distribution" + "_" + FileManager.get_datetime_string() + ".tsv"
    path = os.path.join(output_dir, filename)

    with open(path, 'w') as file:
        writer = csv.writer(file, delimiter="\t")

        for key in sorted(distribution):
            first, second = key[0], key[1]
            writer.writerow([first[0], first[1], second[0], second[1], distribution[key]])

    return path
Exemplo n.º 25
0
def test_window(bam_file_path,
                reference_file_path,
                chromosome_name,
                window,
                output_dir,
                save_data=True,
                print_results=False):
    """
    Run the pileup generator for a single specified window
    :param bam_file_path: bam opened with BamHandler
    :param reference_file_path: reference opened with FastaHandler
    :param chromosome_name: forwarded to get_aligned_segments
    :param window: indexable whose items 0 and 1 are used as pileup start and end
    :param output_dir: directory receiving "test_<pileup_start>.fasta" when save_data is True
    :param save_data: if True, write the aligned sequences with FastaWriter
    :param print_results: if True, call print_segments on the reference and the sequences
    :return:
    """
    bam_handler = BamHandler(bam_file_path)
    fasta_handler = FastaHandler(reference_file_path)

    pileup_start, pileup_end = window[0], window[1]  # add random variation here ?

    ref_sequence, _read_ids, sequences = get_aligned_segments(
        fasta_handler=fasta_handler,
        bam_handler=bam_handler,
        chromosome_name=chromosome_name,
        pileup_start=pileup_start,
        pileup_end=pileup_end)

    if print_results:
        print_segments(ref_sequence, sequences)

    if not save_data:
        return

    output_path = os.path.join(output_dir, "test_" + str(pileup_start) + ".fasta")

    # FileManager is only consulted when the directory is missing
    if not os.path.exists(output_dir):
        FileManager.ensure_directory_exists(output_dir)

    FastaWriter(output_path).write_sequences(sequences)
Exemplo n.º 26
0
def main():
    """
    Align run-length reads to a run-length-encoded reference and plot the pileup for one window.

    All inputs are hard-coded below. Reads are loaded with RunlengthHandler, the reference is encoded with
    runlength_encode_fasta, both are aligned with align_as_RLE, and the segments overlapping
    [pileup_start, pileup_end] on the first contig of the reference are printed and handed to
    plot_runlength_pileup together with their scale, shape and mode encodings.

    :return: None
    """
    # ref_fasta_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_ref.fasta"
    # runlength_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_runnie.out"

    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    runlength_path = "/home/ryan/code/runlength_analysis/data/runnie_subset_test_flipflop_regional_0to10k.out"

    pileup_start = 6000
    pileup_end = 6050

    output_dir = os.path.join("output/", "runlength_pileup_test_" + FileManager.get_datetime_string())
    FileManager.ensure_directory_exists(output_dir)

    # Output names reuse the input file names with the last extension replaced by "_rle.fasta"
    ref_fasta_filename_prefix = os.path.splitext(os.path.basename(ref_fasta_path))[0]
    runlength_ref_fasta_path = os.path.join(output_dir, ref_fasta_filename_prefix + "_rle.fasta")

    assembly_fasta_filename_prefix = os.path.splitext(os.path.basename(runlength_path))[0]
    runlength_assembly_fasta_path = os.path.join(output_dir, assembly_fasta_filename_prefix + "_rle.fasta")

    handler = RunlengthHandler(runlength_path)

    # Index every read by its id so that aligned segments can be mapped back to their run-length data
    read_data = {read.id: read
                 for read in handler.iterate_file(sequence_cutoff=sys.maxsize, print_status=True)}

    print("\nRLE encoding reference sequence...")

    runlength_ref_sequences = runlength_encode_fasta(fasta_sequence_path=ref_fasta_path)

    assembly_vs_ref_bam_path = align_as_RLE(runlength_reference_path=runlength_ref_fasta_path,
                                            runlength_ref_sequences=runlength_ref_sequences,
                                            runlength_read_path=runlength_assembly_fasta_path,
                                            runlength_read_sequences=read_data,
                                            output_dir=output_dir)

    bam_handler = BamHandler(assembly_vs_ref_bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    # Only the first contig of the encoded reference is inspected
    chromosome_name = fasta_handler.get_contig_names()[0]

    aligned_ref_sequence, aligned_ref_lengths, aligned_sequences, aligned_scales, aligned_shapes, reversal_statuses = \
        get_aligned_segments(fasta_handler=fasta_handler,
                             bam_handler=bam_handler,
                             chromosome_name=chromosome_name,
                             pileup_start=pileup_start,
                             pileup_end=pileup_end,
                             runlength_ref_sequences=runlength_ref_sequences,
                             read_data=read_data)

    sequence_encoding = list()
    scale_encoding = list()
    shape_encoding = list()
    modes_encoding = list()

    print(len(aligned_sequences))

    print("REF\t", "".join(aligned_ref_sequence))
    for read_id, sequence in aligned_sequences.items():
        print("READ\t%s\t%s" % (read_id, "".join(sequence)))

        scales = aligned_scales[read_id]
        shapes = aligned_shapes[read_id]

        sequence_encoding.append(list(map(get_encoding, sequence)))
        scale_encoding.append(scales)
        shape_encoding.append(shapes)
        modes_encoding.append(list(map(map_parameters_to_mode, zip(scales, shapes))))

    # The builtin float replaces the numpy.float alias, which was removed from NumPy in 1.24 and meant the same
    sequence_encoding = -numpy.array(sequence_encoding, dtype=float)
    scale_encoding = numpy.array(scale_encoding, dtype=float)
    shape_encoding = numpy.array(shape_encoding, dtype=float)
    modes_encoding = numpy.array(modes_encoding, dtype=float)

    plot_runlength_pileup(sequences=sequence_encoding,
                          scales=scale_encoding,
                          shapes=shape_encoding,
                          modes=modes_encoding)
Exemplo n.º 27
0
def main(reference_file_path):
    """
    Plot and tabulate run-length counts for every contig of a reference FASTA.

    For each contig, count_runlength_per_character is applied to the full sequence and one histogram per
    returned key is saved to "ref_runlength_distribution_<contig>.png". Counts for A/T and for C/G are pooled
    across contigs and saved to "ref_runlength_distribution_genomic.png". The per-key totals are finally passed to
    print_all_counts_as_shasta_matrix and print_all_counts.

    :param reference_file_path: path of the FASTA file to analyse; the part of its base name before the first
                                "." names the output directory under "output/ref_run_lengths/"
    :return: None
    """
    input_prefix_name = os.path.basename(reference_file_path).split(".")[0]
    output_dir = os.path.join("output/ref_run_lengths/", input_prefix_name)
    filename_prefix = "ref_runlength_distribution"

    FileManager.ensure_directory_exists(output_dir)

    fasta_handler = FastaHandler(reference_file_path)
    contig_names = fasta_handler.get_contig_names()

    print(contig_names)
    print(sorted([(x, fasta_handler.get_chr_sequence_length(x)) for x in contig_names], key=lambda x: x[1]))

    all_counts = defaultdict(Counter)
    raw_counts_AT = list()
    raw_counts_GC = list()

    sys.stderr.write("reading fasta file...\n")
    sys.stderr.flush()

    max_count = 100
    step = 1

    for chromosome_name in contig_names:
        sys.stderr.write("Parsing chromosome %s\n" % chromosome_name)
        sys.stderr.flush()

        chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

        reference_sequence = fasta_handler.get_sequence(chromosome_name=chromosome_name,
                                                        start=0,
                                                        stop=chromosome_length)
        character_counts = count_runlength_per_character(reference_sequence)

        if len(character_counts) == 0:
            # pyplot.subplots rejects nrows=0, and there is nothing to draw for this contig anyway
            sys.stderr.write("WARNING: no run lengths found for %s, skipping plot\n" % chromosome_name)
            sys.stderr.flush()
            continue

        # squeeze=False always returns a 2D array of axes, so indexing by row also works when there is only
        # one key (the default would return a bare Axes object in that case)
        figure, axes_grid = pyplot.subplots(nrows=len(character_counts), sharex=True, squeeze=False)
        axes = axes_grid[:, 0]
        figure.set_size_inches(6, 12)

        for k, (key, counts) in enumerate(character_counts.items()):
            all_counts[key] += Counter(counts)

            if key in {"C", "G"}:
                raw_counts_GC += counts
            elif key in {"A", "T"}:
                raw_counts_AT += counts

            plot_counts_as_histogram(axes=axes[k], counts=counts, max_count=max_count, step=step)

            axes[k].set_ylabel(str(key))
            axes[k].set_ylim([-0.5, 10])

        axes[0].set_title(chromosome_name)

        file_path = os.path.join(output_dir, filename_prefix + "_" + chromosome_name + ".png")
        figure.savefig(file_path)
        pyplot.close(figure)

    # Genome-wide summary: A/T counts on the top axes, C/G counts on the bottom axes
    figure, axes = pyplot.subplots(nrows=2)

    file_path = os.path.join(output_dir, filename_prefix + "_genomic.png")

    plot_counts_as_histogram(axes=axes[0], counts=raw_counts_AT, max_count=max_count, step=step)
    plot_counts_as_histogram(axes=axes[1], counts=raw_counts_GC, max_count=max_count, step=step)
    axes[0].set_ylabel("AT Log10 Frequency")
    axes[1].set_ylabel("GC Log10 Frequency")

    figure.savefig(file_path)
    pyplot.close(figure)

    print_all_counts_as_shasta_matrix(all_counts, max_count=50)
    print_all_counts(all_counts, output_dir)
def generate_ngx_plot(assembly_contigs,
                      input_dir,
                      genome_size=None,
                      y_max=180,
                      title="NGx",
                      figure=None,
                      axes=None):
    """
    Draw one NGx step curve per assembly and save the figure as PNG and PDF under "output/".

    Each assembly is styled by the last entry of the built-in sample list whose name occurs (case-insensitively)
    in the assembly path; unmatched assemblies fall back to the first sample's style and are labelled with the
    part of their base name before the first ".".

    :param assembly_contigs: mapping of assembly path -> iterable of contig records, each indexed with the
                             module-level LENGTH constant; curves are drawn in sorted path order and in the
                             contig order given
    :param input_dir: its last path component is used as the prefix of the output file name
    :param genome_size: divisor that converts contig length to x-axis coverage; defaults to 3.23 * 1000**3
    :param y_max: only defaulted and reported here; the axis-limit code that consumed it is disabled
    :param title: axes title
    :param figure: figure to draw on and save; created when omitted (taken from `axes` if only that is given)
    :param axes: axes to draw on; created when omitted (taken from `figure` if only that is given)
    :return: None
    """
    # Parallel lists: entry i of colors/alphas/zorders styles samples[i]. Swap all four together to restyle.
    samples = [
        "03492", "03098", "02723", "02080", "02055", "01243", "01109", "00733",
        "24385", "24149", "24143", "CHM13", "hg38_no_alts"
    ]

    colors = [
        (175 / 256.0, 48 / 256.0, 51 / 256.0),  # red
        (224 / 256.0, 99 / 256.0, 58 / 256.0),  # orange
        (215 / 256.0, 219 / 256.0, 84 / 256.0),  # yellow
        (110 / 256.0, 170 / 256.0, 100 / 256.0),  # light green
        (80 / 256.0, 180 / 256.0, 150 / 256.0),  # green
        (100 / 256.0, 189 / 256.0, 197 / 256.0),  # green-blue
        (0 / 256.0, 170 / 256.0, 231 / 256.0),  # turquoise
        (51 / 256.0, 87 / 256.0, 182 / 256.0),  # blue
        (37 / 256.0, 36 / 256.0, 93 / 256.0),  # indigo
        (95 / 256.0, 51 / 256.0, 139 / 256.0),  # purple
        (200 / 256.0, 53 / 256.0, 93 / 256.0),  # pink
        (224 / 256.0, 99 / 256.0, 58 / 256.0),
        (110 / 256.0, 170 / 256.0, 100 / 256.0)
    ]

    alphas = [0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 1.0, 0.3, 0.3, 0.3, 1.0, 1.0]
    zorders = [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1]

    # Optional sample name -> legend label overrides
    labels = {}

    if genome_size is None:
        print("WARNING: genome_size unspecified, using human as default")
        genome_size = 3.23 * 1000**3

    if y_max is None:
        print("WARNING: y_max unspecified, using 180Mbp as default")
        y_max = 180

    # Accept any combination of figure/axes; previously passing only one of them left the other as None
    if figure is None and axes is None:
        figure = pyplot.figure()
        axes = pyplot.axes()
    elif axes is None:
        axes = figure.gca()
    elif figure is None:
        figure = axes.figure

    dashes = [1, 0, 1, 0]

    legend_names = list()
    legend_handles = list()
    for path, contigs in sorted(assembly_contigs.items(), key=lambda x: x[0]):
        print("Plotting assembly: %s" % path)

        lowercase_path = path.lower()

        # No early exit: when several sample names occur in the path, the last one in the list wins
        sample_index = None
        for index, name in enumerate(samples):
            if name.lower() in lowercase_path:
                sample_index = index

        if sample_index is None:
            print("ERROR: color not found for %s" % path)
            sample_index = 0
            sample_name = os.path.basename(path).split(".")[0]
        else:
            sample_name = samples[sample_index]

        label = labels.get(sample_name, sample_name)

        if "hifi" in lowercase_path:
            label = "Canu CCS"

        # NOTE(review): unlike the "hifi" test, this match is case-sensitive; kept as in the original
        if "shasta" in path:
            label = "Shasta Nanopore"

        x_coords, y_coords = _get_ngx_step_coordinates(contigs=contigs, genome_size=genome_size)

        line, = axes.plot(x_coords,
                          y_coords,
                          color=colors[sample_index],
                          alpha=alphas[sample_index],
                          zorder=zorders[sample_index],
                          dashes=dashes,
                          linewidth=0.6)

        # Remember the first line drawn for each label so that the legend entry points at that line even when
        # several assemblies share a label or the axes already contained other artists
        if label not in legend_names:
            legend_names.append(label)
            legend_handles.append(line)

    axes.legend(legend_handles, legend_names)

    axes.axvline(0.5, linestyle="--", alpha=0.3, linewidth=0.7, zorder=-1)

    # NOTE(review): the y label says Mbp, but the tick rescaling below is disabled, so ticks show the raw
    # contig[LENGTH] values - confirm which unit is intended
    #
    # max_size = y_max
    #
    # step_size = 20
    # if step_size >= y_max:
    #     step_size = 1
    #
    # scale = 1_000_000
    #
    # axes.set_xlim([0,1])
    # axes.set_ylim([0,max_size*scale])
    # axes.set_yticks(numpy.arange(0,max_size+step_size,step_size)*scale)
    # axes.set_yticklabels(numpy.arange(0,max_size+step_size,step_size))

    axes.set_title(title)
    axes.set_ylabel("Contig/scaffold size (Mbp)")
    axes.set_xlabel("Cumulative coverage")

    FileManager.ensure_directory_exists("output")

    output_dir = "output/"
    filename = input_dir.rstrip("/").split("/")[-1] + "_" + FileManager.get_datetime_string()
    file_path = os.path.abspath(os.path.join(output_dir, filename))

    print("SAVING FIGURE: %s" % file_path)
    figure.savefig(file_path + ".png", dpi=300)
    figure.savefig(file_path + ".pdf", dpi=300)

    pyplot.close(figure)


def _get_ngx_step_coordinates(contigs, genome_size):
    """
    Build the x/y vertices of the step curve for one assembly.

    Every contig contributes a horizontal segment at height contig[LENGTH] whose width is
    contig[LENGTH] / genome_size; consecutive segments are joined by a vertical edge, and the curve is dropped
    to 0 after the last contig. Empty input yields two empty lists.

    :param contigs: iterable of contig records indexed with the module-level LENGTH constant
    :param genome_size: divisor that converts contig length to x-axis width
    :return: (x_coords, y_coords) lists of equal length
    """
    x_coords = list()
    y_coords = list()

    x1 = 0
    y_prev = None

    for contig in contigs:
        y = contig[LENGTH]
        x2 = x1 + contig[LENGTH] / genome_size

        if y_prev is not None:
            # Vertical edge from the previous contig's height to this one's
            x_coords.extend([x1, x1])
            y_coords.extend([y_prev, y])

        x_coords.extend([x1, x2])
        y_coords.extend([y, y])

        x1 = x2
        y_prev = y

    # Close the curve on the x axis; the emptiness check avoids an IndexError for assemblies without contigs
    if y_coords and y_coords[-1] != 0:
        y_coords.append(0)
        x_coords.append(x_coords[-1])

    return x_coords, y_coords
Exemplo n.º 29
0
def _save_confusion_plot(confusion, plot_path):
    """
    Plot log10 of a confusion matrix with "Predicted"/"True" axis labels, save it, then show it.

    :param confusion: matrix passed to numpy.log10 and pyplot.imshow
    :param plot_path: destination path of the saved figure
    :return: None
    """
    figure = pyplot.figure()
    axes = pyplot.axes()
    axes.set_xlabel("Predicted")
    axes.set_ylabel("True")

    pyplot.imshow(numpy.log10(confusion))

    # Save before showing so that the file does not depend on the state of the figure after its window closes
    figure.savefig(plot_path)
    pyplot.show()

    pyplot.close()


def main():
    """
    Compare two run-length consensus modes against the reference over the first 10 windows of the first contig.

    All inputs are hard-coded below. Reference and reads are encoded with runlength_encode_fasta and aligned with
    align_as_RLE. For each 1000-long window, get_consensus_from_runlength_pileup_encoding is called once with its
    default settings and once with bayesian=False; both results are accumulated into confusion matrices against
    the aligned reference lengths. The accuracies are printed and both matrices are saved as "confusion.png" and
    "modal_confusion.png" in the output directory.

    :return: None
    """
    # ref_fasta_path = "/home/ryan/code/runlength_analysis/data/synthetic_runlength_test_2019_3_25_13_8_0_341509_ref.fasta"
    # read_fasta_path = "/home/ryan/code/runlength_analysis/data/synthetic_runlength_test_2019_3_25_13_8_0_341509_reads.fasta"
    # matrix_path = "/home/ryan/code/runnie_parser/output/runlength_matrix_from_assembly_contigs_2019_3_19_13_29_14_657613/probability_matrices_2019_3_19_13_29_19_362916.csv"

    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    read_fasta_path = "/home/ryan/code/runlength_analysis/data/sequence_subset_ecoli_guppy-runnie_60x_test.fastq"
    matrix_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_from_sequence_2019_4_5_15_29_28_403950/probability_matrices_2019_4_5_15_35_57_920301.csv"

    output_dir = os.path.join("output/", "runlength_matrix_from_sequence_" + FileManager.get_datetime_string())
    FileManager.ensure_directory_exists(output_dir)

    # Output names reuse the input file names with the last extension replaced by "_rle.fasta"
    ref_fasta_filename_prefix = os.path.splitext(os.path.basename(ref_fasta_path))[0]
    runlength_ref_fasta_path = os.path.join(output_dir, ref_fasta_filename_prefix + "_rle.fasta")

    read_fasta_filename_prefix = os.path.splitext(os.path.basename(read_fasta_path))[0]
    runlength_read_fasta_path = os.path.join(output_dir, read_fasta_filename_prefix + "_rle.fasta")

    runlength_ref_sequences = runlength_encode_fasta(fasta_sequence_path=ref_fasta_path)
    runlength_read_sequences = runlength_encode_fasta(fasta_sequence_path=read_fasta_path)

    read_vs_ref_bam_path = align_as_RLE(runlength_reference_path=runlength_ref_fasta_path,
                                        runlength_ref_sequences=runlength_ref_sequences,
                                        runlength_read_path=runlength_read_fasta_path,
                                        runlength_read_sequences=runlength_read_sequences,
                                        output_dir=output_dir)

    bam_handler = BamHandler(read_vs_ref_bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    # Only the first contig of the encoded reference is evaluated
    chromosome_name = fasta_handler.get_contig_names()[0]
    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    windows = chunk_chromosome_coordinates(chromosome_length=chromosome_length, chunk_size=1000)

    # Initialize empty confusion matrices
    total_confusion = get_runlength_confusion([], [], 10)
    total_modal_confusion = get_runlength_confusion([], [], 10)

    length_classifier = RunlengthClassifier(matrix_path)

    print("reading BAM")
    for pileup_start, pileup_end in windows[:10]:
        print("window", pileup_start, pileup_end)

        sys.stderr.write("\r%s" % pileup_start)
        aligned_ref_sequence, aligned_ref_lengths, aligned_sequences, aligned_lengths, reversal_statuses = \
            get_aligned_segments(fasta_handler=fasta_handler,
                                 bam_handler=bam_handler,
                                 chromosome_name=chromosome_name,
                                 pileup_start=pileup_start,
                                 pileup_end=pileup_end,
                                 runlength_ref_sequences=runlength_ref_sequences,
                                 read_data=runlength_read_sequences)

        # No reads here?
        if len(aligned_sequences) == 0:
            continue

        read_ids = list(aligned_sequences.keys())

        sequence_encoding = [list(map(get_encoding, aligned_sequences[read_id])) for read_id in read_ids]
        length_encoding = [aligned_lengths[read_id] for read_id in read_ids]
        reversal_encoding = [reversal_statuses[read_id] for read_id in read_ids]

        # The builtin int/float/bool replace the numpy.int/numpy.float/numpy.bool aliases, which were removed
        # from NumPy in 1.24 and meant exactly the same types
        ref_sequence_encoding = numpy.array([list(map(get_encoding, aligned_ref_sequence))], dtype=int)
        ref_length_encoding = numpy.array([aligned_ref_lengths], dtype=int)
        sequence_encoding = numpy.array(sequence_encoding, dtype=int)
        length_encoding = numpy.array(length_encoding, dtype=float)
        reversal_encoding = numpy.array(reversal_encoding, dtype=bool)

        ref_sequence_encoding = numpy.atleast_2d(ref_sequence_encoding)
        ref_length_encoding = numpy.atleast_2d(ref_length_encoding)
        sequence_encoding = numpy.atleast_2d(sequence_encoding)
        length_encoding = numpy.atleast_2d(length_encoding)

        # Optional debugging view of the window; the two ref_* encodings above are only needed for it
        # plot_runlength_pileup(sequences=-sequence_encoding,
        #                       lengths=length_encoding,
        #                       ref_sequence=-ref_sequence_encoding,
        #                       ref_lengths=ref_length_encoding)

        consensus_sequence, consensus_lengths = \
            get_consensus_from_runlength_pileup_encoding(length_classifier=length_classifier,
                                                         sequence_encoding=sequence_encoding,
                                                         length_encoding=length_encoding,
                                                         reversal_encoding=reversal_encoding)

        modal_consensus_sequence, modal_consensus_lengths = \
            get_consensus_from_runlength_pileup_encoding(length_classifier=length_classifier,
                                                         sequence_encoding=sequence_encoding,
                                                         length_encoding=length_encoding,
                                                         reversal_encoding=reversal_encoding,
                                                         bayesian=False)

        print()
        print("PREDICTED\t", consensus_lengths[:10])
        print("TRUE\t\t", aligned_ref_lengths[:10])

        total_confusion += get_runlength_confusion(true_lengths=aligned_ref_lengths,
                                                   predicted_lengths=consensus_lengths,
                                                   max_length=10)

        total_modal_confusion += get_runlength_confusion(true_lengths=aligned_ref_lengths,
                                                         predicted_lengths=modal_consensus_lengths,
                                                         max_length=10)

    print()

    print("Bayes:", get_accuracy_from_confusion_matrix(total_confusion))
    print("No Bayes", get_accuracy_from_confusion_matrix(total_modal_confusion))

    _save_confusion_plot(confusion=total_confusion,
                         plot_path=os.path.join(output_dir, "confusion.png"))

    _save_confusion_plot(confusion=total_modal_confusion,
                         plot_path=os.path.join(output_dir, "modal_confusion.png"))
Exemplo n.º 30
0
def main():
    """
    Encode, align and print the read segments that overlap one fixed window of the reference.

    All inputs are hard-coded below. The reference and read files are passed to runlength_encode_fasta, the
    results are aligned with align_as_RLE, and get_read_segments is queried for positions 100000 to 100100 of the
    first contig. For every returned key, the key and the first 10 items of its sequence and lengths are printed.
    """
    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    read_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/guppy/subsampled/11-29/r94_ec_rad2.30x-30kb.fasta"

    # read_fasta_path = "/home/ryan/data/Nanopore/ecoli/runnie/v2/rad2_pass_runnie_0_1_10_11_12_13_v2.fa"
    # read_fasta_path = "/home/ryan/software/shasta/output/run_2019_3_23_14_29_ecoli_wg_guppy_NO_BAYES/Assembly.fasta"
    # read_fasta_path = "/home/ryan/software/shasta/output/run_2019_3_23_15_40_ecoli_wg_guppy_BAYES/Assembly.fasta"
    # read_fasta_path = "/home/ryan/data/Nanopore/ecoli/runnie/rad2_pass_runnie_0_v2.fa"

    # ---- TEST DATA ----
    # ref_fasta_path = "/home/ryan/code/runlength_analysis/data/synthetic_runlength_test_2019_3_25_13_14_17_762846_ref.fasta"
    # read_fasta_path = "/home/ryan/code/runlength_analysis/data/synthetic_runlength_test_2019_3_25_13_14_17_762846_reads.fasta"
    # -------------------

    # Each run writes into its own timestamped directory under "output/"
    output_parent_dir = "output/"
    output_dir = "runlength_matrix_from_sequence_" + FileManager.get_datetime_string(
    )
    output_dir = os.path.join(output_parent_dir, output_dir)
    FileManager.ensure_directory_exists(output_dir)

    # Output names reuse the input file names with the last extension replaced by "_rle.fasta"
    ref_fasta_filename_prefix = ".".join(
        os.path.basename(ref_fasta_path).split(".")[:-1])
    runlength_ref_fasta_filename = ref_fasta_filename_prefix + "_rle.fasta"
    runlength_ref_fasta_path = os.path.join(output_dir,
                                            runlength_ref_fasta_filename)

    read_fasta_filename_prefix = ".".join(
        os.path.basename(read_fasta_path).split(".")[:-1])
    runlength_read_fasta_filename = read_fasta_filename_prefix + "_rle.fasta"
    runlength_read_fasta_path = os.path.join(output_dir,
                                             runlength_read_fasta_filename)

    sys.stderr.write("RL encoding fasta...\n")

    runlength_ref_sequences = runlength_encode_fasta(
        fasta_sequence_path=ref_fasta_path)
    runlength_read_sequences = runlength_encode_fasta(
        fasta_sequence_path=read_fasta_path)

    sys.stderr.write("Aligning RLE fasta...\n")

    # The returned path is opened with BamHandler below; the *_rle.fasta paths are presumably written by
    # align_as_RLE, since the reference one is opened with FastaHandler afterwards - TODO confirm
    read_vs_ref_bam_path = align_as_RLE(
        runlength_reference_path=runlength_ref_fasta_path,
        runlength_ref_sequences=runlength_ref_sequences,
        runlength_read_path=runlength_read_fasta_path,
        runlength_read_sequences=runlength_read_sequences,
        output_dir=output_dir)

    bam_handler = BamHandler(read_vs_ref_bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    # Only the first contig of the encoded reference is inspected
    contig_names = fasta_handler.get_contig_names()
    chromosome_name = contig_names[0]
    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    print(chromosome_length)

    # Fixed window starting at position 100000 and ending 100 positions later
    sequences, lengths = get_read_segments(
        fasta_handler=fasta_handler,
        bam_handler=bam_handler,
        chromosome_name=chromosome_name,
        pileup_start=100000,
        pileup_end=100000 + 100,
        runlength_ref_sequences=runlength_ref_sequences,
        read_data=runlength_read_sequences)

    # Print each key with the first 10 items of its sequence and lengths (k is unused)
    for k, key in enumerate(sequences):
        print(key)
        print(sequences[key][:10])
        print(lengths[key][:10])