Exemplo n.º 1
0
def generate_runlength_frequency_matrix(runlength_ref_sequence_path, assembly_vs_ref_bam_path,
                                        runlength_ref_sequences, runlength_read_data):
    """
    Yield one true-vs-observed run length frequency matrix per reference chromosome.

    For every chromosome name in runlength_ref_sequences, the alignments spanning the whole chromosome are fetched
    from the BAM and passed to parse_reads() together with a zero-initialised float64 matrix of shape
    [2, 4, MAX_RUNLENGTH+1, MAX_RUNLENGTH+1]. parse_reads() is presumably what fills the matrix in place (its
    implementation is not visible from here).

    :param runlength_ref_sequence_path: path given to FastaHandler (the run length encoded reference)
    :param assembly_vs_ref_bam_path: path given to BamHandler (the alignment to read from)
    :param runlength_ref_sequences: container keyed by chromosome name; each entry is indexed with LENGTHS
    :param runlength_read_data: forwarded unchanged to parse_reads()
    :return: generator of (chromosome_name, matrix) tuples; a chromosome for which parse_reads() reports no reads
             is skipped and a message is written to stderr instead
    """
    for chromosome_name in runlength_ref_sequences:
        frequency_matrix = numpy.zeros([2, 4, MAX_RUNLENGTH+1, MAX_RUNLENGTH+1], dtype=numpy.float64)

        # Fresh handlers are opened for every chromosome
        bam_handler = BamHandler(assembly_vs_ref_bam_path)
        fasta_handler = FastaHandler(runlength_ref_sequence_path)

        # Fetch everything between position 0 and the end of the chromosome
        chromosome_end = fasta_handler.get_chr_sequence_length(chromosome_name)
        reads = bam_handler.get_reads(chromosome_name=chromosome_name, start=0, stop=chromosome_end)

        ref_runlengths = runlength_ref_sequences[chromosome_name][LENGTHS]
        n_reads = parse_reads(chromosome_name=chromosome_name,
                              fasta_handler=fasta_handler,
                              reads=reads,
                              complete_ref_runlengths=ref_runlengths,
                              runlength_read_data=runlength_read_data,
                              matrix=frequency_matrix)

        if not n_reads > 0:
            sys.stderr.write("No reads found for chromosome: %s\n" % chromosome_name)
            continue

        yield (chromosome_name, frequency_matrix)
def get_aligned_contig_lengths(bam_path, aligned_assembly_contigs):
    """
    Record the reference span of every confidently mapped, non-secondary alignment in a BAM.

    Alignments with mapping quality of 5 or less, and secondary alignments, are ignored. For each remaining
    alignment the span is the stop position reported by get_read_stop_position() minus read.reference_start.
    Each (read_id, span) pair is also printed to stdout as it is found.

    :param bam_path: path of the BAM to scan; also used as the key under which results are stored
    :param aligned_assembly_contigs: dict-like object that receives, under bam_path, a list of
                                     [read_id, ref_length] entries sorted by descending x[LENGTH]
    :return: None (the result is written into aligned_assembly_contigs)
    """
    bam_handler = BamHandler(bam_file_path=bam_path)

    # No region is specified: chromosome_name/start/stop are all None
    reads = bam_handler.get_reads(chromosome_name=None, start=None, stop=None)

    aligned_lengths = []

    for read in reads:
        # Skip low confidence and secondary alignments
        if read.mapping_quality <= 5 or read.is_secondary:
            continue

        read_id = read.query_name
        ref_alignment_start = read.reference_start
        ref_alignment_stop = get_read_stop_position(read)
        ref_length = ref_alignment_stop - ref_alignment_start

        aligned_lengths.append([read_id, ref_length])

        print(read_id, ref_length)

    # Longest alignments first
    aligned_lengths.sort(key=lambda x: x[LENGTH], reverse=True)

    aligned_assembly_contigs[bam_path] = aligned_lengths
Exemplo n.º 3
0
def process_bam(bam_path, reference_path):
    """
    Find useful summary data from a bam that can be represented as a table of identities, and a plot of alignments.

    Per-alignment statistics returned by parse_reads() are printed to stdout, an alignment-length-weighted total
    identity is computed, and plot_contigs() is called with output directory "plots/". Only the reference contig
    named "gi" is processed.

    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :return: None
    """
    print("\n" + bam_path + "\n")

    output_dir = "plots/"
    FileManager.ensure_directory_exists(output_dir)

    bam_handler = BamHandler(bam_file_path=bam_path)
    fasta_handler = FastaHandler(reference_path)

    chromosome_names = ["gi"]

    for chromosome_name in chromosome_names:
        chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

        start = 0
        stop = chromosome_length

        reads = bam_handler.get_reads(chromosome_name=chromosome_name, start=start, stop=stop)

        read_data = parse_reads(reads=reads, fasta_handler=fasta_handler, chromosome_name=chromosome_name)

        print("chromosome_name:\t", chromosome_name)
        print("chromosome_length:\t", chromosome_length)
        for data in read_data:
            # Each record is expected to carry exactly these 11 fields, in this order
            read_id, reversal_status, ref_alignment_start, alignment_length, read_length, contig_length, \
                n_initial_clipped_bases, n_total_mismatches, n_total_deletes, n_total_inserts, identity = data
            print()
            print(read_id)
            print("reversed:\t", reversal_status)
            print("alignment_start:\t", ref_alignment_start)
            print("alignment_length:\t", alignment_length)
            print("n_initial_clipped_bases:", n_initial_clipped_bases)
            print("n_total_mismatches:\t", n_total_mismatches)
            print("n_total_deletes:\t", n_total_deletes)
            print("n_total_inserts:\t", n_total_inserts)
            print("identity:\t", identity)

        total_weighted_identity = sum([x[ALIGNMENT_LENGTH] * x[IDENTITY] for x in read_data])
        total_alignment_bases = sum([x[ALIGNMENT_LENGTH] for x in read_data])

        # Approximate 0 when nothing was aligned instead of raising ZeroDivisionError
        total_identity = total_weighted_identity / max(1e-9, total_alignment_bases)

        print("\nTOTAL IDENTITY:\t", total_identity)

        plot_contigs(output_dir=output_dir,
                     read_data=read_data,
                     chromosome_name=chromosome_name,
                     chromosome_length=chromosome_length,
                     total_identity=total_identity,
                     bam_path=bam_path,
                     y_min=-1,
                     y_max=4,
                     show=False)
Exemplo n.º 4
0
def generate_runlength_frequency_matrix(runlength_ref_sequence_path,
                                        read_vs_ref_bam_path,
                                        runlength_ref_sequences,
                                        runlength_read_sequences):
    """
    Yield one true-vs-observed run length frequency matrix per reference chromosome.

    For each key of runlength_ref_sequences, the reads aligned to that chromosome are fetched from the BAM (no
    start/stop bound is given) and passed to parse_reads() with a zero-initialised float64 matrix of shape
    [2, 4, MAX_RUNLENGTH + 1, MAX_RUNLENGTH + 1]. parse_reads() is presumably what fills the matrix in place.

    :param runlength_ref_sequence_path: path given to FastaHandler (the run length encoded reference)
    :param read_vs_ref_bam_path: path given to BamHandler (the alignment to read from)
    :param runlength_ref_sequences: mapping keyed by chromosome name; each entry is indexed with LENGTHS
    :param runlength_read_sequences: forwarded unchanged to parse_reads()
    :return: generator of (chromosome_name, matrix) tuples; a chromosome for which parse_reads() reports no reads
             is skipped and a message is written to stderr instead
    """
    for chromosome_name in runlength_ref_sequences.keys():
        # To restrict a run to specific chromosomes (e.g. only "chrX"), filter on the name and `continue` here.

        frequency_matrix = numpy.zeros([2, 4, MAX_RUNLENGTH + 1, MAX_RUNLENGTH + 1],
                                       dtype=numpy.float64)

        # Fresh handlers are opened for every chromosome
        bam_handler = BamHandler(read_vs_ref_bam_path)
        fasta_handler = FastaHandler(runlength_ref_sequence_path)

        reads = bam_handler.get_reads(chromosome_name=chromosome_name,
                                      start=None,
                                      stop=None)

        ref_runlengths = runlength_ref_sequences[chromosome_name][LENGTHS]
        n_reads = parse_reads(chromosome_name=chromosome_name,
                              fasta_handler=fasta_handler,
                              reads=reads,
                              complete_ref_runlengths=ref_runlengths,
                              runlength_read_sequences=runlength_read_sequences,
                              matrix=frequency_matrix)

        if not n_reads > 0:
            sys.stderr.write("No reads found for chromosome: %s\n" %
                             chromosome_name)
            continue

        yield (chromosome_name, frequency_matrix)
Exemplo n.º 5
0
def process_bam(bam_path, reference_path, output_dir=None):
    """
    Export the inserts, deletes and mismatches found in a BAM, one reference contig at a time.

    A timestamped subdirectory "variants_<datetime>" is created below output_dir and
    export_variants_to_csv() is called once per reference contig with the variants returned by parse_reads().

    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :param output_dir: parent directory for the timestamped output folder; "variants/" when None
    :return: None
    """
    print("\n" + bam_path)

    parent_dir = "variants/" if output_dir is None else output_dir

    # Everything from this run goes into its own timestamped subdirectory
    run_dir_name = "variants_" + FileManager.get_datetime_string()
    output_dir = os.path.join(parent_dir, run_dir_name)
    FileManager.ensure_directory_exists(output_dir)

    bam_handler = BamHandler(bam_file_path=bam_path)
    fasta_handler = FastaHandler(reference_path)

    contig_names = sort_chromosome_names(names=fasta_handler.get_contig_names(),
                                         prefix="chr")

    print("ref contig names:", contig_names)

    for contig_name in contig_names:
        print("Parsing alignments for ref contig:", contig_name)

        # Fetch alignments over the full length of the contig
        contig_length = fasta_handler.get_chr_sequence_length(contig_name)
        reads = bam_handler.get_reads(chromosome_name=contig_name,
                                      start=0,
                                      stop=contig_length)

        inserts, deletes, mismatches = parse_reads(reads=reads,
                                                   fasta_handler=fasta_handler,
                                                   chromosome_name=contig_name)

        export_variants_to_csv(output_dir=output_dir,
                               chromosome_name=contig_name,
                               mismatches=mismatches,
                               inserts=inserts,
                               deletes=deletes,
                               merge=True)
Exemplo n.º 6
0
def process_bam(bam_path, reference_path, bac_path, output_dir=None):
    """
    Gather per-alignment statistics for every reference contig, group them by read name and export them as csv.

    Each record returned by parse_reads() is prefixed with the reference contig name and grouped under its first
    element (data[0]). The grouped records are passed through aggregate_bac_data() and then to
    export_bac_data_to_csv().

    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :param bac_path: FASTA whose contig names are printed; it is not used for anything else here
    :param output_dir: where to save stats; "stats/" when None
    :return: None
    """
    if output_dir is None:
        output_dir = "stats/"

    FileManager.ensure_directory_exists(output_dir)

    ref_fasta_handler = FastaHandler(reference_path)
    bac_fasta_handler = FastaHandler(bac_path)

    ref_contig_names = ref_fasta_handler.get_contig_names()
    bac_names = bac_fasta_handler.get_contig_names()

    print(ref_contig_names)
    print(bac_names)

    records_by_read = defaultdict(list)

    for ref_contig_name in ref_contig_names:
        ref_contig_length = ref_fasta_handler.get_chr_sequence_length(ref_contig_name)

        # NOTE(review): both handlers are re-opened on every iteration, as in the original flow
        ref_fasta_handler = FastaHandler(reference_file_path=reference_path)
        bam_handler = BamHandler(bam_file_path=bam_path)

        reads = bam_handler.get_reads(chromosome_name=ref_contig_name,
                                      start=0,
                                      stop=ref_contig_length)

        read_data = parse_reads(reads=reads,
                                fasta_handler=ref_fasta_handler,
                                chromosome_name=ref_contig_name)

        for record in read_data:
            # Group by the record's first field and remember which reference contig it came from
            records_by_read[record[0]].append([ref_contig_name] + record)

    # Alternative: filter_supplementaries_by_largest(records_by_read)
    aggregated = aggregate_bac_data(records_by_read)

    export_bac_data_to_csv(read_data=aggregated,
                           output_dir=output_dir,
                           bam_path=bam_path)
Exemplo n.º 7
0
def generate_window_fasta(bam_file_path,
                          reference_file_path,
                          chromosome_name,
                          window,
                          output_dir,
                          exclude_loose_ends=True):
    """
    Write the read segments and the reference segment of a single window to two FASTA files.

    The files are named "<chromosome>_<start>_<stop>.fasta" (reads) and "<chromosome>_<start>_<stop>_ref.fasta"
    (reference) inside output_dir. The length of every read segment and of the reference segment is printed.

    :param bam_file_path: path given to BamHandler
    :param reference_file_path: path given to FastaHandler
    :param chromosome_name: chromosome the window lies on
    :param window: indexable pair, window[0] is the pileup start and window[1] the pileup end
    :param output_dir: directory for the FASTA files (created if needed)
    :param exclude_loose_ends: forwarded to get_aligned_segments()
    :return: False when get_aligned_segments() returned None for the sequences, True otherwise
    """
    bam_handler = BamHandler(bam_file_path)
    fasta_handler = FastaHandler(reference_file_path)

    ref_sequence, read_ids, sequences, reversal_statuses = get_aligned_segments(
        fasta_handler=fasta_handler,
        bam_handler=bam_handler,
        chromosome_name=chromosome_name,
        pileup_start=window[0],
        pileup_end=window[1],
        include_ref=True,
        exclude_loose_ends=exclude_loose_ends)

    # Nothing to write for this window
    if sequences is None:
        return False

    for sequence in sequences:
        print(len(sequence))

    print(len(ref_sequence))

    FileManager.ensure_directory_exists(output_dir)

    # Read segments
    name_parts = [chromosome_name, str(window[0]), str(window[1])]
    sequences_output_path = os.path.join(output_dir, '_'.join(name_parts) + ".fasta")
    fasta_writer = FastaWriter(sequences_output_path)
    fasta_writer.write_sequences(sequences)

    # Reference segment, same name with a "ref" suffix
    ref_name_parts = [chromosome_name, str(window[0]), str(window[1]), "ref"]
    ref_output_path = os.path.join(output_dir, '_'.join(ref_name_parts) + ".fasta")
    fasta_writer = FastaWriter(ref_output_path)
    fasta_writer.write_sequences([ref_sequence])

    return True
Exemplo n.º 8
0
def main(bam_file_path, cutoff, contig_name):
    """
    Tally run lengths per character over the read sequences of a BAM and print them.

    Reads that are low quality (mapping quality <= 5), secondary, unmapped or QC-failed are skipped. Counting
    stops once more than `cutoff` reads have passed the filter. Output is one ">character" header per character
    followed by "length count" lines, both in sorted order; progress messages go to stderr.

    :param bam_file_path: path given to BamHandler
    :param cutoff: maximum number of accepted reads to count
    :param contig_name: passed as chromosome_name to BamHandler.get_reads(), with no start/stop bound
    :return: None
    """
    bam_handler = BamHandler(bam_file_path)
    reads = bam_handler.get_reads(chromosome_name=contig_name, start=None, stop=None)

    # character -> Counter of run length -> number of observations
    all_counts = defaultdict(Counter)

    sys.stderr.write("reading file...\n")
    sys.stderr.flush()

    n_accepted = 0
    for read in reads:
        rejected = (read.mapping_quality <= 5 or read.is_secondary or read.is_unmapped
                    or read.is_qcfail)
        if rejected:
            continue

        n_accepted += 1

        if n_accepted % 100 == 0:
            sys.stderr.write("\rParsed %d reads" % n_accepted)

        if n_accepted > cutoff:
            break

        read_counts = count_runlength_per_character(read.query_sequence)

        for character in read_counts:
            all_counts[character] += read_counts[character]

    sys.stderr.write("\n")

    for character in sorted(all_counts):
        print(">%s" % character)
        length_counts = all_counts[character]
        for length in sorted(length_counts.keys()):
            print(length, length_counts[length])
Exemplo n.º 9
0
def generate_runlength_frequency_matrix(runlength_ref_sequence_path,
                                        assembly_vs_ref_bam_path,
                                        runlength_ref_sequences,
                                        runlength_assembly_sequences):
    """
    Yield one true-vs-observed run length frequency matrix per reference chromosome.

    For every chromosome name in runlength_ref_sequences, the alignments spanning the whole chromosome are fetched
    from the BAM and passed to parse_reads() with a zero-initialised float64 matrix of shape
    [2, 4, MAX_RUNLENGTH + 1, MAX_RUNLENGTH + 1]. parse_reads() is presumably what fills the matrix in place.

    :param runlength_ref_sequence_path: path given to FastaHandler (the run length encoded reference)
    :param assembly_vs_ref_bam_path: path given to BamHandler (the alignment to read from)
    :param runlength_ref_sequences: container keyed by chromosome name; each entry is indexed with LENGTHS
    :param runlength_assembly_sequences: forwarded unchanged to parse_reads()
    :return: generator of matrices, one per chromosome (yielded even if no reads were parsed)
    """
    for chromosome_name in runlength_ref_sequences:
        frequency_matrix = numpy.zeros([2, 4, MAX_RUNLENGTH + 1, MAX_RUNLENGTH + 1],
                                       dtype=numpy.float64)

        # Fresh handlers are opened for every chromosome
        bam_handler = BamHandler(assembly_vs_ref_bam_path)
        fasta_handler = FastaHandler(runlength_ref_sequence_path)

        # Fetch everything between position 0 and the end of the chromosome
        chromosome_end = fasta_handler.get_chr_sequence_length(chromosome_name)
        reads = bam_handler.get_reads(chromosome_name=chromosome_name,
                                      start=0,
                                      stop=chromosome_end)

        ref_runlengths = runlength_ref_sequences[chromosome_name][LENGTHS]
        parse_reads(chromosome_name=chromosome_name,
                    fasta_handler=fasta_handler,
                    reads=reads,
                    complete_ref_runlengths=ref_runlengths,
                    runlength_assembly_sequences=runlength_assembly_sequences,
                    matrix=frequency_matrix)

        yield frequency_matrix
Exemplo n.º 10
0
def parse_bam(bam_path, reference_path):
    """
    Accumulate alignment summary statistics over every contig of the reference.

    count_cigar_operations() is called once per reference contig; its six return values are fed back in on the
    next call so that they accumulate across contigs.

    :param bam_path: path given to BamHandler (re-opened for every contig)
    :param reference_path: path given to FastaHandler; supplies contig names and lengths
    :return: tuple (chromosomal_cigar_counts, n_alignments, n_primary, n_supplementary, n_secondary,
             map_qualities) as last returned by count_cigar_operations(), or the initial empty values if the
             reference has no contigs
    """
    fasta_handler = FastaHandler(reference_path)
    contig_names = fasta_handler.get_contig_names()

    # Running totals, threaded through count_cigar_operations()
    chromosomal_cigar_counts = defaultdict(lambda: defaultdict(int))
    n_alignments = n_primary = n_supplementary = n_secondary = 0
    map_qualities = IterativeHistogram(start=0, stop=60, n_bins=6)

    for contig_name in contig_names:
        bam_handler = BamHandler(bam_path)

        contig_length = fasta_handler.get_chr_sequence_length(contig_name)
        reads = bam_handler.get_reads(chromosome_name=contig_name,
                                      start=0,
                                      stop=contig_length)

        (chromosomal_cigar_counts,
         n_alignments,
         n_primary,
         n_supplementary,
         n_secondary,
         map_qualities) = count_cigar_operations(reads=reads,
                                                 chromosome_name=contig_name,
                                                 chromosomal_cigar_counts=chromosomal_cigar_counts,
                                                 n_alignments=n_alignments,
                                                 n_primary=n_primary,
                                                 n_supplementary=n_supplementary,
                                                 n_secondary=n_secondary,
                                                 map_qualities=map_qualities)

    return (chromosomal_cigar_counts, n_alignments, n_primary,
            n_supplementary, n_secondary, map_qualities)
Exemplo n.º 11
0
def generate_collapsed_data(bam_file_path,
                            reference_file_path,
                            vcf_path,
                            bed_path,
                            chromosome_name,
                            start_position,
                            end_position,
                            generate_from_vcf=False):
    """
    Fetch aligned segments for windows of a region, collapse repeated characters and print the result.

    Windows come from get_variant_windows() when generate_from_vcf is true, otherwise from
    get_non_variant_windows(). NOTE: the process exits via exit() right after the first window (index 0) of a
    chromosome has been printed.

    :param bam_file_path: path given to BamHandler
    :param reference_file_path: path given to FastaHandler
    :param vcf_path: forwarded to the window generator
    :param bed_path: forwarded to get_non_variant_windows() only
    :param chromosome_name: forwarded to the window generator
    :param start_position: forwarded to the window generator
    :param end_position: forwarded to the window generator
    :param generate_from_vcf: choose variant windows (True) or non-variant windows (False)
    :return: None
    """
    bam_handler = BamHandler(bam_file_path)
    fasta_handler = FastaHandler(reference_file_path)

    region = dict(vcf_path=vcf_path,
                  chromosome_name=chromosome_name,
                  start_position=start_position,
                  end_position=end_position)

    if generate_from_vcf:
        chromosomal_windows = get_variant_windows(**region)
    else:
        chromosomal_windows = get_non_variant_windows(bed_path=bed_path, **region)

    for window_chromosome in chromosomal_windows:
        for index, window in enumerate(chromosomal_windows[window_chromosome]):
            pileup_start, pileup_end = window[0], window[1]

            print(pileup_start, pileup_end)

            ref_sequence, read_ids, sequences = get_aligned_segments(
                fasta_handler=fasta_handler,
                bam_handler=bam_handler,
                chromosome_name=window_chromosome,
                pileup_start=pileup_start,
                pileup_end=pileup_end)

            collapsed_sequences, repeat_counts = collapse_repeats(sequences)
            print_collapsed_segments(collapsed_sequences, repeat_counts)

            # Stop the whole process after the first window
            if index == 0:
                exit()
Exemplo n.º 12
0
def generate_data(bam_file_path,
                  reference_file_path,
                  vcf_path,
                  bed_path,
                  chromosome_name,
                  start_position,
                  end_position,
                  generate_from_vcf=False):
    """
    Fetch aligned read segments for windows lying between two genomic coordinates.

    Windows come from get_variant_windows() when generate_from_vcf is true, otherwise from
    get_non_variant_windows(). The window bounds are printed; the fetched segments themselves are not used
    further. NOTE: the process exits via exit() once the window with index 10 of a chromosome has been fetched.

    :param bam_file_path: path given to BamHandler
    :param reference_file_path: path given to FastaHandler
    :param vcf_path: forwarded to the window generator
    :param bed_path: forwarded to get_non_variant_windows() only
    :param chromosome_name: forwarded to the window generator
    :param start_position: forwarded to the window generator
    :param end_position: forwarded to the window generator
    :param generate_from_vcf: choose variant windows (True) or non-variant windows (False)
    :return: None
    """
    bam_handler = BamHandler(bam_file_path)
    fasta_handler = FastaHandler(reference_file_path)

    region = dict(vcf_path=vcf_path,
                  chromosome_name=chromosome_name,
                  start_position=start_position,
                  end_position=end_position)

    if generate_from_vcf:
        chromosomal_windows = get_variant_windows(**region)
    else:
        chromosomal_windows = get_non_variant_windows(bed_path=bed_path, **region)

    for window_chromosome in chromosomal_windows:
        for index, window in enumerate(chromosomal_windows[window_chromosome]):
            pileup_start, pileup_end = window[0], window[1]

            print(pileup_start, pileup_end)

            # The result is unpacked (three values expected) but otherwise unused
            _ref_sequence, _read_ids, _sequences = get_aligned_segments(
                fasta_handler=fasta_handler,
                bam_handler=bam_handler,
                chromosome_name=window_chromosome,
                pileup_start=pileup_start,
                pileup_end=pileup_end)

            # Stop the whole process after the eleventh window
            if index == 10:
                exit()
Exemplo n.º 13
0
def get_chromosome_stats(genome_data, reference_path, chromosome_name, start,
                         stop, output_dir, bam_path):
    """
    Parse the alignments of one chromosome region and export a summary csv for it.

    :param genome_data: list-like accumulator; the chromosome-level data from parse_reads() is appended to it
    :param reference_path: path given to FastaHandler
    :param chromosome_name: chromosome to fetch alignments for
    :param start: start of the region passed to BamHandler.get_reads()
    :param stop: stop of the region passed to BamHandler.get_reads()
    :param output_dir: forwarded to export_chromosome_summary_to_csv()
    :param bam_path: path given to BamHandler, also forwarded to the exporter
    :return: None
    """
    fasta_handler = FastaHandler(reference_file_path=reference_path)
    bam_handler = BamHandler(bam_file_path=bam_path)

    # parse_reads() receives the full chromosome length, independent of the start/stop region
    full_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    region_reads = bam_handler.get_reads(chromosome_name=chromosome_name,
                                         start=start,
                                         stop=stop)

    parsed = parse_reads(reads=region_reads,
                         chromosome_name=chromosome_name,
                         chromosome_length=full_length,
                         fasta_handler=fasta_handler)
    read_data, chromosome_data = parsed

    genome_data.append(chromosome_data)

    export_chromosome_summary_to_csv(read_data=read_data,
                                     chromosome_data=chromosome_data,
                                     output_dir=output_dir,
                                     bam_path=bam_path,
                                     chromosome_name=chromosome_name)
Exemplo n.º 14
0
def test_window(bam_file_path,
                reference_file_path,
                chromosome_name,
                window,
                output_dir,
                save_data=True,
                print_results=False):
    """
    Fetch the aligned segments of a single window and optionally print and/or save them.

    :param bam_file_path: path given to BamHandler
    :param reference_file_path: path given to FastaHandler
    :param chromosome_name: chromosome the window lies on
    :param window: indexable pair, window[0] is the pileup start and window[1] the pileup end
    :param output_dir: directory for "test_<pileup_start>.fasta" (created if it does not exist)
    :param save_data: write the read segments to a FASTA file
    :param print_results: print reference and read segments via print_segments()
    :return: None
    """
    bam_handler = BamHandler(bam_file_path)
    fasta_handler = FastaHandler(reference_file_path)

    pileup_start, pileup_end = window[0], window[1]

    ref_sequence, read_ids, sequences = get_aligned_segments(
        fasta_handler=fasta_handler,
        bam_handler=bam_handler,
        chromosome_name=chromosome_name,
        pileup_start=pileup_start,
        pileup_end=pileup_end)

    if print_results:
        print_segments(ref_sequence, sequences)

    if not save_data:
        return

    output_path = os.path.join(output_dir, "".join(["test_", str(pileup_start), ".fasta"]))

    if not os.path.exists(output_dir):
        FileManager.ensure_directory_exists(output_dir)

    fasta_writer = FastaWriter(output_path)
    fasta_writer.write_sequences(sequences)
Exemplo n.º 15
0
def main():
    """
    Plot the aligned segments of the first 1000 reference positions of the first contig as an image.

    Every alignment returned by get_aligned_segments() is mapped element-wise through get_encoding(), the
    resulting rows are stacked into a float array, negated and shown with pyplot.imshow().
    Input paths are hard-coded below.

    :return: None
    """
    # bam_file_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_from_sequence_2019_3_27_14_59_24_409353/sequence_subset_test_60x_10kb_rle_VS_refEcoli_rle.sorted.bam"
    # ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"

    bam_file_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_from_runnie_output_2019_4_8_17_33_14_191911/runnie_subset_test_60x_10kb_rle_VS_refEcoli_rle.sorted.bam"
    ref_fasta_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_from_runnie_output_2019_4_8_17_33_14_191911/refEcoli_rle.fasta"
    # -------------------------------------------------------------------------

    fasta_handler = FastaHandler(ref_fasta_path)
    bam_handler = BamHandler(bam_file_path)

    # Only the first contig of the reference is plotted
    contig_names = fasta_handler.get_contig_names()
    chromosome_name = contig_names[0]

    pileup_start = 0
    pileup_end = pileup_start + 1000  # add random variation here ?

    aligned_segments = get_aligned_segments(fasta_handler=fasta_handler,
                                            bam_handler=bam_handler,
                                            chromosome_name=chromosome_name,
                                            pileup_start=pileup_start,
                                            pileup_end=pileup_end,
                                            include_ref=True)

    encoding = [list(map(get_encoding, alignment))
                for alignment in aligned_segments.values()]

    # numpy.float was only an alias of the builtin float and was removed in NumPy 1.24;
    # the builtin selects the same float64 dtype on every NumPy version.
    encoding = -numpy.array(encoding, dtype=float)

    pyplot.imshow(encoding)
    pyplot.show()
    pyplot.close()
Exemplo n.º 16
0
def get_chromosome_data(bam_path, reference_path, chromosome_name, output_dir,
                        centromere_table_path, gap_table_path,
                        segdup_table_path, genome_data):
    """
    Parse all alignments of one chromosome, export a summary csv and plot the aligned contigs.

    :param bam_path: path given to BamHandler, also forwarded to the exporter and the plot
    :param reference_path: path given to FastaHandler
    :param chromosome_name: chromosome to process (fetched over its full length)
    :param output_dir: forwarded to export_chromosome_summary_to_csv() and plot_contigs()
    :param centromere_table_path: optional table read with read_centromere_table(); None to skip
    :param gap_table_path: optional table read with read_gap_table(); None to skip
    :param segdup_table_path: optional table read with read_gap_table() and size_cutoff=10000; None to skip
    :param genome_data: list-like accumulator; the chromosome-level data from parse_reads() is appended to it
    :return: None
    """
    fasta_handler = FastaHandler(reference_path)
    bam_handler = BamHandler(bam_file_path=bam_path)

    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    reads = bam_handler.get_reads(chromosome_name=chromosome_name,
                                  start=0,
                                  stop=chromosome_length)

    read_data, chromosome_data = parse_reads(reads=reads,
                                             fasta_handler=fasta_handler,
                                             chromosome_name=chromosome_name,
                                             chromosome_length=chromosome_length)

    genome_data.append(chromosome_data)

    # Alignment-length-weighted identity; the denominator is floored at 1e-9 so that a chromosome
    # without any aligned bases yields 0 instead of a division error
    weighted_identity_sum = sum(x[ALIGNMENT_LENGTH] * x[SEQUENCE_IDENTITY] for x in read_data)
    aligned_base_sum = sum(x[ALIGNMENT_LENGTH] for x in read_data)
    total_identity = round(weighted_identity_sum / max(1e-9, aligned_base_sum), 6)

    export_chromosome_summary_to_csv(read_data=read_data,
                                     chromosome_data=chromosome_data,
                                     output_dir=output_dir,
                                     bam_path=bam_path,
                                     chromosome_name=chromosome_name)

    # Optional annotation tracks; each stays None when no table path was supplied
    centromere_coordinates = None
    if centromere_table_path is not None:
        centromere_coordinates = read_centromere_table(centromere_table_path=centromere_table_path,
                                                       target_chromosome_name=chromosome_name)

    gap_coordinates = None
    if gap_table_path is not None:
        gap_coordinates = read_gap_table(table_path=gap_table_path,
                                         target_chromosome_name=chromosome_name)

    segdup_coordinates = None
    if segdup_table_path is not None:
        segdup_coordinates = read_gap_table(table_path=segdup_table_path,
                                            target_chromosome_name=chromosome_name,
                                            size_cutoff=10000)

    figure, _axes = plot_contigs(output_dir=output_dir,
                                 read_data=read_data,
                                 chromosome_name=chromosome_name,
                                 chromosome_length=chromosome_length,
                                 total_identity=total_identity,
                                 bam_path=bam_path,
                                 centromere_coordinates=centromere_coordinates,
                                 gap_coordinates=gap_coordinates,
                                 segdup_coordinates=segdup_coordinates,
                                 show=False)

    pyplot.close(figure)
Exemplo n.º 17
0
def generate_window_run_length_encoding(bam_file_path,
                                        reference_file_path,
                                        chromosome_name,
                                        window,
                                        output_dir,
                                        sort_sequences_by_length=False,
                                        reverse_sort=False,
                                        two_pass=False,
                                        save_data=True,
                                        print_results=False,
                                        plot_results=False,
                                        counter=None,
                                        n_chunks=None):
    """
    Run the pileup generator for a single specified window.

    The read and reference segments of the window are collapsed with collapse_repeats(), aligned with
    get_spoa_alignment() and converted to one-hot pileup/repeat matrices plus a reversal matrix. The matrices are
    saved only if the aligned reference, with gaps removed, equals the collapsed reference; otherwise a diagnostic
    is printed.

    :param bam_file_path: path given to BamHandler
    :param reference_file_path: path given to FastaHandler
    :param chromosome_name: chromosome the window lies on
    :param window: indexable pair, window[0] is the pileup start and window[1] the pileup end
    :param output_dir: forwarded to save_run_length_training_data()
    :param sort_sequences_by_length: print the read segments, sort them by length and print them again
    :param reverse_sort: sort longest first when sort_sequences_by_length is set
    :param two_pass: forwarded to get_spoa_alignment()
    :param save_data: save the matrices when the aligned reference matches the true reference
    :param print_results: print segments and alignment strings
    :param plot_results: plot the matrices with plot_runlength_prediction_stranded()
    :param counter: optional object with a numeric `value` attribute, incremented once per processed window
    :param n_chunks: total number of windows, used as the denominator of the progress percentage; no
                     percentage is written when it is None or 0
    :return: None
    """
    bam_handler = BamHandler(bam_file_path)
    fasta_handler = FastaHandler(reference_file_path)

    pileup_start = window[0]
    pileup_end = window[1]  # add random variation here ?

    ref_sequence, read_ids, sequences, reversal_statuses = get_aligned_segments(
        fasta_handler=fasta_handler,
        bam_handler=bam_handler,
        chromosome_name=chromosome_name,
        pileup_start=pileup_start,
        pileup_end=pileup_end,
        include_ref=True)

    # No reads in this window: nothing to encode (the counter is intentionally left untouched)
    if sequences is None:
        return

    if sort_sequences_by_length:
        for sequence in sequences:
            print(sequence)
        print()

        sequences = sorted(sequences, key=len, reverse=reverse_sort)
        for sequence in sequences:
            print(sequence)

        print("ref", ref_sequence)

    sequences, repeats = collapse_repeats(sequences)
    ref_sequence, ref_repeats = collapse_repeats([ref_sequence])
    ref_sequence = ref_sequence[0]

    alignments, ref_alignment = get_spoa_alignment(sequences=sequences,
                                                   ref_sequence=ref_sequence,
                                                   two_pass=two_pass)

    pileup_matrix, pileup_repeat_matrix = convert_collapsed_alignments_to_one_hot_tensor(
        alignments, repeats, fixed_coverage=False)

    reference_matrix, reference_repeat_matrix = convert_collapsed_alignments_to_one_hot_tensor(
        ref_alignment, ref_repeats, fixed_coverage=False)

    reversal_matrix = convert_reversal_statuses_to_integer_matrix(
        reverse_statuses=reversal_statuses, pileup_matrix=pileup_matrix)

    if plot_results:
        n_channels, height, width = pileup_matrix.shape

        x_pileup = pileup_matrix.reshape([n_channels, height, width])
        y_pileup = reference_matrix.reshape([5, 1, width])
        x_repeat = pileup_repeat_matrix.reshape([1, height, width])
        y_repeat = reference_repeat_matrix.reshape([1, width])
        reversal = reversal_matrix.reshape([1, height, width])

        x_pileup_flat = flatten_one_hot_tensor(x_pileup)
        y_pileup_flat = flatten_one_hot_tensor(y_pileup)
        plot_runlength_prediction_stranded(x_pileup=x_pileup_flat,
                                           x_repeat=x_repeat.squeeze(),
                                           y_pileup=y_pileup_flat,
                                           y_repeat=y_repeat,
                                           reversal=reversal.squeeze(),
                                           show_reversal=False,
                                           label=True)

    if print_results:
        print_segments(ref_sequence, sequences)

        for a, alignstring in enumerate(alignments):
            print("{0:15s} {1:s}".format(str(a), alignstring))

        for alignstring in ref_alignment:
            print("{0:15s} {1:s}".format("ref", alignstring))

    # The aligned reference with gap characters removed must reproduce the collapsed reference
    ungapped_ref_alignment = ref_alignment[0].replace("-", '')

    if ungapped_ref_alignment != ref_sequence:
        print("Aligned reference does not match true reference at [%d,%d]" %
              (pileup_start, pileup_end))
        print("unaligned:\t", ref_sequence)
        # Print the whole ungapped alignment string, i.e. the value that was actually compared
        print("aligned:\t", ungapped_ref_alignment)

    elif save_data:
        save_run_length_training_data(
            output_dir=output_dir,
            pileup_matrix=pileup_matrix,
            reference_matrix=reference_matrix,
            pileup_repeat_matrix=pileup_repeat_matrix,
            reference_repeat_matrix=reference_repeat_matrix,
            reversal_matrix=reversal_matrix,
            chromosome_name=chromosome_name,
            start=pileup_start)

    if counter is not None:
        counter.value += 1

        # A percentage can only be reported when the total number of chunks is known and non-zero
        if n_chunks:
            sys.stdout.write('\r' + "%.2f%% Completed" %
                             (100 * counter.value / n_chunks))
Exemplo n.º 18
0
def main():
    """
    Run-length encode a reference and a read FASTA, align the RLE reads to the RLE reference, and print a preview
    of the read segments found in one fixed window of the first contig.

    All input paths are hard-coded below. RLE FASTA paths are placed in a fresh timestamped directory under
    "output/". For every key returned by get_read_segments, the key and the first 10 entries of its sequence and
    of its lengths are printed.

    :return:
    """
    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    read_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/guppy/subsampled/11-29/r94_ec_rad2.30x-30kb.fasta"

    # Alternative read sets:
    # read_fasta_path = "/home/ryan/data/Nanopore/ecoli/runnie/v2/rad2_pass_runnie_0_1_10_11_12_13_v2.fa"
    # read_fasta_path = "/home/ryan/software/shasta/output/run_2019_3_23_14_29_ecoli_wg_guppy_NO_BAYES/Assembly.fasta"
    # read_fasta_path = "/home/ryan/software/shasta/output/run_2019_3_23_15_40_ecoli_wg_guppy_BAYES/Assembly.fasta"
    # read_fasta_path = "/home/ryan/data/Nanopore/ecoli/runnie/rad2_pass_runnie_0_v2.fa"

    # ---- TEST DATA ----
    # ref_fasta_path = "/home/ryan/code/runlength_analysis/data/synthetic_runlength_test_2019_3_25_13_14_17_762846_ref.fasta"
    # read_fasta_path = "/home/ryan/code/runlength_analysis/data/synthetic_runlength_test_2019_3_25_13_14_17_762846_reads.fasta"
    # -------------------

    # Every run gets its own timestamped output directory
    run_directory_name = "runlength_matrix_from_sequence_" + FileManager.get_datetime_string()
    output_dir = os.path.join("output/", run_directory_name)
    FileManager.ensure_directory_exists(output_dir)

    def rle_fasta_path(source_path):
        # Drop everything from the last "." of the file name onward, then add the RLE suffix
        stem = os.path.basename(source_path).rpartition(".")[0]
        return os.path.join(output_dir, stem + "_rle.fasta")

    runlength_ref_fasta_path = rle_fasta_path(ref_fasta_path)
    runlength_read_fasta_path = rle_fasta_path(read_fasta_path)

    sys.stderr.write("RL encoding fasta...\n")

    runlength_ref_sequences = runlength_encode_fasta(fasta_sequence_path=ref_fasta_path)
    runlength_read_sequences = runlength_encode_fasta(fasta_sequence_path=read_fasta_path)

    sys.stderr.write("Aligning RLE fasta...\n")

    read_vs_ref_bam_path = align_as_RLE(runlength_reference_path=runlength_ref_fasta_path,
                                        runlength_ref_sequences=runlength_ref_sequences,
                                        runlength_read_path=runlength_read_fasta_path,
                                        runlength_read_sequences=runlength_read_sequences,
                                        output_dir=output_dir)

    bam_handler = BamHandler(read_vs_ref_bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    # Only the first contig of the RLE reference is inspected
    chromosome_name = fasta_handler.get_contig_names()[0]
    print(fasta_handler.get_chr_sequence_length(chromosome_name))

    window_start = 100000
    sequences, lengths = get_read_segments(fasta_handler=fasta_handler,
                                           bam_handler=bam_handler,
                                           chromosome_name=chromosome_name,
                                           pileup_start=window_start,
                                           pileup_end=window_start + 100,
                                           runlength_ref_sequences=runlength_ref_sequences,
                                           read_data=runlength_read_sequences)

    # Preview: key, then the first 10 sequence entries and first 10 lengths
    for read_key in sequences:
        print(read_key)
        print(sequences[read_key][:10])
        print(lengths[read_key][:10])
Exemplo n.º 19
0
def main():
    """
    Measure run-length consensus accuracy over the first 10 windows of the first contig.

    Steps, all driven by the hard-coded paths below:
      1. Run-length encode the reference and the reads, then align the RLE reads to the RLE reference.
      2. Split the first contig into windows (chunk_size=1000) and, for each of the first 10, build integer
         sequence encodings, float length encodings and boolean reversal flags for the aligned reads.
      3. Call the consensus twice per window: once with the default settings and once with bayesian=False.
      4. Accumulate a confusion matrix (max_length=10) of true reference lengths vs predicted lengths for each
         mode, print both accuracies, then show and save a log10 image of each matrix in the output directory.

    NumPy dtypes are given as the builtins int/float/bool: the numpy.int/numpy.float/numpy.bool aliases were
    removed in NumPy 1.24 and raise AttributeError there.

    :return:
    """
    # ref_fasta_path = "/home/ryan/code/runlength_analysis/data/synthetic_runlength_test_2019_3_25_13_8_0_341509_ref.fasta"
    # read_fasta_path = "/home/ryan/code/runlength_analysis/data/synthetic_runlength_test_2019_3_25_13_8_0_341509_reads.fasta"
    # matrix_path = "/home/ryan/code/runnie_parser/output/runlength_matrix_from_assembly_contigs_2019_3_19_13_29_14_657613/probability_matrices_2019_3_19_13_29_19_362916.csv"

    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    read_fasta_path = "/home/ryan/code/runlength_analysis/data/sequence_subset_ecoli_guppy-runnie_60x_test.fastq"
    matrix_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_from_sequence_2019_4_5_15_29_28_403950/probability_matrices_2019_4_5_15_35_57_920301.csv"

    output_parent_dir = "output/"
    output_dir = "runlength_matrix_from_sequence_" + FileManager.get_datetime_string()
    output_dir = os.path.join(output_parent_dir, output_dir)
    FileManager.ensure_directory_exists(output_dir)

    ref_fasta_filename_prefix = ".".join(os.path.basename(ref_fasta_path).split(".")[:-1])
    runlength_ref_fasta_filename = ref_fasta_filename_prefix + "_rle.fasta"
    runlength_ref_fasta_path = os.path.join(output_dir, runlength_ref_fasta_filename)

    read_fasta_filename_prefix = ".".join(os.path.basename(read_fasta_path).split(".")[:-1])
    runlength_read_fasta_filename = read_fasta_filename_prefix + "_rle.fasta"
    runlength_read_fasta_path = os.path.join(output_dir, runlength_read_fasta_filename)

    def save_confusion_plot(confusion_matrix, plot_filename):
        # Show a log10 image of the matrix, then save the same figure into output_dir.
        # NOTE: cells with a zero count become -inf under log10.
        plot_path = os.path.join(output_dir, plot_filename)

        figure = pyplot.figure()
        axes = pyplot.axes()
        axes.set_xlabel("Predicted")
        axes.set_ylabel("True")

        pyplot.imshow(numpy.log10(confusion_matrix))
        pyplot.show()
        figure.savefig(plot_path)

        pyplot.close()

    runlength_ref_sequences = runlength_encode_fasta(fasta_sequence_path=ref_fasta_path)
    runlength_read_sequences = runlength_encode_fasta(fasta_sequence_path=read_fasta_path)

    read_vs_ref_bam_path = align_as_RLE(runlength_reference_path=runlength_ref_fasta_path,
                                        runlength_ref_sequences=runlength_ref_sequences,
                                        runlength_read_path=runlength_read_fasta_path,
                                        runlength_read_sequences=runlength_read_sequences,
                                        output_dir=output_dir)

    bam_handler = BamHandler(read_vs_ref_bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    # Only the first contig is evaluated
    contig_names = fasta_handler.get_contig_names()
    chromosome_name = contig_names[0]
    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    windows = chunk_chromosome_coordinates(chromosome_length=chromosome_length, chunk_size=1000)

    # Initialize empty confusion matrices
    total_confusion = get_runlength_confusion([], [], 10)
    total_modal_confusion = get_runlength_confusion([], [], 10)

    length_classifier = RunlengthClassifier(matrix_path)

    print("reading BAM")
    for pileup_start, pileup_end in windows[:10]:
        print("window", pileup_start, pileup_end)

        sys.stderr.write("\r%s" % pileup_start)
        aligned_ref_sequence, aligned_ref_lengths, aligned_sequences, aligned_lengths, reversal_statuses = \
            get_aligned_segments(fasta_handler=fasta_handler,
                                 bam_handler=bam_handler,
                                 chromosome_name=chromosome_name,
                                 pileup_start=pileup_start,
                                 pileup_end=pileup_end,
                                 runlength_ref_sequences=runlength_ref_sequences,
                                 read_data=runlength_read_sequences)

        # No reads here?
        if len(aligned_sequences) == 0:
            continue

        sequence_encoding = list()
        length_encoding = list()
        reversal_encoding = list()

        # One row per aligned read, in the iteration order of aligned_sequences
        for read_id in aligned_sequences.keys():
            sequence_encoding.append(list(map(get_encoding, aligned_sequences[read_id])))
            length_encoding.append(aligned_lengths[read_id])
            reversal_encoding.append(reversal_statuses[read_id])

        # Reference encodings are only consumed by the optional pileup plot below; they are still built so
        # that a malformed reference segment fails here exactly as before.
        ref_sequence_encoding = numpy.atleast_2d(
            numpy.array([list(map(get_encoding, aligned_ref_sequence))], dtype=int))
        ref_length_encoding = numpy.atleast_2d(numpy.array([aligned_ref_lengths], dtype=int))

        sequence_encoding = numpy.atleast_2d(numpy.array(sequence_encoding, dtype=int))
        length_encoding = numpy.atleast_2d(numpy.array(length_encoding, dtype=float))
        reversal_encoding = numpy.array(reversal_encoding, dtype=bool)

        # plot_runlength_pileup(sequences=-sequence_encoding,
        #                       lengths=length_encoding,
        #                       ref_sequence=-ref_sequence_encoding,
        #                       ref_lengths=ref_length_encoding)

        # Consensus with the classifier's default mode (reported as "Bayes" below)
        _, consensus_lengths = \
            get_consensus_from_runlength_pileup_encoding(length_classifier=length_classifier,
                                                         sequence_encoding=sequence_encoding,
                                                         length_encoding=length_encoding,
                                                         reversal_encoding=reversal_encoding)

        # Same pileup with bayesian=False (reported as "No Bayes" below)
        _, modal_consensus_lengths = \
            get_consensus_from_runlength_pileup_encoding(length_classifier=length_classifier,
                                                         sequence_encoding=sequence_encoding,
                                                         length_encoding=length_encoding,
                                                         reversal_encoding=reversal_encoding,
                                                         bayesian=False)

        print()
        print("PREDICTED\t", consensus_lengths[:10])
        print("TRUE\t\t", aligned_ref_lengths[:10])

        total_confusion += get_runlength_confusion(true_lengths=aligned_ref_lengths,
                                                   predicted_lengths=consensus_lengths,
                                                   max_length=10)

        total_modal_confusion += get_runlength_confusion(true_lengths=aligned_ref_lengths,
                                                         predicted_lengths=modal_consensus_lengths,
                                                         max_length=10)

    print()

    accuracy = get_accuracy_from_confusion_matrix(total_confusion)

    print("Bayes:", accuracy)

    accuracy = get_accuracy_from_confusion_matrix(total_modal_confusion)

    print("No Bayes", accuracy)

    save_confusion_plot(total_confusion, "confusion.png")
    save_confusion_plot(total_modal_confusion, "modal_confusion.png")
Exemplo n.º 20
0
def main():
    """
    Compare two run-length consensus methods over windows[10:20] of the first contig.

    Steps, all driven by the hard-coded paths below:
      1. Load every read yielded by RunlengthHandler into a dict keyed by read.id, run-length encode the
         reference, and align the reads to the RLE reference.
      2. For each window, build per-read sequence, scale, shape and mode encodings plus reversal flags. Modes
         are derived from each (scale, shape) pair via map_parameters_to_mode.
      3. Predict lengths from the modes with RunlengthClassifier and from the scale/shape parameters with
         WeibullRunlengthClassifier, plot the pileup, and accumulate one confusion matrix (max_length=10) per
         method.
      4. Print both accuracies ("Modal" and "Full"), then show and save a log10 image of each matrix.

    A window that raises any Exception is reported on stdout and skipped. Because of that handler, the removed
    numpy.int/numpy.float/numpy.bool aliases (gone since NumPy 1.24) would not crash the script but silently
    skip every window; the builtins int/float/bool are used as dtypes instead.

    :return:
    """
    # ref_fasta_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_ref.fasta"
    # runlength_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_runnie.out"

    # ref_fasta_path = "/home/ryan/code/runlength_analysis/data/synthetic_runnie_test_2019_4_8_14_33_30_333396_ref.fasta"
    # runlength_path = "/home/ryan/code/runlength_analysis/data/synthetic_runnie_test_2019_4_8_14_33_30_333396_runnie.out"

    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    runlength_path = "/home/ryan/code/runlength_analysis/data/runnie_subset_test_flipflop_regional_0to10k.out"

    # WG ecoli 60x
    matrix_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_from_runnie_WG_train_60x_guppy_2019_4_23/probability_matrices_2019_4_23_15_9_14_837893.csv"
    raw_matrix_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_from_runnie_WG_train_60x_guppy_2019_4_23/frequency_matrices_2019_4_23_15_9_14_833128.csv"

    output_parent_dir = "output/"
    output_dir = "runlength_prediction_from_runnie_output_" + FileManager.get_datetime_string()
    output_dir = os.path.join(output_parent_dir, output_dir)
    FileManager.ensure_directory_exists(output_dir)

    ref_fasta_filename_prefix = ".".join(os.path.basename(ref_fasta_path).split(".")[:-1])
    runlength_ref_fasta_filename = ref_fasta_filename_prefix + "_rle.fasta"
    runlength_ref_fasta_path = os.path.join(output_dir, runlength_ref_fasta_filename)

    assembly_fasta_filename_prefix = ".".join(os.path.basename(runlength_path).split(".")[:-1])
    runlength_assembly_fasta_filename = assembly_fasta_filename_prefix + "_rle.fasta"
    runlength_assembly_fasta_path = os.path.join(output_dir, runlength_assembly_fasta_filename)

    def save_confusion_plot(confusion_matrix, plot_filename):
        # Show a log10 image of the matrix, then save the same figure into output_dir.
        # NOTE: cells with a zero count become -inf under log10.
        plot_path = os.path.join(output_dir, plot_filename)

        figure = pyplot.figure()

        axes = pyplot.axes()
        axes.set_xlabel("Predicted")
        axes.set_ylabel("True")

        pyplot.imshow(numpy.log10(confusion_matrix))
        pyplot.show()
        figure.savefig(plot_path)

        pyplot.close()

    handler = RunlengthHandler(runlength_path)

    reads = handler.iterate_file(sequence_cutoff=sys.maxsize, print_status=True)

    # Index every read by its id so aligned segments can be looked up later
    read_data = {read.id: read for read in reads}

    print("\nRLE encoding reference sequence...")

    runlength_ref_sequences = runlength_encode_fasta(fasta_sequence_path=ref_fasta_path)

    assembly_vs_ref_bam_path = align_as_RLE(runlength_reference_path=runlength_ref_fasta_path,
                                            runlength_ref_sequences=runlength_ref_sequences,
                                            runlength_read_path=runlength_assembly_fasta_path,
                                            runlength_read_sequences=read_data,
                                            output_dir=output_dir)

    bam_handler = BamHandler(assembly_vs_ref_bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    # Only the first contig is evaluated
    contig_names = fasta_handler.get_contig_names()
    chromosome_name = contig_names[0]
    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    windows = chunk_chromosome_coordinates(chromosome_length=chromosome_length, chunk_size=1000)

    # Initialize empty confusion matrices
    total_confusion = get_runlength_confusion([], [], 10)
    total_confusion_weibull = get_runlength_confusion([], [], 10)

    length_classifier = RunlengthClassifier(matrix_path)
    # length_classifier_weibull = WeibullRunlengthClassifier(matrix_path)
    length_classifier_weibull = WeibullRunlengthClassifier(raw_matrix_path, normalize_matrix=True, pseudocount=0.05)

    print("reading BAM")
    for pileup_start, pileup_end in windows[10:20]:
        sys.stderr.write("\r%s" % pileup_start)
        aligned_ref_sequence, aligned_ref_lengths, aligned_sequences, aligned_scales, aligned_shapes, reversal_statuses = \
            get_aligned_segments(fasta_handler=fasta_handler,
                                 bam_handler=bam_handler,
                                 chromosome_name=chromosome_name,
                                 pileup_start=pileup_start,
                                 pileup_end=pileup_end,
                                 runlength_ref_sequences=runlength_ref_sequences,
                                 read_data=read_data)

        # No reads here?
        if len(aligned_sequences) == 0:
            continue

        sequence_encoding = list()
        scale_encoding = list()
        shape_encoding = list()
        modes_encoding = list()
        reversal_encoding = list()

        try:
            # One row per aligned read, in the iteration order of aligned_sequences
            for read_id in aligned_sequences.keys():
                read_scales = aligned_scales[read_id]
                read_shapes = aligned_shapes[read_id]

                sequence_encoding.append(list(map(get_encoding, aligned_sequences[read_id])))
                scale_encoding.append(read_scales)
                shape_encoding.append(read_shapes)
                # Each (scale, shape) pair is passed as a single tuple argument
                modes_encoding.append(list(map(map_parameters_to_mode, zip(read_scales, read_shapes))))
                reversal_encoding.append(reversal_statuses[read_id])

            ref_sequence_encoding = numpy.atleast_2d(
                numpy.array([list(map(get_encoding, aligned_ref_sequence))], dtype=int))
            ref_length_encoding = numpy.atleast_2d(numpy.array([aligned_ref_lengths], dtype=int))

            sequence_encoding = numpy.atleast_2d(numpy.array(sequence_encoding, dtype=int))
            scale_encoding = numpy.atleast_2d(numpy.array(scale_encoding, dtype=float))
            shape_encoding = numpy.atleast_2d(numpy.array(shape_encoding, dtype=float))
            modes_encoding = numpy.atleast_2d(numpy.array(modes_encoding, dtype=int))
            reversal_encoding = numpy.array(reversal_encoding, dtype=bool)

            # Prediction from the per-position modes only
            _, consensus_lengths = \
                get_consensus_from_modal_pileup_encoding(length_classifier=length_classifier,
                                                         sequence_encoding=sequence_encoding,
                                                         length_encoding=modes_encoding,
                                                         reversal_encoding=reversal_encoding)

            # Prediction from the scale/shape parameters
            weibull_consensus_sequence, weibull_consensus_lengths = \
                get_consensus_from_weibull_pileup_encoding(length_classifier=length_classifier_weibull,
                                                           sequence_encoding=sequence_encoding,
                                                           scale_encoding=scale_encoding,
                                                           shape_encoding=shape_encoding,
                                                           reversal_encoding=reversal_encoding)

            plot_runlength_pileup(
                sequences=-sequence_encoding,
                scales=scale_encoding,
                shapes=shape_encoding,
                modes=modes_encoding,
                ref_sequence=-ref_sequence_encoding,
                ref_lengths=ref_length_encoding,
                predicted_sequence=-numpy.atleast_2d(numpy.array(weibull_consensus_sequence, dtype=int)),
                predicted_lengths=numpy.atleast_2d(numpy.array(weibull_consensus_lengths, dtype=int)))

            print()
            print("PREDICTED\t", weibull_consensus_lengths[:10])
            print("TRUE\t\t", aligned_ref_lengths[:10])

            confusion = get_runlength_confusion(true_lengths=aligned_ref_lengths,
                                                predicted_lengths=consensus_lengths,
                                                max_length=10)

            confusion_weibull = get_runlength_confusion(true_lengths=aligned_ref_lengths,
                                                        predicted_lengths=weibull_consensus_lengths,
                                                        max_length=10)

            # Totals are only updated once both matrices for the window were computed
            total_confusion += confusion
            total_confusion_weibull += confusion_weibull

        except Exception as e:
            # Deliberate best-effort: report the failure and move on to the next window
            print(e)
            continue
    print()

    accuracy = get_accuracy_from_confusion_matrix(total_confusion)

    print("Modal: ", accuracy)

    accuracy = get_accuracy_from_confusion_matrix(total_confusion_weibull)

    print("Full: ", accuracy)

    save_confusion_plot(total_confusion, "confusion.png")
    save_confusion_plot(total_confusion_weibull, "confusion_weibull.png")
Exemplo n.º 21
0
def main():
    """
    Plot the run-length pileup of the reads aligned to one small window (6000-6050) of the first contig.

    Every read yielded by RunlengthHandler is stored in a dict keyed by read.id, the reference is run-length
    encoded, and the reads are aligned to the RLE reference. The aligned reference and read sequences for the
    window are printed, then the negated sequence encoding is plotted together with the per-read scales, shapes
    and the modes derived from each (scale, shape) pair via map_parameters_to_mode.

    Arrays use the builtin float as dtype: the numpy.float alias was removed in NumPy 1.24 and raises
    AttributeError there.

    :return:
    """
    # ref_fasta_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_ref.fasta"
    # runlength_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_runnie.out"

    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    runlength_path = "/home/ryan/code/runlength_analysis/data/runnie_subset_test_flipflop_regional_0to10k.out"

    pileup_start = 6000
    pileup_end = 6050

    output_parent_dir = "output/"
    output_dir = "runlength_pileup_test_" + FileManager.get_datetime_string()
    output_dir = os.path.join(output_parent_dir, output_dir)
    FileManager.ensure_directory_exists(output_dir)

    ref_fasta_filename_prefix = ".".join(os.path.basename(ref_fasta_path).split(".")[:-1])
    runlength_ref_fasta_filename = ref_fasta_filename_prefix + "_rle.fasta"
    runlength_ref_fasta_path = os.path.join(output_dir, runlength_ref_fasta_filename)

    assembly_fasta_filename_prefix = ".".join(os.path.basename(runlength_path).split(".")[:-1])
    runlength_assembly_fasta_filename = assembly_fasta_filename_prefix + "_rle.fasta"
    runlength_assembly_fasta_path = os.path.join(output_dir, runlength_assembly_fasta_filename)

    handler = RunlengthHandler(runlength_path)

    reads = handler.iterate_file(sequence_cutoff=sys.maxsize, print_status=True)

    # Index every read by its id so aligned segments can be looked up later
    read_data = {read.id: read for read in reads}

    print("\nRLE encoding reference sequence...")

    runlength_ref_sequences = runlength_encode_fasta(fasta_sequence_path=ref_fasta_path)

    assembly_vs_ref_bam_path = align_as_RLE(runlength_reference_path=runlength_ref_fasta_path,
                                            runlength_ref_sequences=runlength_ref_sequences,
                                            runlength_read_path=runlength_assembly_fasta_path,
                                            runlength_read_sequences=read_data,
                                            output_dir=output_dir)

    bam_handler = BamHandler(assembly_vs_ref_bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    # Only the first contig is inspected
    contig_names = fasta_handler.get_contig_names()
    chromosome_name = contig_names[0]

    # Reference lengths and reversal statuses are returned too but are not needed for this plot
    aligned_ref_sequence, _, aligned_sequences, aligned_scales, aligned_shapes, _ = \
        get_aligned_segments(fasta_handler=fasta_handler,
                             bam_handler=bam_handler,
                             chromosome_name=chromosome_name,
                             pileup_start=pileup_start,
                             pileup_end=pileup_end,
                             runlength_ref_sequences=runlength_ref_sequences,
                             read_data=read_data)

    sequence_encoding = list()
    scale_encoding = list()
    shape_encoding = list()
    modes_encoding = list()

    # Number of reads with an aligned segment in the window
    print(len(aligned_sequences))

    print("REF\t", "".join(aligned_ref_sequence))
    for read_id in aligned_sequences.keys():
        read_scales = aligned_scales[read_id]
        read_shapes = aligned_shapes[read_id]

        print("READ\t%s\t%s" % (read_id, "".join(aligned_sequences[read_id])))
        sequence_encoding.append(list(map(get_encoding, aligned_sequences[read_id])))
        scale_encoding.append(read_scales)
        shape_encoding.append(read_shapes)
        # Each (scale, shape) pair is passed as a single tuple argument
        modes_encoding.append(list(map(map_parameters_to_mode, zip(read_scales, read_shapes))))

    # The sequence encoding is negated before plotting, as in the original script
    sequence_encoding = -numpy.array(sequence_encoding, dtype=float)
    scale_encoding = numpy.array(scale_encoding, dtype=float)
    shape_encoding = numpy.array(shape_encoding, dtype=float)
    modes_encoding = numpy.array(modes_encoding, dtype=float)

    plot_runlength_pileup(sequences=sequence_encoding,
                          scales=scale_encoding,
                          shapes=shape_encoding,
                          modes=modes_encoding)
Exemplo n.º 22
0
def main():
    """
    Load reads through RunlengthHandler, align them to a run-length encoded reference, and print a preview of the
    read segments found in one fixed window of the first contig.

    All input paths are hard-coded below. RLE FASTA paths are placed in a fresh timestamped directory under
    "output/". For every key returned by get_read_segments, the key and the first 10 entries of its sequence,
    scales and shapes are printed.

    :return:
    """
    # ref_fasta_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_ref.fasta"
    # runlength_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_runnie.out"

    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    runlength_path = "/home/ryan/data/Nanopore/ecoli/runnie/out/test/rad2_pass_runnie_4_5_6_7.out"

    # Every run gets its own timestamped output directory
    run_directory_name = "runlength_matrix_from_runnie_output_" + FileManager.get_datetime_string()
    output_dir = os.path.join("output/", run_directory_name)
    FileManager.ensure_directory_exists(output_dir)

    def rle_fasta_path(source_path):
        # Drop everything from the last "." of the file name onward, then add the RLE suffix
        stem = os.path.basename(source_path).rpartition(".")[0]
        return os.path.join(output_dir, stem + "_rle.fasta")

    runlength_ref_fasta_path = rle_fasta_path(ref_fasta_path)
    runlength_read_fasta_path = rle_fasta_path(runlength_path)

    handler = RunlengthHandler(runlength_path)
    reads = handler.iterate_file(sequence_cutoff=sys.maxsize, print_status=True)

    # Index every read by its id
    read_data = {read.id: read for read in reads}

    print("\nRLE encoding reference sequence...")

    runlength_ref_sequences = runlength_encode_fasta(fasta_sequence_path=ref_fasta_path)

    assembly_vs_ref_bam_path = align_as_RLE(runlength_reference_path=runlength_ref_fasta_path,
                                            runlength_ref_sequences=runlength_ref_sequences,
                                            runlength_read_path=runlength_read_fasta_path,
                                            runlength_read_sequences=read_data,
                                            output_dir=output_dir)

    bam_handler = BamHandler(assembly_vs_ref_bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    # Only the first contig of the RLE reference is inspected
    chromosome_name = fasta_handler.get_contig_names()[0]
    # The length is looked up as in the original script, but is not used afterwards
    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    window_start = 100000
    sequences, scales, shapes = get_read_segments(fasta_handler=fasta_handler,
                                                  bam_handler=bam_handler,
                                                  chromosome_name=chromosome_name,
                                                  pileup_start=window_start,
                                                  pileup_end=window_start + 100,
                                                  runlength_ref_sequences=runlength_ref_sequences,
                                                  read_data=read_data)

    # Preview: key, then the first 10 entries of the sequence, scales and shapes
    for read_key in sequences:
        print(read_key)
        print(sequences[read_key][:10])
        print(scales[read_key][:10])
        print(shapes[read_key][:10])