예제 #1
0
def _runlength_fasta_path(fasta_path, output_dir):
    """
    Build the output path for the run length encoded copy of a fasta file
    :param fasta_path: path of the original fasta file
    :param output_dir: directory the encoded copy is written to
    :return: output_dir/<basename without its last extension>_rle.fasta
    """
    # splitext strips only the final extension and, unlike splitting on ".",
    # keeps the full name when the file has no extension at all
    filename_prefix = os.path.splitext(os.path.basename(fasta_path))[0]

    return os.path.join(output_dir, filename_prefix + "_rle.fasta")


def _write_runlength_fasta(fasta_path, runlength_fasta_path):
    """
    Run length encode every contig of a fasta file and write the encoded bases as a new fasta file
    :param fasta_path: fasta file to read
    :param runlength_fasta_path: fasta file to write, each header is "<contig name> RLE"
    :return:
    """
    with open(runlength_fasta_path, "w") as file:
        fasta_handler = FastaHandler(fasta_path)

        for name in fasta_handler.get_contig_names():
            sequence = fasta_handler.get_sequence(chromosome_name=name,
                                                  start=None,
                                                  stop=None)

            # only the encoded bases are written, the run lengths are discarded
            bases, _ = runlength_encode(sequence)

            file.write(">" + name + " RLE\n")
            file.write(bases + "\n")


def iteratively_align_as_RLE(ref_fasta_path, read_fasta_path, output_dir):
    """
    Given 2 fasta files for reads and reference, iterate them, runlength encode their sequences, and write the RLE
    sequences to a new file, then align them with minimap2
    :param ref_fasta_path: fasta file containing the reference sequences
    :param read_fasta_path: fasta file containing the read sequences
    :param output_dir: directory that receives both "_rle.fasta" files and is passed to align_minimap
    :return: the BAM path returned by align_minimap
    """
    # NOTE: both output names are derived from the input basenames only, so two inputs with the same
    # basename in different directories map to the same output file
    runlength_ref_fasta_path = _runlength_fasta_path(ref_fasta_path, output_dir)
    runlength_read_fasta_path = _runlength_fasta_path(read_fasta_path, output_dir)

    print("SAVING run length fasta file:", runlength_ref_fasta_path)
    print("SAVING run length fasta file:", runlength_read_fasta_path)

    _write_runlength_fasta(ref_fasta_path, runlength_ref_fasta_path)
    _write_runlength_fasta(read_fasta_path, runlength_read_fasta_path)

    output_sam_file_path, output_bam_file_path = align_minimap(
        output_dir=output_dir,
        ref_sequence_path=runlength_ref_fasta_path,
        reads_sequence_path=runlength_read_fasta_path)

    return output_bam_file_path
예제 #2
0
def runlength_encode_parallel(fasta_sequence_path, contig_name,
                              runlength_sequences, min_length):
    """
    Run length encode a single contig of a fasta file and store the result in a caller supplied mapping
    :param fasta_sequence_path: fasta file containing the contig
    :param contig_name: name of the contig to fetch and encode
    :param runlength_sequences: mapping that receives runlength_sequences[contig_name] = (bases, counts), where
    bases is a string with one character per run and counts is a list holding the length of each run
    :param min_length: contigs shorter than this are skipped and nothing is stored for them
    :return:
    """
    fasta_handler = FastaHandler(fasta_sequence_path)

    try:
        sequence = fasta_handler.get_sequence(chromosome_name=contig_name,
                                              start=None,
                                              stop=None)
    except ValueError as e:
        # a failed fetch is reported and the contig is skipped rather than aborting the caller
        print(e)
        print("ERROR: pysam fetch failed on contig: %s" % contig_name)
        return

    if len(sequence) < min_length:
        return

    # plain lists: one entry per run, counts are ordinary python ints
    character_sequence = []
    character_counts = []
    current_character = ""

    for character in sequence:
        if character == current_character:
            # same base as the previous position, extend the open run
            character_counts[-1] += 1
        else:
            # base changed, open a new run of length 1
            character_sequence.append(character)
            character_counts.append(1)
            current_character = character

    runlength_sequences[contig_name] = ("".join(character_sequence), character_counts)

    # "\r" keeps the progress message on a single terminal line
    sys.stderr.write("\rRun length encoded %s            " % contig_name)
예제 #3
0
def main():
    """
    Script entry point with hard coded inputs: load the saved windows of one C. elegans chromosome, find repeats
    in its reference sequence, relate the repeats to the windows, and plot the results
    :return:
    """
    # ---- Nanopore GUPPY - C ELEGANS - (dev machine) -------------------------
    bam_file_path = "/home/ryan/data/Nanopore/celegans/all_chips_20k_Boreal_minimap2.sorted.bam"
    reference_file_path = "/home/ryan/data/Nanopore/celegans/GCF_000002985.6_WBcel235_genomic.fasta"
    # -------------------------------------------------------------------------

    chromosome_name = "NC_003279.8"

    # window directory produced by the "transition method"
    # (kernel method alternative: "output/window_selection/NC_003279.8_0_15072434_2018_10_1_20_1")
    chromosomal_window_path = "/home/ryan/code/nanopore_assembly/output/window_selection/NC_003279.8_0_15072434_2018_10_12_10_58_56_199382"

    # read the whole chromosome from the reference
    fasta_handler = FastaHandler(reference_file_path)
    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)
    reference_sequence = fasta_handler.get_sequence(chromosome_name=chromosome_name,
                                                    start=0,
                                                    stop=chromosome_length)

    windows = load_windows(chromosomal_window_path)

    long_repeat_positions = find_repeats(sequence=reference_sequence, repeat_threshold=1)

    repeat_summary = locate_repeats_in_anchored_windows(windows=windows,
                                                        repeat_positions=long_repeat_positions)

    # unsplit_repeat_windows is returned but not used by this script
    split_counts_per_length, split_repeat_windows, unsplit_repeat_windows = repeat_summary

    plot_split_ratios_per_length(split_counts_per_length)

    plot_pileups_for_split_repeats(split_repeat_windows=split_repeat_windows,
                                   bam_file_path=bam_file_path,
                                   reference_file_path=reference_file_path,
                                   chromosome_name=chromosome_name)
예제 #4
0
def runlength_encode_fasta(fasta_sequence_path):
    """
    Run length encode every contig of a fasta file, reporting progress on stderr
    :param fasta_sequence_path: fasta file to read
    :return: dict mapping each contig name to the (bases, lengths) pair returned by runlength_encode
    """
    fasta_handler = FastaHandler(fasta_sequence_path)

    encoded_contigs = {}

    for contig_name in fasta_handler.get_contig_names():
        # start/stop of None are passed through as-is (presumably "fetch the whole contig" - confirm in FastaHandler)
        contig_sequence = fasta_handler.get_sequence(chromosome_name=contig_name,
                                                     start=None,
                                                     stop=None)

        bases, lengths = runlength_encode(contig_sequence)
        encoded_contigs[contig_name] = (bases, lengths)

        # "\r" rewrites the same terminal line for each contig
        sys.stderr.write("\rRun length encoded %s            " % contig_name)

    # finish the progress line
    sys.stderr.write("\n")

    return encoded_contigs
예제 #5
0
def main():
    """
    Script entry point with hard coded inputs: read a reference fasta and compute per character run length counts
    for its contigs using count_runlength_per_character
    :return:
    """
    output_dir = "output/ref_run_lengths/"
    filename_prefix = "ref_runlength_distribution"
    threshold = 5

    reference_file_path = "/home/ryan/data/Nanopore/Human/paolo/LC2019/kishwar/shasta_assembly_GM24385_chr20.fasta"

    # ---- alternative references - (dev machine) ------------------------------
    # reference_file_path = "/home/ryan/data/GIAB/GRCh38_WG.fa"
    # reference_file_path = "/home/ryan/data/Nanopore/ecoli/refEcoli.fasta"
    # -------------------------------------------------------------------------

    fasta_handler = FastaHandler(reference_file_path)
    contig_names = fasta_handler.get_contig_names()

    all_counts = defaultdict(Counter)

    sys.stderr.write("reading fasta file...\n")
    sys.stderr.flush()

    c = 0
    for chromosome_name in contig_names:
        # NOTE(review): in a multi-contig file this skips "chr1" and processes every other contig;
        # confirm that is the intent rather than "process only chr1"
        if len(contig_names) > 1 and chromosome_name == "chr1":
            continue

        # number of contigs processed so far
        c += 1

        chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

        reference_sequence = fasta_handler.get_sequence(chromosome_name=chromosome_name,
                                                        start=0,
                                                        stop=chromosome_length)

        character_counts = count_runlength_per_character(sequence=reference_sequence,
                                                         threshold=threshold,
                                                         chromosome_name=chromosome_name)
예제 #6
0
def runlength_encode_fasta(fasta_sequence_path):
    """
    Run length encode the full sequence of every contig in a fasta file
    :param fasta_sequence_path: fasta file to read
    :return: dict mapping each contig name to the (bases, lengths) pair returned by runlength_encode
    """
    fasta_handler = FastaHandler(fasta_sequence_path)

    encoded_contigs = {}

    for contig_name in fasta_handler.get_contig_names():
        # fetch the contig end to end: [0, contig length)
        contig_length = fasta_handler.get_chr_sequence_length(contig_name)
        contig_sequence = fasta_handler.get_sequence(chromosome_name=contig_name,
                                                     start=0,
                                                     stop=contig_length)

        bases, lengths = runlength_encode(contig_sequence)
        encoded_contigs[contig_name] = (bases, lengths)

        # report the size of the encoding for this contig
        print(contig_name, len(bases), len(lengths))

    return encoded_contigs
예제 #7
0
def main(reference_file_path):
    """
    Plot the run length distribution of every contig in a reference fasta (one figure per contig, one subplot per
    character), plot pooled A/T and G/C distributions for the whole file, and print/write the accumulated counts
    :param reference_file_path: fasta file to analyze; the text before the first "." of its basename names the
    output subdirectory under "output/ref_run_lengths/"
    :return:
    """
    # basename already removes every directory component, so only the extension needs stripping
    input_prefix_name = os.path.basename(reference_file_path).split(".")[0]
    output_dir = os.path.join("output/ref_run_lengths/", input_prefix_name)
    filename_prefix = "ref_runlength_distribution"

    FileManager.ensure_directory_exists(output_dir)

    fasta_handler = FastaHandler(reference_file_path)
    contig_names = fasta_handler.get_contig_names()

    # contig names, then (name, length) pairs ordered by length
    print(contig_names)
    print(sorted([(x, fasta_handler.get_chr_sequence_length(x)) for x in contig_names], key=lambda x: x[1]))

    all_counts = defaultdict(lambda: Counter())
    raw_counts_AT = list()
    raw_counts_GC = list()

    sys.stderr.write("reading fasta file...\n")
    sys.stderr.flush()

    max_count = 100
    step = 1

    for chromosome_name in contig_names:
        sys.stderr.write("Parsing chromosome %s\n" % chromosome_name)
        sys.stderr.flush()

        chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

        reference_sequence = fasta_handler.get_sequence(chromosome_name=chromosome_name, start=0, stop=chromosome_length)
        character_counts = count_runlength_per_character(reference_sequence)

        if not character_counts:
            # subplots() rejects nrows=0, and there is nothing to plot or accumulate for this contig
            sys.stderr.write("WARNING: no run lengths found for %s, skipping\n" % chromosome_name)
            continue

        # squeeze=False always returns a 2D array of axes; without it a contig with a single character
        # yields a bare Axes object and indexing it with axes[k] fails
        figure, axes = pyplot.subplots(nrows=len(character_counts), sharex=True, squeeze=False)
        axes = axes[:, 0]
        figure.set_size_inches(6, 12)

        for k, key in enumerate(character_counts.keys()):
            counts = character_counts[key]
            all_counts[key] += Counter(counts)

            # pool the raw run lengths by base pair class for the genome wide plot
            if key in {"C", "G"}:
                raw_counts_GC += counts

            if key in {"A", "T"}:
                raw_counts_AT += counts

            plot_counts_as_histogram(axes=axes[k], counts=counts, max_count=max_count, step=step)

            axes[k].set_ylabel(str(key))
            axes[k].set_ylim([-0.5, 10])

        axes[0].set_title(chromosome_name)

        filename = filename_prefix + "_" + chromosome_name + ".png"
        file_path = os.path.join(output_dir, filename)
        figure.savefig(file_path)
        # close this figure explicitly instead of relying on it being pyplot's current figure
        pyplot.close(figure)

    figure, axes = pyplot.subplots(nrows=2)

    filename = filename_prefix + "_genomic.png"
    file_path = os.path.join(output_dir, filename)

    plot_counts_as_histogram(axes=axes[0], counts=raw_counts_AT, max_count=max_count, step=step)
    plot_counts_as_histogram(axes=axes[1], counts=raw_counts_GC, max_count=max_count, step=step)
    axes[0].set_ylabel("AT Log10 Frequency")
    axes[1].set_ylabel("GC Log10 Frequency")

    figure.savefig(file_path)
    pyplot.close(figure)

    print_all_counts_as_shasta_matrix(all_counts, max_count=50)
    print_all_counts(all_counts, output_dir)
예제 #8
0
def main():
    """
    Script entry point with hard coded inputs: for every contig of the reference (except "NC_001328.1"), chunk the
    contig interior into fixed size subregions and run select_windows on them in a process pool
    :return:
    """
    # ---- Nanopore - GUPPY HUMAN - (dev machine) -----------------------------
    # bam_file_path = "/home/ryan/data/Nanopore/Human/BAM/Guppy/rel5-guppy-0.3.0-chunk10k.sorted.bam"
    # reference_file_path = "/home/ryan/data/GIAB/GRCh38_WG.fa"
    # vcf_path = "/home/ryan/data/GIAB/NA12878_GRCh38_PG.vcf.gz"
    # bed_path = "/home/ryan/data/GIAB/NA12878_GRCh38_confident.bed"

    # ---- Nanopore GUPPY - C ELEGANS - (dev machine) -------------------------
    # bam_file_path = "/home/ryan/data/Nanopore/celegans/all_chips_20k_Boreal_minimap2.sorted.filtered2820.bam"
    # reference_file_path = "/home/ryan/data/Nanopore/celegans/GCF_000002985.6_WBcel235_genomic.fasta"

    # ---- Nanopore GUPPY - E. Coli - (dev machine) -------------------------
    bam_file_path = "/home/ryan/data/Nanopore/ecoli/miten/r9_ecoli_reads_vs_ref.bam"
    reference_file_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"

    # -------------------------------------------------------------------------

    # settings shared by every contig
    max_threads = 30
    window_size = 10000
    min_size = 20
    max_size = 80
    margin = 1000000    # bases excluded at each end of a contig

    fasta_handler = FastaHandler(reference_file_path)
    try:
        contig_names = fasta_handler.get_contig_names()
    finally:
        fasta_handler.close()

    for chromosome_name in contig_names:
        if chromosome_name == "NC_001328.1":    # mitochondrial
            continue

        print("STARTING:", chromosome_name)

        # the handler is reopened per contig and closed even if reading fails
        fasta_handler = FastaHandler(reference_file_path)
        try:
            chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)
            reference_sequence = fasta_handler.get_sequence(chromosome_name=chromosome_name,
                                                            start=0,
                                                            stop=chromosome_length)
        finally:
            fasta_handler.close()

        # NOTE(review): for contigs shorter than 2 * margin the region end precedes its start;
        # confirm how chunk_region handles that
        region = [0 + margin, chromosome_length - margin]

        region_windows = chunk_region(region=region, size=window_size)

        n_chunks = len(region_windows)

        print("subregions: ", n_chunks)

        output_dir = "output/window_selection/" + str(chromosome_name) + "_" + str(region[0]) + "_" + str(region[1]) + "_" + FileManager.get_datetime_string()
        print(output_dir)

        # the manager backs the counter shared with the worker processes; the with block shuts its
        # server process down once this contig is finished instead of leaving one behind per contig
        with multiprocessing.Manager() as manager:
            counter = manager.Value('i', 0)

            pooled_args = generate_argument_pools(pool_size=max_threads,
                                                  bam_file_path=bam_file_path,
                                                  chromosome_name=chromosome_name,
                                                  region_windows=region_windows,
                                                  reference_sequence=reference_sequence,
                                                  min_size=min_size,
                                                  max_size=max_size,
                                                  output_dir=output_dir,
                                                  counter=counter,
                                                  n_chunks=n_chunks)

            for arg_pool in pooled_args:
                # a fresh pool is started for every batch of arguments, after an explicit garbage collection
                gc.collect()
                with Pool(processes=max_threads) as pool:
                    pool.starmap(select_windows, arg_pool)

    print()
예제 #9
0
def main():
    """
    Script entry point with hard coded inputs: compute the run lengths of one C. elegans chromosome with
    count_runlength_per_character and show one run length histogram per character
    :return:
    """
    # ---- Illumina (laptop) --------------------------------------------------
    # bam_file_path = "/Users/saureous/data/Platinum/chr1.sorted.bam"
    # reference_file_path = "/Users/saureous/data/Platinum/chr1.fa"
    # vcf_path = "/Users/saureous/data/Platinum/NA12878_S1.genome.vcf.gz"
    # bed_path = "/Users/saureous/data/Platinum/chr1_confident.bed"

    # ---- GIAB (dev machine) -------------------------------------------------
    # bam_file_path = "/home/ryan/data/GIAB/NA12878_GIAB_30x_GRCh37.sorted.bam"
    # reference_file_path = "/home/ryan/data/GIAB/GRCh37_WG.fa"
    # vcf_path = "/home/ryan/data/GIAB/NA12878_GRCh37.vcf.gz"
    # bed_path = "/home/ryan/data/GIAB/NA12878_GRCh38_confident.bed"

    # ---- Nanopore - GUPPY HUMAN - (dev machine) -----------------------------
    # bam_file_path = "/home/ryan/data/Nanopore/Human/BAM/Guppy/rel5-guppy-0.3.0-chunk10k.sorted.bam"
    # reference_file_path = "/home/ryan/data/GIAB/GRCh38_WG.fa"
    # vcf_path = "/home/ryan/data/GIAB/NA12878_GRCh38_PG.vcf.gz"
    # bed_path = "/home/ryan/data/GIAB/NA12878_GRCh38_confident.bed"

    # ---- Nanopore GUPPY - C ELEGANS - (dev machine) -------------------------
    bam_file_path = "/home/ryan/data/Nanopore/celegans/all_chips_20k_Boreal_minimap2.sorted.bam"
    reference_file_path = "/home/ryan/data/Nanopore/celegans/GCF_000002985.6_WBcel235_genomic.fasta"

    # -------------------------------------------------------------------------

    fasta_handler = FastaHandler(reference_file_path)

    chromosome_name = "NC_003279.8"  # celegans chr1
    # chromosome_name = "NC_003283.11"     # celegans chr5
    # chromosome_name = "1"
    # chromosome_name = "chr" + chromosome_name

    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    reference_sequence = fasta_handler.get_sequence(
        chromosome_name=chromosome_name, start=0, stop=chromosome_length)

    character_counts = count_runlength_per_character(reference_sequence)

    # squeeze=False always returns a 2D array of axes; without it a single character result yields a
    # bare Axes object and indexing it with axes[k] fails
    figure, axes = pyplot.subplots(nrows=len(character_counts),
                                   sharex=True,
                                   sharey=True,
                                   squeeze=False)
    axes = axes[:, 0]

    step = 1

    for k, key in enumerate(character_counts.keys()):
        counts = character_counts[key]
        max_count = max(counts)

        # numpy's last bin is closed on both sides, so the edges must extend one step past max_count;
        # otherwise runs of length max_count - 1 and max_count are merged into the same bar
        bins = numpy.arange(0, max_count + 2 * step, step=step)

        # the "normed" keyword was removed from numpy.histogram; raw counts are the default
        frequencies, bins = numpy.histogram(counts, bins=bins)

        print(bins)
        print(frequencies)

        print(bins.shape)

        # each bar is centered on the left edge of its bin, i.e. on the run length it counts
        center = (bins[:-1] + bins[1:]) / 2 - step / 2

        axes[k].bar(center, frequencies, width=step, align="center")
        axes[k].set_ylabel(str(key))
        axes[k].set_xticks(numpy.arange(0, max_count + 1))

    pyplot.show()