def _runlength_fasta_path(fasta_path, output_dir):
    """
    Build the output path "<output_dir>/<input name without its last extension>_rle.fasta"
    """
    # splitext keeps the whole name when there is no extension, so an input called "reads" maps to
    # "reads_rle.fasta" instead of the bare "_rle.fasta"
    filename_prefix = os.path.splitext(os.path.basename(fasta_path))[0]

    return os.path.join(output_dir, filename_prefix + "_rle.fasta")


def _write_runlength_fasta(fasta_path, runlength_fasta_path):
    """
    Run-length encode every contig of one FASTA file and write the encoded sequences to another
    """
    fasta_handler = FastaHandler(fasta_path)
    names = fasta_handler.get_contig_names()

    with open(runlength_fasta_path, "w") as file:
        for name in names:
            sequence = fasta_handler.get_sequence(chromosome_name=name, start=None, stop=None)

            # Only the encoded sequence is written; the second return value (lengths) is not needed here
            sequence, _ = runlength_encode(sequence)

            file.write(">" + name + " RLE\n")
            file.write(sequence + "\n")


def iteratively_align_as_RLE(ref_fasta_path, read_fasta_path, output_dir):
    """
    Given 2 fasta files for reads and reference, iterate them, runlength encode their sequences, and write the RLE
    sequences to a new file, then align them with minimap2

    :param ref_fasta_path: path of the reference FASTA file
    :param read_fasta_path: path of the reads FASTA file
    :param output_dir: directory that receives both "*_rle.fasta" files; it is also handed to align_minimap as its
    output directory
    :return: path of the BAM file reported by align_minimap
    """
    runlength_ref_fasta_path = _runlength_fasta_path(ref_fasta_path, output_dir)
    runlength_read_fasta_path = _runlength_fasta_path(read_fasta_path, output_dir)

    print("SAVING run length fasta file:", runlength_ref_fasta_path)
    print("SAVING run length fasta file:", runlength_read_fasta_path)

    _write_runlength_fasta(ref_fasta_path, runlength_ref_fasta_path)
    _write_runlength_fasta(read_fasta_path, runlength_read_fasta_path)

    # The SAM path is returned by the aligner as well, but callers only receive the BAM path
    _, output_bam_file_path = align_minimap(output_dir=output_dir,
                                            ref_sequence_path=runlength_ref_fasta_path,
                                            reads_sequence_path=runlength_read_fasta_path)

    return output_bam_file_path
def runlength_encode_parallel(fasta_sequence_path, contig_name, runlength_sequences, min_length):
    """
    Run-length encode a single contig and store the result in a caller-supplied mapping

    :param fasta_sequence_path: path of the FASTA file that holds the contig
    :param contig_name: name of the contig to fetch and encode
    :param runlength_sequences: mapping that receives runlength_sequences[contig_name] = (characters, counts), where
    characters is a string with one entry per run and counts is a list with the length of each run
    :param min_length: contigs whose sequence is shorter than this are skipped without storing anything
    :return: None
    """
    handler = FastaHandler(fasta_sequence_path)

    try:
        sequence = handler.get_sequence(chromosome_name=contig_name, start=None, stop=None)
    except ValueError as error:
        # A failed fetch is reported and the contig is skipped, the exception is not propagated
        print(error)
        print("ERROR: pysam fetch failed on contig: %s" % contig_name)
        return

    if len(sequence) < min_length:
        return

    run_characters = []
    run_counts = []
    previous = ""

    for character in sequence:
        if character == previous:
            # Same character as before: extend the run that is currently open
            run_counts[-1] += 1
        else:
            # Character changed: open a new run of length 1
            run_characters.append(character)
            run_counts.append(1)
            previous = character

    runlength_sequences[contig_name] = ("".join(run_characters), run_counts)

    # "\r" returns to the start of the line so each progress message overwrites the previous one
    sys.stderr.write("\rRun length encoded %s " % contig_name)
def main():
    """
    Load one reference chromosome and a set of previously selected windows, locate repeats in the reference, relate
    them to the windows, and plot the split ratios and the pileups of the split repeats
    """
    # ---- Nanopore GUPPY - C ELEGANS - (dev machine) -------------------------
    reference_file_path = "/home/ryan/data/Nanopore/celegans/GCF_000002985.6_WBcel235_genomic.fasta"
    bam_file_path = "/home/ryan/data/Nanopore/celegans/all_chips_20k_Boreal_minimap2.sorted.bam"
    # -------------------------------------------------------------------------

    chromosome_name = "NC_003279.8"

    # Window sets available for this chromosome:
    #   kernel method:      "output/window_selection/NC_003279.8_0_15072434_2018_10_1_20_1"
    #   transition method:  the path assigned below (currently active)
    chromosomal_window_path = "/home/ryan/code/nanopore_assembly/output/window_selection/NC_003279.8_0_15072434_2018_10_12_10_58_56_199382"

    # Fetch the whole chromosome
    reference = FastaHandler(reference_file_path)
    contig_length = reference.get_chr_sequence_length(chromosome_name)
    reference_sequence = reference.get_sequence(chromosome_name=chromosome_name, start=0, stop=contig_length)

    windows = load_windows(chromosomal_window_path)
    repeat_positions = find_repeats(sequence=reference_sequence, repeat_threshold=1)

    # The third element (unsplit repeat windows) is returned but not used by either plot
    repeat_window_summary = locate_repeats_in_anchored_windows(windows=windows, repeat_positions=repeat_positions)
    split_counts_per_length, split_repeat_windows, _ = repeat_window_summary

    plot_split_ratios_per_length(split_counts_per_length)

    plot_pileups_for_split_repeats(split_repeat_windows=split_repeat_windows,
                                   bam_file_path=bam_file_path,
                                   reference_file_path=reference_file_path,
                                   chromosome_name=chromosome_name)
def runlength_encode_fasta(fasta_sequence_path):
    """
    Run-length encode every contig of a FASTA file, reporting progress on stderr

    :param fasta_sequence_path: path of the FASTA file to encode
    :return: dict mapping each contig name to the (bases, lengths) pair returned by runlength_encode
    """
    handler = FastaHandler(fasta_sequence_path)
    encoded_by_contig = {}

    for name in handler.get_contig_names():
        full_sequence = handler.get_sequence(chromosome_name=name, start=None, stop=None)

        collapsed_bases, run_lengths = runlength_encode(full_sequence)
        encoded_by_contig[name] = (collapsed_bases, run_lengths)

        # "\r" returns to the start of the line so each progress message overwrites the previous one
        sys.stderr.write("\rRun length encoded %s " % name)

    # Terminate the progress line
    sys.stderr.write("\n")

    return encoded_by_contig
def main():
    """
    Walk the contigs of a reference FASTA and count run lengths per character for each of them

    NOTE(review): within this function the per-contig result (character_counts) and the all_counts, output_dir and
    filename_prefix values are assigned but never consumed; the body may be incomplete.
    """
    reference_file_path = "/home/ryan/data/Nanopore/Human/paolo/LC2019/kishwar/shasta_assembly_GM24385_chr20.fasta"

    # ---- GIAB E. Coli - (dev machine) -------------------------
    # reference_file_path = "/home/ryan/data/GIAB/GRCh38_WG.fa"
    # reference_file_path = "/home/ryan/data/Nanopore/ecoli/refEcoli.fasta"
    # -------------------------------------------------------------------------

    output_dir = "output/ref_run_lengths/"
    filename_prefix = "ref_runlength_distribution"
    threshold = 5

    reference = FastaHandler(reference_file_path)
    contig_names = reference.get_contig_names()

    all_counts = defaultdict(Counter)

    sys.stderr.write("reading fasta file...\n")
    sys.stderr.flush()

    n_parsed = 0
    for chromosome_name in contig_names:
        # In a multi-contig file the contig named exactly "chr1" is the one that gets skipped.
        # NOTE(review): the original condition was the double negative `not chromosome_name != "chr1"`;
        # confirm that skipping chr1 (rather than keeping only chr1) is the intent.
        if len(contig_names) > 1 and chromosome_name == "chr1":
            continue

        n_parsed += 1

        # sys.stderr.write("Parsing chromosome %s\n" % chromosome_name)
        # sys.stderr.flush()

        contig_length = reference.get_chr_sequence_length(chromosome_name)
        reference_sequence = reference.get_sequence(chromosome_name=chromosome_name, stop=contig_length, start=0)

        character_counts = count_runlength_per_character(sequence=reference_sequence,
                                                         threshold=threshold,
                                                         chromosome_name=chromosome_name)
def runlength_encode_fasta(fasta_sequence_path):
    """
    Run-length encode every contig of a FASTA file, printing a summary line per contig

    :param fasta_sequence_path: path of the FASTA file to encode
    :return: dict mapping each contig name to the (bases, lengths) pair returned by runlength_encode
    """
    handler = FastaHandler(fasta_sequence_path)
    encoded_by_contig = {}

    for name in handler.get_contig_names():
        # Fetch the contig from position 0 up to its reported length
        contig_length = handler.get_chr_sequence_length(name)
        full_sequence = handler.get_sequence(chromosome_name=name, start=0, stop=contig_length)

        collapsed_bases, run_lengths = runlength_encode(full_sequence)
        encoded_by_contig[name] = (collapsed_bases, run_lengths)

        print(name, len(collapsed_bases), len(run_lengths))

    return encoded_by_contig
def main(reference_file_path):
    """
    Plot the run-length distribution of every contig in a reference FASTA (one PNG per contig, one subplot per
    character), plot a genome-wide AT vs GC summary, then print and write the aggregated counts

    :param reference_file_path: path of the reference FASTA file; the part of its file name before the first "."
    names the output sub-directory under "output/ref_run_lengths/"
    :return: None
    """
    # basename already strips the directories, so only the extension(s) need to be cut off
    input_prefix_name = os.path.basename(reference_file_path).split(".")[0]
    output_dir = os.path.join("output/ref_run_lengths/", input_prefix_name)
    filename_prefix = "ref_runlength_distribution"

    FileManager.ensure_directory_exists(output_dir)

    fasta_handler = FastaHandler(reference_file_path)
    contig_names = fasta_handler.get_contig_names()

    # Report the contigs, then the same contigs ordered by length
    print(contig_names)
    print(sorted([(x, fasta_handler.get_chr_sequence_length(x)) for x in contig_names], key=lambda x: x[1]))

    all_counts = defaultdict(Counter)
    raw_counts_AT = list()
    raw_counts_GC = list()

    sys.stderr.write("reading fasta file...\n")
    sys.stderr.flush()

    max_count = 100
    step = 1

    for chromosome_name in contig_names:
        # Optional filter for multi-contig references, kept disabled:
        # if len(contig_names) > 1:
        #     if not chromosome_name.startswith("chr") or "alt" in chromosome_name or "v" in chromosome_name:
        #         print("WARNING: SKIPPING CHROMOSOME %s" % chromosome_name)
        #         continue

        sys.stderr.write("Parsing chromosome %s\n" % chromosome_name)
        sys.stderr.flush()

        chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)
        reference_sequence = fasta_handler.get_sequence(chromosome_name=chromosome_name,
                                                        start=0,
                                                        stop=chromosome_length)

        character_counts = count_runlength_per_character(reference_sequence)

        # A subplot grid needs at least one row, so a contig without any counts cannot be plotted
        if len(character_counts.keys()) == 0:
            sys.stderr.write("WARNING: no run lengths found for %s, skipping plot\n" % chromosome_name)
            continue

        # squeeze=False always yields a 2D array of axes. Without it a contig with a single character class
        # returns one bare Axes object and the axes[k] indexing below fails.
        figure, axes = pyplot.subplots(nrows=len(character_counts.keys()), sharex=True, squeeze=False)
        axes = axes[:, 0]
        figure.set_size_inches(6, 12)

        for k, key in enumerate(character_counts.keys()):
            counts = character_counts[key]

            # Accumulate genome-wide totals per character and per AT/GC class
            all_counts[key] += Counter(counts)

            if key in {"C", "G"}:
                raw_counts_GC += counts

            if key in {"A", "T"}:
                raw_counts_AT += counts

            plot_counts_as_histogram(axes=axes[k], counts=counts, max_count=max_count, step=step)

            axes[k].set_ylabel(str(key))
            axes[k].set_ylim([-0.5, 10])

        axes[0].set_title(chromosome_name)

        filename = filename_prefix + "_" + chromosome_name + ".png"
        file_path = os.path.join(output_dir, filename)
        figure.savefig(file_path)
        # pyplot.show()

        # Release the figure explicitly so figures do not pile up across contigs
        pyplot.close(figure)

    # Genome-wide summary: AT runs on top, GC runs below
    figure, axes = pyplot.subplots(nrows=2)

    filename = filename_prefix + "_genomic.png"
    file_path = os.path.join(output_dir, filename)

    plot_counts_as_histogram(axes=axes[0], counts=raw_counts_AT, max_count=max_count, step=step)
    plot_counts_as_histogram(axes=axes[1], counts=raw_counts_GC, max_count=max_count, step=step)

    axes[0].set_ylabel("AT Log10 Frequency")
    axes[1].set_ylabel("GC Log10 Frequency")

    figure.savefig(file_path)
    # pyplot.show()
    pyplot.close(figure)

    print_all_counts_as_shasta_matrix(all_counts, max_count=50)
    print_all_counts(all_counts, output_dir)
def main():
    """
    Run window selection over every contig of the configured reference

    For each contig (except the one named "NC_001328.1") the region that remains after trimming 1 Mb from both ends
    is cut into fixed-size sub-regions, and select_windows is run on them in worker processes. Results go to a
    time-stamped directory under "output/window_selection/".
    """
    # ---- Nanopore - GUPPY HUMAN - (dev machine) -----------------------------
    # bam_file_path = "/home/ryan/data/Nanopore/Human/BAM/Guppy/rel5-guppy-0.3.0-chunk10k.sorted.bam"
    # reference_file_path = "/home/ryan/data/GIAB/GRCh38_WG.fa"
    # vcf_path = "/home/ryan/data/GIAB/NA12878_GRCh38_PG.vcf.gz"
    # bed_path = "/home/ryan/data/GIAB/NA12878_GRCh38_confident.bed"

    # ---- Nanopore GUPPY - C ELEGANS - (dev machine) -------------------------
    # bam_file_path = "/home/ryan/data/Nanopore/celegans/all_chips_20k_Boreal_minimap2.sorted.filtered2820.bam"
    # reference_file_path = "/home/ryan/data/Nanopore/celegans/GCF_000002985.6_WBcel235_genomic.fasta"

    # ---- Nanopore GUPPY - E. Coli - (dev machine) -------------------------
    bam_file_path = "/home/ryan/data/Nanopore/ecoli/miten/r9_ecoli_reads_vs_ref.bam"
    reference_file_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    # -------------------------------------------------------------------------

    # Run parameters, identical for every contig
    max_threads = 30
    window_size = 10000
    min_size = 20
    max_size = 80

    fasta_handler = FastaHandler(reference_file_path)
    contig_names = fasta_handler.get_contig_names()
    fasta_handler.close()

    for chromosome_name in contig_names:
        if chromosome_name == "NC_001328.1":     # mitochondrial
            continue

        print("STARTING:", chromosome_name)

        fasta_handler = FastaHandler(reference_file_path)
        chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)
        reference_sequence = fasta_handler.get_sequence(chromosome_name=chromosome_name,
                                                        start=0,
                                                        stop=chromosome_length)
        fasta_handler.close()

        # 1 Mb is trimmed from each end of the contig
        region = [0+1000000, chromosome_length-1000000]

        # Contigs of 2 Mb or less leave an empty or inverted region, there is nothing to select from
        if region[1] <= region[0]:
            print("WARNING: SKIPPING %s: length %s is too short for the trimmed region" %
                  (chromosome_name, chromosome_length))
            continue

        region_windows = chunk_region(region=region, size=window_size)
        n_chunks = len(region_windows)

        print("subregions: ", n_chunks)

        output_dir = "output/window_selection/" + str(chromosome_name) + "_" + str(region[0]) + "_" + str(region[1]) + "_" + FileManager.get_datetime_string()
        print(output_dir)

        # The manager owns a helper process; the with-block shuts it down once this contig is finished instead of
        # leaving one manager per contig to be cleaned up implicitly
        with multiprocessing.Manager() as manager:
            # Counter shared with the workers; it is passed together with n_chunks to every select_windows call
            counter = manager.Value('i', 0)

            pooled_args = generate_argument_pools(pool_size=max_threads,
                                                  bam_file_path=bam_file_path,
                                                  chromosome_name=chromosome_name,
                                                  region_windows=region_windows,
                                                  reference_sequence=reference_sequence,
                                                  min_size=min_size,
                                                  max_size=max_size,
                                                  output_dir=output_dir,
                                                  counter=counter,
                                                  n_chunks=n_chunks)

            for arg_pool in pooled_args:
                # A fresh worker pool is started for each batch of arguments, after collecting garbage
                gc.collect()
                with Pool(processes=max_threads) as pool:
                    pool.starmap(select_windows, arg_pool)

        print()
def main():
    """
    Show a histogram of run lengths for each character of one reference chromosome (one subplot per character)
    """
    # ---- Illumina (laptop) --------------------------------------------------
    # bam_file_path = "/Users/saureous/data/Platinum/chr1.sorted.bam"
    # reference_file_path = "/Users/saureous/data/Platinum/chr1.fa"
    # vcf_path = "/Users/saureous/data/Platinum/NA12878_S1.genome.vcf.gz"
    # bed_path = "/Users/saureous/data/Platinum/chr1_confident.bed"

    # ---- GIAB (dev machine) -------------------------------------------------
    # bam_file_path = "/home/ryan/data/GIAB/NA12878_GIAB_30x_GRCh37.sorted.bam"
    # reference_file_path = "/home/ryan/data/GIAB/GRCh37_WG.fa"
    # vcf_path = "/home/ryan/data/GIAB/NA12878_GRCh37.vcf.gz"
    # bed_path = "/home/ryan/data/GIAB/NA12878_GRCh38_confident.bed"

    # ---- Nanopore - GUPPY HUMAN - (dev machine) -----------------------------
    # bam_file_path = "/home/ryan/data/Nanopore/Human/BAM/Guppy/rel5-guppy-0.3.0-chunk10k.sorted.bam"
    # reference_file_path = "/home/ryan/data/GIAB/GRCh38_WG.fa"
    # vcf_path = "/home/ryan/data/GIAB/NA12878_GRCh38_PG.vcf.gz"
    # bed_path = "/home/ryan/data/GIAB/NA12878_GRCh38_confident.bed"

    # ---- Nanopore GUPPY - C ELEGANS - (dev machine) -------------------------
    bam_file_path = "/home/ryan/data/Nanopore/celegans/all_chips_20k_Boreal_minimap2.sorted.bam"
    reference_file_path = "/home/ryan/data/Nanopore/celegans/GCF_000002985.6_WBcel235_genomic.fasta"
    # -------------------------------------------------------------------------

    fasta_handler = FastaHandler(reference_file_path)

    chromosome_name = "NC_003279.8"     # celegans chr1
    # chromosome_name = "NC_003283.11"     # celegans chr5
    # chromosome_name = "1"
    # chromosome_name = "chr" + chromosome_name

    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)
    reference_sequence = fasta_handler.get_sequence(chromosome_name=chromosome_name,
                                                    start=0,
                                                    stop=chromosome_length)

    character_counts = count_runlength_per_character(reference_sequence)

    # squeeze=False always yields a 2D array of axes. Without it a sequence with a single character class returns
    # one bare Axes object and the axes[k] indexing below fails.
    figure, axes = pyplot.subplots(nrows=len(character_counts.keys()), sharex=True, sharey=True, squeeze=False)
    axes = axes[:, 0]

    step = 1

    for k, key in enumerate(character_counts.keys()):
        counts = character_counts[key]
        max_count = max(counts)

        # One bin per run length from 0 to max_count. The edges run one step past max_count because numpy closes
        # the last bin on the right: with edges ending at max_count, the runs of length max_count-1 and max_count
        # would be merged into a single bar.
        bins = numpy.arange(0, max_count + 2*step, step=step)

        # Raw (unnormalised) frequencies are the default. The "normed" keyword is not passed because it no longer
        # exists in current numpy releases.
        frequencies, bins = numpy.histogram(counts, bins=bins)

        print(bins)
        print(frequencies)
        print(bins.shape)

        # Shift the bin midpoints by half a step so each bar is centred on its integer run length
        center = (bins[:-1] + bins[1:]) / 2 - step / 2

        axes[k].bar(center, frequencies, width=step, align="center")
        axes[k].set_ylabel(str(key))
        axes[k].set_xticks(numpy.arange(0, max_count + 1))

    pyplot.show()