def plot_kernel_distribution(pdf, cdf, bins, save=False, output_dir=None, filename=None):
    """
    Draw the cdf as a line (upper axes) and the pdf as a bar chart (lower axes), then either
    write the figure to <output_dir>/<filename>_distributions.png or show it interactively.

    :param pdf: bar heights for the lower plot
    :param cdf: values for the upper line plot
    :param bins: bin edges; must support slicing and elementwise arithmetic
    :param save: write the figure to disk if True, otherwise display it
    :param output_dir: destination directory (only used when save is True)
    :param filename: filename prefix (only used when save is True)
    :return:
    """
    bar_width = 1.0 / 100

    # Midpoint between consecutive bin edges, shifted left by half a bar width
    lower_edges = bins[:-1]
    upper_edges = bins[1:]
    center = (lower_edges + upper_edges) / 2 - bar_width / 2

    fig, axes = pyplot.subplots(nrows=2)
    cdf_axes = axes[0]
    pdf_axes = axes[1]

    cdf_axes.plot(cdf)
    pdf_axes.bar(center, pdf, width=bar_width, align="center")
    pdf_axes.set_ylabel("kernel sum")

    if not save:
        pyplot.show()
    else:
        FileManager.ensure_directory_exists(output_dir)
        path = os.path.join(output_dir, filename + "_distributions.png")
        pyplot.savefig(path)

    pyplot.close()
def main(max_threads=None):
    """
    Process a (hard-coded) runnie output file in a multiprocessing pool, writing per-job fasta
    files into a timestamped output directory, then concatenate them into a single fasta named
    after the input file and delete the intermediates.

    :param max_threads: pool size; defaults to (cpu count - 2), but never fewer than 1
    :return:
    """
    # runlength_path = "/home/ryan/data/Nanopore/ecoli/runnie/out/rad2_pass_runnie_0.out"
    runlength_path = "/home/ryan/data/Nanopore/ecoli/runnie/out/rad2_pass_runnie_0_1_10_11_12_13.out"

    run_directory_name = "runlength_matrix_from_assembly_contigs_" + FileManager.get_datetime_string()
    output_dir = os.path.join("output/version_comparison/mode/", run_directory_name)
    FileManager.ensure_directory_exists(output_dir)

    handler = RunlengthHandler(runlength_path)

    if max_threads is None:
        max_threads = max(1, multiprocessing.cpu_count() - 2)

    with multiprocessing.Pool(processes=max_threads) as pool:
        jobs = arg_iterator(handler=handler, output_dir=output_dir)

        # The index of the most recently finished job doubles as a progress counter
        for n_finished, _ in enumerate(pool.imap(arg_unpacker, jobs)):
            sys.stdout.write("\r%d" % n_finished)

    print()
    print("Concatenating files...")

    fasta_paths = FileManager.get_all_file_paths_by_type(parent_directory_path=output_dir,
                                                         file_extension=".fasta")

    input_prefix = os.path.basename(runlength_path).split(".")[0]
    concatenated_file_path = os.path.join(output_dir, input_prefix + ".fasta")

    print("Saving to file: %s" % concatenated_file_path)

    FileManager.concatenate_files(file_paths=fasta_paths, output_file_path=concatenated_file_path)
    FileManager.delete_files(fasta_paths)
def save_run_length_training_data(output_dir, pileup_matrix, reference_matrix, pileup_repeat_matrix,
                                  reference_repeat_matrix, reversal_matrix, chromosome_name, start):
    """
    Write one set of training arrays to a compressed .npz archive at
    <output_dir>/<chromosome_name>/<chromosome_name>_<start>_matrix.npz

    :param output_dir: parent directory; a per-chromosome subdirectory is created if missing
    :param pileup_matrix: stored under the key "x_pileup"
    :param reference_matrix: stored under the key "y_pileup"
    :param pileup_repeat_matrix: stored under the key "x_repeat"
    :param reference_repeat_matrix: stored under the key "y_repeat"
    :param reversal_matrix: stored under the key "reversal"
    :param chromosome_name: used for both the subdirectory name and the filename prefix
    :param start: appended to the filename to make it unique within the chromosome
    :return:
    """
    # Each chromosome gets its own subdirectory
    chromosomal_output_dir = os.path.join(output_dir, chromosome_name)
    directory_missing = not os.path.exists(chromosomal_output_dir)
    if directory_missing:
        FileManager.ensure_directory_exists(chromosomal_output_dir)

    # Filename is unique per (chromosome, start) pair
    path_prefix = os.path.join(chromosomal_output_dir, chromosome_name + "_" + str(start))
    data_path = path_prefix + "_matrix" + ".npz"

    arrays = {
        "x_pileup": pileup_matrix,
        "y_pileup": reference_matrix,
        "x_repeat": pileup_repeat_matrix,
        "y_repeat": reference_repeat_matrix,
        "reversal": reversal_matrix,
    }

    numpy.savez_compressed(data_path, **arrays)
def main():
    """
    Make a synthetic reference and a set of read observations, and save them under data/ as
    synthetic_coverage_data_marginpolish_<datetime>.tsv (one tab-separated row per observation)
    and synthetic_coverage_data_marginpolish_<datetime>_ref.fasta (the reference sequence)

    :return:
    """
    output_dir = "data/"
    FileManager.ensure_directory_exists(output_dir)

    n_coverage = 2
    ref_max_runlength = 50
    read_max_runlength = 50

    ref_sequence, observations = generate_sequences(ref_max_runlength=ref_max_runlength,
                                                    read_max_runlength=read_max_runlength,
                                                    n_coverage=n_coverage,
                                                    scale_coverage=True)

    # Both outputs share one timestamp so they can be paired up later
    datetime_string = FileManager.get_datetime_string()
    filename_prefix = "synthetic_coverage_data_marginpolish_" + datetime_string

    observations_path = os.path.join(output_dir, filename_prefix + ".tsv")

    # newline="" is what the csv module requires of files handed to csv.writer, and the context
    # manager guarantees the handle is closed even if a row fails to serialize
    with open(observations_path, "w", newline="") as observations_file:
        writer = csv.writer(observations_file, delimiter="\t")
        for line in observations:
            writer.writerow(line)

    reference_path = os.path.join(output_dir, filename_prefix + "_ref.fasta")

    with open(reference_path, "w") as reference_file:
        reference_file.write(">ref_0\n")
        reference_file.write(ref_sequence)
def main(summary_glob, output_dir, filter_decoys, args):
    """
    Aggregate the summary files matched by a glob, print per-file and overall identity statistics
    (via mmm), and draw identity violin plots, optionally against a second set of summary files.

    :param summary_glob: glob pattern for the summary files; the process exits with status 1 if
        nothing matches
    :param output_dir: directory for plots, created if needed
    :param filter_decoys: if truthy, pass the matched paths through filter_decoys_from_paths
    :param args: parsed arguments; this function reads sample, plot, comparison_glob and
        min_read_length, and passes the whole object on to aggregate_summary_data
    :return:
    """
    FileManager.ensure_directory_exists(output_dir)

    summary_file_paths = glob.glob(summary_glob)
    if not summary_file_paths:
        print("No files matched '{}'".format(summary_glob))
        sys.exit(1)

    if filter_decoys:
        print("Filtering decoy chromosomes")
        summary_file_paths = filter_decoys_from_paths(summary_file_paths)

    aggregated = aggregate_summary_data(summary_file_paths, args)
    summary_headers, summary_data, identities, identities_per_file, read_lengths_per_file, read_len_to_identity = \
        aggregated

    # all_read_lengths = list()
    # for rli in read_len_to_identity:
    #     all_read_lengths.append(rli[0])
    # all_read_lengths.sort()
    # print("top 15 read lengths: {}".format(all_read_lengths[:-15]))

    for file_name, file_identities in identities_per_file.items():
        mmm(file_identities, file_name)
    mmm(identities, "All Data")

    # replace this with sample name extractor function?
    sample_name = args.sample
    if sample_name is None:
        sample_name = summary_glob.rstrip('/').replace('/', "_").replace('*', "_")

    output_base = os.path.join(output_dir, sample_name)

    # plots
    if args.plot:
        pass
        # plot_identity_histogram(identities, title=sample_name, output_location=os.path.join(output_dir, "{}.all_identities.png".format(sample_name)))
        # plot_read_len_to_identity(read_len_to_identity, title=sample_name, output_base=os.path.join(output_dir, "{}.read_len_to_identity".format(sample_name)))
        # plot_per_file_identity_curve(identities_per_file, output_base=os.path.join(output_dir, sample_name))

    # NOTE(review): the violin plots are taken to be independent of args.plot (the bare `pass`
    # above suggests that branch holds nothing else) -- confirm against the original layout
    if args.comparison_glob is None:
        plot_per_file_identity_violin(identities_per_file, title=sample_name, output_base=output_base)
        return

    comparison_paths = glob.glob(args.comparison_glob)
    if not comparison_paths:
        raise Exception("No comparison files found for '{}'".format(args.comparison_glob))

    # TODO only for rle experiment
    # (this mutates the caller's args object before the comparison set is aggregated)
    args.min_read_length *= 0.7

    _, _, _, comparison_identities_per_file, comparison_lengths_per_file, _ = aggregate_summary_data(
        comparison_paths, args)

    plot_identity_comparison_violin(identities_per_file,
                                    comparison_identities_per_file,
                                    read_lengths_per_file,
                                    comparison_lengths_per_file,
                                    title=sample_name,
                                    output_base=output_base)
def write_windows_to_file(windows, output_dir, filename):
    """
    Pickle the windows object to <output_dir>/<filename>_windows.pkl using the highest pickle
    protocol, creating output_dir if necessary.

    :param windows: object to serialize
    :param output_dir: destination directory
    :param filename: filename prefix, "_windows.pkl" is appended
    :return:
    """
    FileManager.ensure_directory_exists(output_dir)

    pickle_path = os.path.join(output_dir, filename + "_windows.pkl")

    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(windows, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)
def save_model(output_directory, model):
    """
    Save the model's state_dict with torch.save to <output_directory>/model_<timestamp>,
    creating the directory if necessary.

    :param output_directory: destination directory
    :param model: object exposing state_dict()
    :return:
    """
    FileManager.ensure_directory_exists(output_directory)

    path = os.path.join(output_directory, "model_" + get_timestamp_string())
    print("SAVING MODEL:", path)

    state = model.state_dict()
    torch.save(state, path)
def process_bam(bam_path, reference_path):
    """
    Find useful summary data from a bam that can be represented as a table of identities, and a
    plot of alignments. Per-read statistics and the alignment-length-weighted total identity are
    printed, and a contig plot is written to plots/

    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :return:
    """
    print("\n" + bam_path + "\n")

    output_dir = "plots/"
    FileManager.ensure_directory_exists(output_dir)

    bam_handler = BamHandler(bam_file_path=bam_path)
    fasta_handler = FastaHandler(reference_path)

    # Only the contig named "gi" is examined
    for chromosome_name in ["gi"]:
        chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

        # Fetch every read across the full length of the contig
        reads = bam_handler.get_reads(chromosome_name=chromosome_name, start=0, stop=chromosome_length)

        read_data = parse_reads(reads=reads, fasta_handler=fasta_handler, chromosome_name=chromosome_name)

        print("chromosome_name:\t", chromosome_name)
        print("chromosome_length:\t", chromosome_length)

        for data in read_data:
            (read_id, reversal_status, ref_alignment_start, alignment_length, read_length, contig_length,
             n_initial_clipped_bases, n_total_mismatches, n_total_deletes, n_total_inserts, identity) = data

            print()
            print(read_id)

            report = [("reversed:\t", reversal_status),
                      ("alignment_start:\t", ref_alignment_start),
                      ("alignment_length:\t", alignment_length),
                      ("n_initial_clipped_bases:", n_initial_clipped_bases),
                      ("n_total_mismatches:\t", n_total_mismatches),
                      ("n_total_deletes:\t", n_total_deletes),
                      ("n_total_inserts:\t", n_total_inserts),
                      ("identity:\t", identity)]

            for field_label, field_value in report:
                print(field_label, field_value)

        # Identity averaged over all alignments, weighted by alignment length
        total_weighted_identity = sum(x[ALIGNMENT_LENGTH] * x[IDENTITY] for x in read_data)
        total_alignment_bases = sum(x[ALIGNMENT_LENGTH] for x in read_data)
        total_identity = total_weighted_identity / total_alignment_bases

        print("\nTOTAL IDENTITY:\t", total_identity)

        plot_contigs(output_dir=output_dir,
                     read_data=read_data,
                     chromosome_name=chromosome_name,
                     chromosome_length=chromosome_length,
                     total_identity=total_identity,
                     bam_path=bam_path,
                     y_min=-1,
                     y_max=4,
                     show=False)
def extract_runnie_reads_by_name(runnie_path, output_dir, output_filename_suffix, names):
    """
    Extract the reads whose ids are in names from a runnie file into
    <output_dir>/runnie_subset_<output_filename_suffix>.out

    :param runnie_path: runnie file to read from
    :param output_dir: destination directory, created if needed
    :param output_filename_suffix: inserted into the output filename
    :param names: read ids to extract (passed to the handler as id_set)
    :return: path of the file that was written
    """
    output_path = os.path.join(output_dir, "runnie_subset_" + output_filename_suffix + ".out")

    FileManager.ensure_directory_exists(output_dir)

    RunlengthHandler(runnie_path).extract_reads_by_id(id_set=names,
                                                      output_path=output_path,
                                                      print_status=True)

    return output_path
def extract_fastq_reads_by_name(fastq_path, output_dir, output_filename_suffix, names):
    """
    Extract the reads whose ids are in names from a fastq file into
    <output_dir>/sequence_subset_<output_filename_suffix>.fastq

    :param fastq_path: fastq file to read from
    :param output_dir: destination directory, created if needed
    :param output_filename_suffix: inserted into the output filename
    :param names: read ids to extract (passed to the handler as id_set)
    :return: path of the file that was written
    """
    output_path = os.path.join(output_dir, "sequence_subset_" + output_filename_suffix + ".fastq")

    FileManager.ensure_directory_exists(output_dir)

    FastqHandler(fastq_path).extract_reads_by_id(id_set=names,
                                                 output_path=output_path,
                                                 print_status=True)

    return output_path
def main(reads_file_path, true_ref_sequence_path=None, output_dir=None, n_passes=False):
    """
    Assemble reads with wtdbg2, align the reads back to the assembly, then run n_passes rounds
    of racon polishing. Each round aligns the reads to the previous round's output and writes
    into its own subdirectory ("1x", "2x", ...). If a true reference is given, the assembly
    and every polished sequence are also aligned to it.

    :param reads_file_path: reads to assemble and polish with
    :param true_ref_sequence_path: optional true reference to align each assembly against
    :param output_dir: output directory, created if needed; defaults to "./"
    :param n_passes: number of polishing rounds; the default False is treated as 0 by range()
    :return:
    """
    if output_dir is not None:
        FileManager.ensure_directory_exists(output_dir)
    else:
        output_dir = "./"

    compare_to_true_ref = true_ref_sequence_path is not None

    assembly_sequence_path = assemble_wtdbg2(output_dir=output_dir, input_file_path=reads_file_path)

    _, _ = align_minimap(output_dir=output_dir,
                         ref_sequence_path=assembly_sequence_path,
                         reads_sequence_path=reads_file_path)

    if compare_to_true_ref:
        _, _ = align_minimap(output_dir=output_dir,
                             ref_sequence_path=true_ref_sequence_path,
                             reads_sequence_path=assembly_sequence_path)

    # The first round polishes the raw assembly, later rounds polish the previous result
    ref_sequence_path = assembly_sequence_path

    for pass_number in range(1, n_passes + 1):
        suffix = str(pass_number) + "x"
        polish_output_dir = join(output_dir, suffix)
        FileManager.ensure_directory_exists(polish_output_dir)

        reads_vs_polished_ref_sam_path, _ = align_minimap(output_dir=polish_output_dir,
                                                          ref_sequence_path=ref_sequence_path,
                                                          reads_sequence_path=reads_file_path)

        repolished_ref_sequence_path = polish_racon(output_dir=polish_output_dir,
                                                    reads_file_path=reads_file_path,
                                                    reads_vs_ref_sam_path=reads_vs_polished_ref_sam_path,
                                                    ref_sequence_path=ref_sequence_path,
                                                    suffix=suffix)

        if compare_to_true_ref:
            _, _ = align_minimap(output_dir=polish_output_dir,
                                 ref_sequence_path=true_ref_sequence_path,
                                 reads_sequence_path=repolished_ref_sequence_path)

        ref_sequence_path = repolished_ref_sequence_path
def main(ref_sequence_path, reads_sequence_path, minimap_preset, output_dir=None):
    """
    Align sequences to a reference with minimap and hand the resulting bam to process_bam.

    :param ref_sequence_path: reference to align against
    :param reads_sequence_path: sequences to align
    :param minimap_preset: preset passed through to align_minimap
    :param output_dir: output directory, created if needed; defaults to "./"
    :return:
    """
    if output_dir is not None:
        FileManager.ensure_directory_exists(output_dir)
    else:
        output_dir = "./"

    alignment_paths = align_minimap(output_dir=output_dir,
                                    ref_sequence_path=ref_sequence_path,
                                    reads_sequence_path=reads_sequence_path,
                                    preset=minimap_preset)
    output_sam_file_path, output_bam_file_path = alignment_paths

    process_bam(bam_path=output_bam_file_path, reference_path=ref_sequence_path, output_dir=output_dir)
def process_bam(bam_path, reference_path, output_dir=None):
    """
    Parse the alignments on every reference contig and export the inserts, deletes and
    mismatches found on each one to csv, inside a new timestamped subdirectory of output_dir

    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :param output_dir: parent directory for the "variants_<datetime>" subdirectory; defaults to
        "variants/"
    :return:
    """
    print("\n" + bam_path)

    if output_dir is None:
        output_dir = "variants/"

    # Make a subdirectory to contain everything
    output_subdirectory = "variants_" + FileManager.get_datetime_string()
    output_dir = os.path.join(output_dir, output_subdirectory)
    FileManager.ensure_directory_exists(output_dir)

    bam_handler = BamHandler(bam_file_path=bam_path)
    fasta_handler = FastaHandler(reference_path)

    chromosome_names = sort_chromosome_names(names=fasta_handler.get_contig_names(), prefix="chr")

    print("ref contig names:", chromosome_names)

    for chromosome_name in chromosome_names:
        print("Parsing alignments for ref contig:", chromosome_name)

        chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

        # Fetch every read across the full length of the contig
        reads = bam_handler.get_reads(chromosome_name=chromosome_name, start=0, stop=chromosome_length)

        inserts, deletes, mismatches = parse_reads(reads=reads,
                                                   fasta_handler=fasta_handler,
                                                   chromosome_name=chromosome_name)

        export_variants_to_csv(output_dir=output_dir,
                               chromosome_name=chromosome_name,
                               mismatches=mismatches,
                               inserts=inserts,
                               deletes=deletes,
                               merge=True)
def main():
    """
    Split a precomputed list of read names into train/test partitions, extract the matching
    reads from the (hard-coded) runnie and fastq files into a timestamped output directory,
    then re-run the runnie/fastq name intersection on each extracted pair as a check.

    :return:
    """
    output_dir = "output/" + "read_names_" + FileManager.get_datetime_string()
    output_path = os.path.join(output_dir, "read_names.txt")
    FileManager.ensure_directory_exists(output_dir)

    # STEP 1
    # Find union of read names within runnie and fastq files
    fastq_path = "/home/ryan/data/Nanopore/ecoli/guppy/r94_ec_guppy_rad2.fastq"
    runnie_path = "/home/ryan/data/Nanopore/ecoli/runnie/out/rad2_pass_all.out"

    # name_intersection_path = find_intersection_of_runnie_and_fastq(output_path=output_path,
    #                                                                fastq_path=fastq_path,
    #                                                                runnie_path=runnie_path)

    # STEP 2
    # Split sequence names into train/test partition
    name_intersection_path = "/home/ryan/code/runlength_analysis/output/read_names_2019_3_26_11_50_guppy_runnie_intersection/read_names.txt"
    names = read_names_from_file(name_intersection_path)
    names_train, names_test = partition_names(names)

    # STEP 3
    # Extract names and write to files
    subset_paths = list()
    for partition_label, partition_names_subset in (("train", names_train), ("test", names_test)):
        runnie_subset_path = extract_runnie_reads_by_name(runnie_path=runnie_path,
                                                          output_dir=output_dir,
                                                          output_filename_suffix=partition_label,
                                                          names=partition_names_subset)

        fastq_subset_path = extract_fastq_reads_by_name(fastq_path=fastq_path,
                                                        output_dir=output_dir,
                                                        output_filename_suffix=partition_label,
                                                        names=partition_names_subset)

        subset_paths.append((fastq_subset_path, runnie_subset_path))

    # STEP 4
    # Verify
    # Both checks write to the same output_path, so the test result replaces the train result
    for fastq_subset_path, runnie_subset_path in subset_paths:
        name_intersection_path = find_intersection_of_runnie_and_fastq(output_path=output_path,
                                                                       fastq_path=fastq_subset_path,
                                                                       runnie_path=runnie_subset_path)
def save_numpy_matrix(output_dir, filename, matrix):
    """
    Write a single array to the compressed archive <output_dir>/<filename>.npz under the key "a"

    :param output_dir: destination directory, created if it does not exist
    :param filename: filename without extension
    :param matrix: array to store
    :return:
    """
    directory_missing = not os.path.exists(output_dir)
    if directory_missing:
        FileManager.ensure_directory_exists(output_dir)

    output_path = os.path.join(output_dir, filename) + ".npz"

    numpy.savez_compressed(output_path, a=matrix)
def __init__(self):
    """
    Name a fresh "output/training_<datetime>" directory after the current time, create it, and
    start the checkpoint counter at zero.
    """
    # Join every field of the current time tuple except the last one with dashes
    time_fields = [str(field) for field in datetime.datetime.now().timetuple()]
    self.datetime_string = '-'.join(time_fields[:-1])

    self.subdirectory_name = "training_" + self.datetime_string
    self.output_directory_name = "output/"
    self.directory = path.join(self.output_directory_name, self.subdirectory_name)

    self.n_checkpoints = 0

    FileManager.ensure_directory_exists(self.directory)
def process_bam(bam_path, reference_path, bac_path, output_dir=None):
    """
    Find useful summary data from a bam that can be represented as a table of
    identities/matches/mismatches/indels, grouped by the first field of each parsed read record
    and aggregated with aggregate_bac_data before being exported to csv

    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :param bac_path: fasta whose contig names are printed alongside the reference contig names
    :param output_dir: where to save stats; defaults to "stats/"
    :return:
    """
    if output_dir is None:
        output_dir = "stats/"

    FileManager.ensure_directory_exists(output_dir)

    ref_fasta_handler = FastaHandler(reference_path)
    bac_fasta_handler = FastaHandler(bac_path)

    chromosome_names = ref_fasta_handler.get_contig_names()
    bac_names = bac_fasta_handler.get_contig_names()

    print(chromosome_names)
    print(bac_names)

    data_per_bac = defaultdict(list)

    for chromosome_name in chromosome_names:
        chromosome_length = ref_fasta_handler.get_chr_sequence_length(chromosome_name)

        # Fresh handlers are opened for every contig
        ref_fasta_handler = FastaHandler(reference_file_path=reference_path)
        bam_handler = BamHandler(bam_file_path=bam_path)

        # Fetch every read across the full length of the contig
        reads = bam_handler.get_reads(chromosome_name=chromosome_name, start=0, stop=chromosome_length)

        read_data = parse_reads(reads=reads, fasta_handler=ref_fasta_handler, chromosome_name=chromosome_name)

        # Prefix each record with its contig name and file it under the record's first field
        for record in read_data:
            record_key = record[0]
            data_per_bac[record_key].append([chromosome_name] + record)

    # filtered_data = filter_supplementaries_by_largest(data_per_bac)
    filtered_data = aggregate_bac_data(data_per_bac)

    export_bac_data_to_csv(read_data=filtered_data, output_dir=output_dir, bam_path=bam_path)
def process_bam(bam_path, reference_path, output_dir=None, centromere_table_path=None, gap_table_path=None,
                segdup_table_path=None, max_threads=None):
    """
    Find useful summary data from a bam that can be represented as a table of identities, and a
    plot of alignments. One get_chromosome_data job is run per reference contig in a process
    pool; the jobs report into a shared list which is then exported as a genome summary csv

    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :param output_dir: where to save plots; defaults to "plots/"
    :param centromere_table_path: passed through to each job
    :param gap_table_path: passed through to each job
    :param segdup_table_path: passed through to each job
    :param max_threads: upper bound on pool size; defaults to (cpu count - 2), at least 1
    :return:
    """
    print("\n" + bam_path)

    if max_threads is None:
        max_threads = max(1, cpu_count() - 2)

    if output_dir is None:
        output_dir = "plots/"

    # Manager-backed list that the worker processes can append to
    process_manager = Manager()
    genome_data = process_manager.list()

    FileManager.ensure_directory_exists(output_dir)

    fasta_handler = FastaHandler(reference_path)
    chromosome_names = fasta_handler.get_contig_names()

    arguments = [[bam_path, reference_path, chromosome_name, output_dir, centromere_table_path, gap_table_path,
                  segdup_table_path, genome_data]
                 for chromosome_name in chromosome_names]

    # Never start more workers than there are jobs, but Pool rejects a size below 1, which is
    # what a reference without contigs would otherwise produce
    max_threads = max(1, min(max_threads, len(arguments)))

    print("Using %d threads..." % max_threads)

    with Pool(processes=max_threads) as pool:
        pool.starmap(get_chromosome_data, arguments)

    export_genome_summary_to_csv(bam_path=bam_path, output_dir=output_dir, genome_data=genome_data)
def process_bam(bam_path, reference_path, max_threads, output_dir=None):
    """
    Find useful summary data from a bam that can be represented as a table of
    identities/matches/mismatches/indels. One get_chromosome_stats job is run per reference
    contig in a process pool; the jobs report into a shared list which is then exported as a
    genome summary csv

    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :param max_threads: upper bound on pool size; None means (cpu count - 2), at least 1
    :param output_dir: where to save stats; defaults to "stats/"
    :return:
    """
    if output_dir is None:
        output_dir = "stats/"

    if max_threads is None:
        max_threads = max(1, cpu_count() - 2)

    # Manager-backed list that the worker processes can append to
    process_manager = Manager()
    genome_data = process_manager.list()

    FileManager.ensure_directory_exists(output_dir)

    fasta_handler = FastaHandler(reference_path)
    chromosome_names = fasta_handler.get_contig_names()

    arguments = list()
    for chromosome_name in chromosome_names:
        # Each job covers the full length of its contig
        chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)
        start = 0
        stop = chromosome_length

        arguments.append([genome_data, reference_path, chromosome_name, start, stop, output_dir, bam_path])

    if len(arguments) < max_threads:
        print("Fewer jobs than threads")

    # Never start more workers than there are jobs, but Pool rejects a size below 1, which is
    # what a reference without contigs would otherwise produce
    max_threads = max(1, min(max_threads, len(arguments)))

    print("Using %d threads..." % max_threads)

    with Pool(processes=max_threads) as pool:
        pool.starmap(get_chromosome_stats, arguments)

    print("genome_data", genome_data)

    export_genome_summary_to_csv(bam_path=bam_path, output_dir=output_dir, genome_data=genome_data)
def main(reads_file_path, genome_size=None, output_dir=None):
    """
    Assemble a set of reads with wtdbg2.

    :param reads_file_path: reads to assemble
    :param genome_size: genome size argument for the assembler; defaults to "3g" with a warning
    :param output_dir: output directory, created if needed; defaults to "./"
    :return:
    """
    if output_dir is not None:
        FileManager.ensure_directory_exists(output_dir)
    else:
        output_dir = "./"

    if genome_size is None:
        genome_size = "3g"
        print("WARNING: genome size flag not specified, defaulting to human size (3g)")

    assembly_sequence_path = assemble_wtdbg2(output_dir=output_dir,
                                             input_file_path=reads_file_path,
                                             genome_size=genome_size)
def plot_kernels_and_column_frequencies(kernel_sums, passing_indices, column_frequencies, slice_range=None,
                                        save=False, output_dir=None, filename=None):
    """
    Show three stacked images sharing an x axis: the thresholded indices, the kernel sums and
    the column frequencies. The figure is either saved as <filename>_kernels.png in output_dir
    or displayed.

    :param kernel_sums: 2d array with a single row (the reshape below rejects anything else)
    :param passing_indices: 2d array with a single row (the reshape below rejects anything else)
    :param column_frequencies: 2d array
    :param slice_range: optional (start, stop) pair of column indices to restrict all three
        arrays to
    :param save: write the figure to disk if True, otherwise display it
    :param output_dir: destination directory (only used when save is True)
    :param filename: filename prefix (only used when save is True)
    :return:
    """
    if slice_range is not None:
        first_column, last_column = slice_range[0], slice_range[1]

        kernel_sums = kernel_sums[:, first_column:last_column]
        passing_indices = passing_indices[:, first_column:last_column]
        column_frequencies = column_frequencies[:, first_column:last_column]

    # reshape returns a new array rather than modifying in place, so the result has to be kept.
    # These calls also serve as a shape check, raising if a "row" input has more than one row.
    kernel_sums = kernel_sums.reshape(1, kernel_sums.shape[1])
    passing_indices = passing_indices.reshape(1, passing_indices.shape[1])
    column_frequencies = column_frequencies.reshape(column_frequencies.shape[0], column_frequencies.shape[1])

    fig, axes = pyplot.subplots(nrows=3, sharex=True)
    fig.set_size_inches(16, 4)

    panels = [("Thresholded", passing_indices),
              ("Convolution", kernel_sums),
              ("Frequencies", column_frequencies)]

    for panel_axes, (panel_label, panel_image) in zip(axes, panels):
        panel_axes.imshow(panel_image)
        panel_axes.set_ylabel(panel_label)

        # The y axis carries no information for these images
        panel_axes.set_yticklabels([])
        panel_axes.set_yticks([])

    if save:
        FileManager.ensure_directory_exists(output_dir)
        path = os.path.join(output_dir, filename + "_kernels.png")
        pyplot.savefig(path)
    else:
        pyplot.show()

    pyplot.close()
def main(ref_sequence_path, reads_sequence_path, max_threads=None, output_dir=None, minimap_preset="map-ont", k=15):
    """
    Align sequences to a reference with minimap.

    :param ref_sequence_path: reference to align against
    :param reads_sequence_path: sequences to align
    :param max_threads: passed through to align_minimap
    :param output_dir: output directory, created if needed; defaults to "./"
    :param minimap_preset: preset passed through to align_minimap
    :param k: passed through to align_minimap
    :return:
    """
    if output_dir is not None:
        FileManager.ensure_directory_exists(output_dir)
    else:
        output_dir = "./"

    alignment_options = {
        "preset": minimap_preset,
        "max_threads": max_threads,
        "k": k,
    }

    reads_vs_ref_bam_path = align_minimap(output_dir=output_dir,
                                          ref_sequence_path=ref_sequence_path,
                                          reads_sequence_path=reads_sequence_path,
                                          **alignment_options)
def main():
    """
    Load a (hard-coded) frequency matrix csv and save it as log-normalized delimited text once
    for each of several pseudocount values.

    :return:
    """
    matrix_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_shasta_human_chr1_GM24143/frequency_matrices_genomic_2019_4_23_21_38_57_361118.csv"
    output_dir = "output/runlength_matrix_shasta_human_chr1_GM24143/pseudocounts/"
    output_filename_prefix = "probability_matrices_GM24143_chr1_shasta_pseudocount_"

    FileManager.ensure_directory_exists(output_dir)

    matrix = load_base_length_matrix_from_csv(path=matrix_path, max_runlength=50)

    # One output file per pseudocount, with the pseudocount as the filename suffix
    for pseudocount in (1, 4, 8, 16):
        save_nondirectional_frequency_matrices_as_delimited_text(
            output_dir=output_dir,
            frequency_matrices=matrix,
            chromosome_name="genomic",
            log_normalize=True,
            filename=output_filename_prefix + str(pseudocount) + ".csv",
            pseudocount=pseudocount,
            plot=False)
def write_joint_distribution_to_file(distribution, output_dir):
    """
    Write a joint distribution to <output_dir>/joint_distribution_<datetime>.tsv, one row per
    key in sorted key order.

    :param distribution: mapping whose keys are pairs of pairs, ((a, b), (c, d)); each row is
        written as a, b, c, d, value
    :param output_dir: destination directory, created if needed
    :return: path of the file that was written
    """
    FileManager.ensure_directory_exists(output_dir)

    filename = "joint_distribution" + "_" + FileManager.get_datetime_string() + ".tsv"
    path = os.path.join(output_dir, filename)

    # newline="" is what the csv module requires of files handed to csv.writer; without it
    # platforms that translate line endings emit an extra carriage return per row
    with open(path, 'w', newline="") as output_file:
        writer = csv.writer(output_file, delimiter="\t")

        for pair in sorted(distribution):
            first, second = pair[0], pair[1]
            writer.writerow([first[0], first[1], second[0], second[1], distribution[pair]])

    return path
def test_window(bam_file_path, reference_file_path, chromosome_name, window, output_dir, save_data=True,
                print_results=False):
    """
    Run the pileup generator for a single specified window, optionally printing the aligned
    segments and/or writing the read segments to <output_dir>/test_<window start>.fasta

    :param bam_file_path: bam containing the alignments
    :param reference_file_path: reference fasta the bam was aligned to
    :param chromosome_name: contig to fetch the window from
    :param window: sequence whose first two items are the pileup start and end
    :param output_dir: destination directory for the fasta (only used when save_data is True)
    :param save_data: write the aligned sequences to a fasta file
    :param print_results: print the aligned segments
    :return:
    """
    bam_handler = BamHandler(bam_file_path)
    fasta_handler = FastaHandler(reference_file_path)

    pileup_start, pileup_end = window[0], window[1]  # add random variation here ?

    ref_sequence, read_ids, sequences = get_aligned_segments(fasta_handler=fasta_handler,
                                                             bam_handler=bam_handler,
                                                             chromosome_name=chromosome_name,
                                                             pileup_start=pileup_start,
                                                             pileup_end=pileup_end)

    if print_results:
        print_segments(ref_sequence, sequences)

    if not save_data:
        return

    output_path = os.path.join(output_dir, "test_" + str(pileup_start) + ".fasta")

    directory_missing = not os.path.exists(output_dir)
    if directory_missing:
        FileManager.ensure_directory_exists(output_dir)

    FastaWriter(output_path).write_sequences(sequences)
def main():
    """
    Load runnie reads, run-length encode the (hard-coded) reference, align the reads to it in
    run-length space, and plot the pileup of bases, scale/shape parameters and modes for one
    window of the first contig.

    :return:
    """
    # ref_fasta_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_ref.fasta"
    # runlength_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_runnie.out"

    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    runlength_path = "/home/ryan/code/runlength_analysis/data/runnie_subset_test_flipflop_regional_0to10k.out"

    pileup_start = 6000
    pileup_end = 6050

    output_parent_dir = "output/"
    output_dir = "runlength_pileup_test_" + FileManager.get_datetime_string()
    output_dir = os.path.join(output_parent_dir, output_dir)
    FileManager.ensure_directory_exists(output_dir)

    # Output fasta names reuse the input basenames (final extension dropped) plus "_rle.fasta"
    ref_fasta_filename_prefix = ".".join(os.path.basename(ref_fasta_path).split(".")[:-1])
    runlength_ref_fasta_filename = ref_fasta_filename_prefix + "_rle.fasta"
    runlength_ref_fasta_path = os.path.join(output_dir, runlength_ref_fasta_filename)

    assembly_fasta_filename_prefix = ".".join(os.path.basename(runlength_path).split(".")[:-1])
    runlength_assembly_fasta_filename = assembly_fasta_filename_prefix + "_rle.fasta"
    runlength_assembly_fasta_path = os.path.join(output_dir, runlength_assembly_fasta_filename)

    handler = RunlengthHandler(runlength_path)

    reads = handler.iterate_file(sequence_cutoff=sys.maxsize, print_status=True)

    # Index every read by its id
    read_data = dict()
    for read in reads:
        read_data[read.id] = read

    print("\nRLE encoding reference sequence...")

    runlength_ref_sequences = runlength_encode_fasta(fasta_sequence_path=ref_fasta_path)

    assembly_vs_ref_bam_path = align_as_RLE(runlength_reference_path=runlength_ref_fasta_path,
                                            runlength_ref_sequences=runlength_ref_sequences,
                                            runlength_read_path=runlength_assembly_fasta_path,
                                            runlength_read_sequences=read_data,
                                            output_dir=output_dir)

    bam_handler = BamHandler(assembly_vs_ref_bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    # Only the first contig is examined
    contig_names = fasta_handler.get_contig_names()
    chromosome_name = contig_names[0]
    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    aligned_ref_sequence, aligned_ref_lengths, aligned_sequences, aligned_scales, aligned_shapes, reversal_statuses = \
        get_aligned_segments(fasta_handler=fasta_handler,
                             bam_handler=bam_handler,
                             chromosome_name=chromosome_name,
                             pileup_start=pileup_start,
                             pileup_end=pileup_end,
                             runlength_ref_sequences=runlength_ref_sequences,
                             read_data=read_data)

    sequence_encoding = list()
    scale_encoding = list()
    shape_encoding = list()
    modes_encoding = list()

    print(len(aligned_sequences.keys()))
    print("REF\t", "".join(aligned_ref_sequence))

    for read_id in aligned_sequences.keys():
        print("READ\t%s\t%s" % (read_id, "".join(aligned_sequences[read_id])))

        sequence_encoding.append(list(map(get_encoding, aligned_sequences[read_id])))
        scale_encoding.append(aligned_scales[read_id])
        shape_encoding.append(aligned_shapes[read_id])

        # One mode per position, computed from that position's (scale, shape) pair
        modes_encoding.append(list(map(map_parameters_to_mode,
                                       zip(aligned_scales[read_id], aligned_shapes[read_id]))))

    # The numpy.float alias has been removed from NumPy; it was the builtin float, so
    # dtype=float produces the same arrays on every NumPy version
    sequence_encoding = -numpy.array(sequence_encoding, dtype=float)
    scale_encoding = numpy.array(scale_encoding, dtype=float)
    shape_encoding = numpy.array(shape_encoding, dtype=float)
    modes_encoding = numpy.array(modes_encoding, dtype=float)

    plot_runlength_pileup(sequences=sequence_encoding,
                          scales=scale_encoding,
                          shapes=shape_encoding,
                          modes=modes_encoding)
def main(reference_file_path):
    """
    Count run lengths per character for every contig of a reference fasta. A histogram figure
    is saved per contig, plus one genome-wide figure split into A/T and G/C, under
    output/ref_run_lengths/<input file prefix>/, and the accumulated counts are printed and
    written out.

    :param reference_file_path: reference fasta to analyze
    :return:
    """
    # basename never contains a path separator, so only the extension needs stripping
    input_prefix_name = os.path.basename(reference_file_path).split(".")[0]
    output_dir = os.path.join("output/ref_run_lengths/", input_prefix_name)
    filename_prefix = "ref_runlength_distribution"

    FileManager.ensure_directory_exists(output_dir)

    fasta_handler = FastaHandler(reference_file_path)
    contig_names = fasta_handler.get_contig_names()

    print(contig_names)
    print(sorted([(x, fasta_handler.get_chr_sequence_length(x)) for x in contig_names], key=lambda x: x[1]))

    # Run length counts per character, accumulated over all contigs
    all_counts = defaultdict(Counter)
    raw_counts_AT = list()
    raw_counts_GC = list()

    sys.stderr.write("reading fasta file...\n")
    sys.stderr.flush()

    max_count = 100
    step = 1

    for chromosome_name in contig_names:
        # if len(contig_names) > 1:
        #     if not chromosome_name.startswith("chr") or "alt" in chromosome_name or "v" in chromosome_name:
        #         print("WARNING: SKIPPING CHROMOSOME %s" % chromosome_name)
        #         continue

        sys.stderr.write("Parsing chromosome %s\n" % chromosome_name)
        sys.stderr.flush()

        chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

        reference_sequence = fasta_handler.get_sequence(chromosome_name=chromosome_name,
                                                        start=0,
                                                        stop=chromosome_length)

        character_counts = count_runlength_per_character(reference_sequence)

        # squeeze=False always yields a 2d grid of axes, so a contig with a single character
        # (one row) can be indexed the same way as one with several
        figure, axes_grid = pyplot.subplots(nrows=len(character_counts), sharex=True, squeeze=False)
        axes = axes_grid[:, 0]
        figure.set_size_inches(6, 12)

        for k, key in enumerate(character_counts.keys()):
            counts = character_counts[key]
            all_counts[key] += Counter(counts)

            if key in {"C", "G"}:
                raw_counts_GC += counts

            if key in {"A", "T"}:
                raw_counts_AT += counts

            plot_counts_as_histogram(axes=axes[k], counts=counts, max_count=max_count, step=step)

            axes[k].set_ylabel(str(key))
            axes[k].set_ylim([-0.5, 10])

        axes[0].set_title(chromosome_name)

        filename = filename_prefix + "_" + chromosome_name + ".png"
        file_path = os.path.join(output_dir, filename)
        figure.savefig(file_path)
        # pyplot.show()
        pyplot.close()

    # Genome-wide figure: A/T runs on top, G/C runs below
    figure, axes = pyplot.subplots(nrows=2)

    filename = filename_prefix + "_genomic.png"
    file_path = os.path.join(output_dir, filename)

    plot_counts_as_histogram(axes=axes[0], counts=raw_counts_AT, max_count=max_count, step=step)
    plot_counts_as_histogram(axes=axes[1], counts=raw_counts_GC, max_count=max_count, step=step)

    axes[0].set_ylabel("AT Log10 Frequency")
    axes[1].set_ylabel("GC Log10 Frequency")

    figure.savefig(file_path)
    # pyplot.show()
    pyplot.close()

    print_all_counts_as_shasta_matrix(all_counts, max_count=50)
    print_all_counts(all_counts, output_dir)
def generate_ngx_plot(assembly_contigs, input_dir, genome_size=None, y_max=180, title="NGx", figure=None, axes=None):
    """
    Plot one NGx step curve per assembly (contig length against cumulative fraction of the
    genome covered) and save the figure as png and pdf under output/

    :param assembly_contigs: mapping of assembly path -> list of contig records; contig[LENGTH]
        is read from each record in the order given. Assemblies are plotted in sorted path order
    :param input_dir: its last path component becomes the output filename prefix
    :param genome_size: divisor for the cumulative coverage; defaults to 3.23 * 1000**3 with a
        warning
    :param y_max: currently only defaulted; the axis scaling code that used it is disabled
    :param title: axes title
    :param figure: existing figure to draw on; a new figure and axes are created only when both
        figure and axes are None, so they should be supplied together
    :param axes: existing axes to draw on (see figure)
    :return:
    """
    samples = [
        "03492", "03098", "02723", "02080", "02055", "01243", "01109", "00733", "24385", "24149", "24143", "CHM13",
        "hg38_no_alts"
    ]

    colors = [
        (175 / 256.0, 48 / 256.0, 51 / 256.0),  # red
        (224 / 256.0, 99 / 256.0, 58 / 256.0),  # orange
        (215 / 256.0, 219 / 256.0, 84 / 256.0),  # yellow
        (110 / 256.0, 170 / 256.0, 100 / 256.0),  # light green
        (80 / 256.0, 180 / 256.0, 150 / 256.0),  # green
        (100 / 256.0, 189 / 256.0, 197 / 256.0),  # green-blue
        (0 / 256.0, 170 / 256.0, 231 / 256.0),  # turquoise
        (51 / 256.0, 87 / 256.0, 182 / 256.0),  # blue
        (37 / 256.0, 36 / 256.0, 93 / 256.0),  # indigo
        (95 / 256.0, 51 / 256.0, 139 / 256.0),  # purple
        (200 / 256.0, 53 / 256.0, 93 / 256.0),  # pink
        (224 / 256.0, 99 / 256.0, 58 / 256.0),
        (110 / 256.0, 170 / 256.0, 100 / 256.0)
    ]

    alphas = [0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 1.0, 0.3, 0.3, 0.3, 1.0, 1.0]
    zorders = [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1]

    labels = {}

    # ---------------------------------------------------------------------------

    # samples = ["shasta", "wtdbg2", "canu", "flye"]
    #
    # colors = [(0.890,0.120,0.031),
    #           (0.999,0.696,0.031),
    #           (112/256, 37/256, 163/256)
    #           (0.039,0.463,0.58),
    #           (0.024,0.69,0.224)]
    #
    # zorders = [1,0,0,0]
    # alphas = [1,0.9,1,1]
    #
    # labels = {}

    # ---------------------------------------------------------------------------
    #
    # samples = ["shasta", "hifi"]
    #
    # colors = [(0.933,0.153,0.031),
    #           (112/256, 37/256, 163/256),
    #           (0.039,0.463,0.58),
    #           (0.024,0.69,0.224)]
    #
    # zorders = [1,1]
    # alphas = [1,1]
    #
    # labels = {}

    # ---------------------------------------------------------------------------

    # samples = ["assembly_GM24385",
    #            "assembly_HG00733",
    #            "scaffold_GM24385",
    #            "scaffold_HG00733"]
    #
    # labels = {}
    #
    # colors = [(51/256.0, 87/256.0, 182/256.0),      # blue
    #           (51/256.0, 87/256.0, 182/256.0),      # green-blue
    #           # (200/256.0, 200/256.0, 200/256.0),  # grey
    #           (100/256.0, 189/256.0, 197/256.0),    # orange
    #           (100/256.0, 189/256.0, 197/256.0)]    # light green
    #
    # zorders = [1,1,1,1]
    # alphas = [0.5,1,0.5,1]

    # ---------------------------------------------------------------------------

    if genome_size is None:
        print("WARNING: genome_size unspecified, using human as default")
        genome_size = 3.23 * 1000**3

    if y_max is None:
        print("WARNING: y_max unspecified, using 180Mbp as default")
        y_max = 180

    if figure is None and axes is None:
        figure = pyplot.figure()
        axes = pyplot.axes()

    # Each label is listed once, paired with the first line drawn under that label, so the
    # legend stays aligned with the curves even when several assemblies share a label
    legend_names = list()
    legend_handles = list()

    for path, contigs in sorted(assembly_contigs.items(), key=lambda x: x[0]):
        print("Plotting assembly: %s" % path)

        # Case-insensitive substring match against the known sample names; if several names
        # match, the last one in the samples list wins
        sample_matched = False
        for name in samples:
            if name.lower() in path.lower():
                sample_index = samples.index(name)
                color = colors[sample_index]
                alpha = alphas[sample_index]
                zorder = zorders[sample_index]
                sample_name = name
                sample_matched = True

        if not sample_matched:
            print("ERROR: color not found for %s" % path)
            sample_index = 0
            color = colors[sample_index]
            alpha = alphas[sample_index]
            zorder = zorders[sample_index]
            sample_name = os.path.basename(path).split(".")[0]

        if sample_name in labels:
            label = labels[sample_name]
        else:
            label = sample_name

        # Build the step curve: every contig is a horizontal segment at its own length, joined
        # to the previous contig's segment by a vertical drop
        x1 = 0
        y_prev = None

        x_coords = list()
        y_coords = list()

        for contig in contigs:
            y = contig[LENGTH]
            width = contig[LENGTH] / genome_size
            x2 = x1 + width

            if y_prev is not None:
                x_coords.extend([x1, x1])
                y_coords.extend([y_prev, y])

            x_coords.extend([x1, x2])
            y_coords.extend([y, y])

            x1 = x2
            y_prev = y

        if not y_coords:
            # An assembly without contigs has no curve to draw, and indexing the empty
            # coordinate lists below would raise IndexError
            print("WARNING: no contigs found for %s" % path)
            continue

        # Drop the curve to zero after the final contig
        if y_coords[-1] != 0:
            y_coords.append(0)
            x_coords.append(x_coords[-1])

        dashes = [1, 0, 1, 0]

        if "hifi" in path.lower():
            label = "Canu CCS"

        if "shasta" in path:
            label = "Shasta Nanopore"

        line, = axes.plot(x_coords,
                          y_coords,
                          color=color,
                          alpha=alpha,
                          zorder=zorder,
                          dashes=dashes,
                          linewidth=0.6)

        if label not in legend_names:
            legend_names.append(label)
            legend_handles.append(line)

    axes.legend(legend_handles, legend_names)

    axes.axvline(0.5, linestyle="--", alpha=0.3, linewidth=0.7, zorder=-1)

    # max_size = y_max
    #
    # step_size = 20
    # if step_size >= y_max:
    #     step_size = 1
    #
    # scale = 1_000_000
    #
    # axes.set_xlim([0,1])
    # axes.set_ylim([0,max_size*scale])
    # axes.set_yticks(numpy.arange(0,max_size+step_size,step_size)*scale)
    # axes.set_yticklabels(numpy.arange(0,max_size+step_size,step_size))

    axes.set_title(title)
    axes.set_ylabel("Contig/scaffold size (Mbp)")
    axes.set_xlabel("Cumulative coverage")

    FileManager.ensure_directory_exists("output")

    output_dir = "output/"
    filename = input_dir.rstrip("/").split("/")[-1] + "_" + FileManager.get_datetime_string()
    file_path = os.path.abspath(os.path.join(output_dir, filename))

    print("SAVING FIGURE: %s" % file_path)
    figure.savefig(file_path + ".png", dpi=300)
    figure.savefig(file_path + ".pdf", dpi=300)

    pyplot.close()
def main():
    """Evaluate run-length consensus calling on the first reference windows.

    The reference and read files are run-length encoded, aligned with
    ``align_as_RLE``, and the first contig of the encoded reference is split
    into 1000-long windows. For each of the first 10 windows a consensus is
    computed twice with ``get_consensus_from_runlength_pileup_encoding``
    (default settings, and ``bayesian=False``), and the predicted lengths
    are accumulated into two confusion matrices against the reference
    lengths. Accuracies are printed (labelled "Bayes:" and "No Bayes") and
    the log10 of each matrix is shown and saved as a PNG in a timestamped
    directory under ``output/``.

    All input paths are hard-coded below.
    """
    # Alternative (synthetic test) inputs, kept for reference:
    # ref_fasta_path = "/home/ryan/code/runlength_analysis/data/synthetic_runlength_test_2019_3_25_13_8_0_341509_ref.fasta"
    # read_fasta_path = "/home/ryan/code/runlength_analysis/data/synthetic_runlength_test_2019_3_25_13_8_0_341509_reads.fasta"
    # matrix_path = "/home/ryan/code/runnie_parser/output/runlength_matrix_from_assembly_contigs_2019_3_19_13_29_14_657613/probability_matrices_2019_3_19_13_29_19_362916.csv"

    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    read_fasta_path = "/home/ryan/code/runlength_analysis/data/sequence_subset_ecoli_guppy-runnie_60x_test.fastq"
    matrix_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_from_sequence_2019_4_5_15_29_28_403950/probability_matrices_2019_4_5_15_35_57_920301.csv"

    output_parent_dir = "output/"
    output_dir = "runlength_matrix_from_sequence_" + FileManager.get_datetime_string()
    output_dir = os.path.join(output_parent_dir, output_dir)
    FileManager.ensure_directory_exists(output_dir)

    # Output names reuse the input basename with its last extension dropped.
    ref_fasta_filename_prefix = ".".join(os.path.basename(ref_fasta_path).split(".")[:-1])
    runlength_ref_fasta_filename = ref_fasta_filename_prefix + "_rle.fasta"
    runlength_ref_fasta_path = os.path.join(output_dir, runlength_ref_fasta_filename)

    read_fasta_filename_prefix = ".".join(os.path.basename(read_fasta_path).split(".")[:-1])
    runlength_read_fasta_filename = read_fasta_filename_prefix + "_rle.fasta"
    runlength_read_fasta_path = os.path.join(output_dir, runlength_read_fasta_filename)

    runlength_ref_sequences = runlength_encode_fasta(fasta_sequence_path=ref_fasta_path)
    runlength_read_sequences = runlength_encode_fasta(fasta_sequence_path=read_fasta_path)

    read_vs_ref_bam_path = align_as_RLE(runlength_reference_path=runlength_ref_fasta_path,
                                        runlength_ref_sequences=runlength_ref_sequences,
                                        runlength_read_path=runlength_read_fasta_path,
                                        runlength_read_sequences=runlength_read_sequences,
                                        output_dir=output_dir)

    bam_handler = BamHandler(read_vs_ref_bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    # Only the first contig of the encoded reference is evaluated.
    contig_names = fasta_handler.get_contig_names()
    chromosome_name = contig_names[0]
    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    windows = chunk_chromosome_coordinates(chromosome_length=chromosome_length, chunk_size=1000)

    # Largest run length tracked by the confusion matrices.
    max_length = 10

    # Initialize empty confusion matrices
    total_confusion = get_runlength_confusion([], [], max_length)
    total_modal_confusion = get_runlength_confusion([], [], max_length)

    length_classifier = RunlengthClassifier(matrix_path)

    print("reading BAM")
    # Deliberately limited to the first 10 windows.
    for pileup_start, pileup_end in windows[:10]:
        print("window", pileup_start, pileup_end)
        sys.stderr.write("\r%s" % pileup_start)

        aligned_ref_sequence, aligned_ref_lengths, aligned_sequences, aligned_lengths, reversal_statuses = \
            get_aligned_segments(fasta_handler=fasta_handler,
                                 bam_handler=bam_handler,
                                 chromosome_name=chromosome_name,
                                 pileup_start=pileup_start,
                                 pileup_end=pileup_end,
                                 runlength_ref_sequences=runlength_ref_sequences,
                                 read_data=runlength_read_sequences)

        # No reads here?
        if len(aligned_sequences) == 0:
            continue

        sequence_encoding = list()
        length_encoding = list()
        reversal_encoding = list()

        # print("REF\t", "".join(aligned_ref_sequence))
        for read_id in aligned_sequences.keys():
            # print("READ\t","".join(aligned_sequences[read_id]))
            sequence_encoding.append(list(map(get_encoding, aligned_sequences[read_id])))
            length_encoding.append(aligned_lengths[read_id])
            reversal_encoding.append(reversal_statuses[read_id])

        ref_sequence_encoding = [list(map(get_encoding, aligned_ref_sequence))]
        ref_lengths_encoding = [aligned_ref_lengths]

        # The numpy.int / numpy.float / numpy.bool aliases were removed in
        # NumPy 1.24; the builtins are the exact types they aliased.
        ref_sequence_encoding = numpy.array(ref_sequence_encoding, dtype=int)
        ref_length_encoding = numpy.array(ref_lengths_encoding, dtype=int)
        sequence_encoding = numpy.array(sequence_encoding, dtype=int)
        length_encoding = numpy.array(length_encoding, dtype=float)
        reversal_encoding = numpy.array(reversal_encoding, dtype=bool)

        ref_sequence_encoding = numpy.atleast_2d(ref_sequence_encoding)
        ref_length_encoding = numpy.atleast_2d(ref_length_encoding)
        sequence_encoding = numpy.atleast_2d(sequence_encoding)
        length_encoding = numpy.atleast_2d(length_encoding)

        # The reference encodings above are only consumed by this optional
        # debugging plot:
        # plot_runlength_pileup(sequences=-sequence_encoding,
        #                       lengths=length_encoding,
        #                       ref_sequence=-ref_sequence_encoding,
        #                       ref_lengths=ref_length_encoding)

        consensus_sequence, consensus_lengths = \
            get_consensus_from_runlength_pileup_encoding(length_classifier=length_classifier,
                                                         sequence_encoding=sequence_encoding,
                                                         length_encoding=length_encoding,
                                                         reversal_encoding=reversal_encoding)

        modal_consensus_sequence, modal_consensus_lengths = \
            get_consensus_from_runlength_pileup_encoding(length_classifier=length_classifier,
                                                         sequence_encoding=sequence_encoding,
                                                         length_encoding=length_encoding,
                                                         reversal_encoding=reversal_encoding,
                                                         bayesian=False)

        print()
        print("PREDICTED\t", consensus_lengths[:10])
        print("TRUE\t\t", aligned_ref_lengths[:10])

        confusion = get_runlength_confusion(true_lengths=aligned_ref_lengths,
                                            predicted_lengths=consensus_lengths,
                                            max_length=max_length)
        total_confusion += confusion

        modal_confusion = get_runlength_confusion(true_lengths=aligned_ref_lengths,
                                                  predicted_lengths=modal_consensus_lengths,
                                                  max_length=max_length)
        total_modal_confusion += modal_confusion

        # except Exception as e:
        #     print(e)
        #     continue

    # Terminate the "\r" progress line written to stderr inside the loop.
    print()

    accuracy = get_accuracy_from_confusion_matrix(total_confusion)
    print("Bayes:", accuracy)

    accuracy = get_accuracy_from_confusion_matrix(total_modal_confusion)
    print("No Bayes", accuracy)

    # Show and save the log10-scaled matrices, default-settings one first.
    plots = (("confusion.png", total_confusion),
             ("modal_confusion.png", total_modal_confusion))

    for plot_filename, confusion_matrix in plots:
        plot_path = os.path.join(output_dir, plot_filename)

        figure = pyplot.figure()
        axes = pyplot.axes()
        axes.set_xlabel("Predicted")
        axes.set_ylabel("True")

        pyplot.imshow(numpy.log10(confusion_matrix))
        pyplot.show()
        figure.savefig(plot_path)
        pyplot.close()
def main():
    """Print the read segments that overlap one fixed reference window.

    The hard-coded reference and read files are run-length encoded and
    aligned with ``align_as_RLE``. ``get_read_segments`` is then called for
    a 100-long window starting at coordinate 100000 of the first contig,
    and for every returned key the first 10 entries of its sequence and of
    its lengths are printed.
    """
    def strip_last_extension(path):
        # Basename of `path` with its final dot-separated suffix removed.
        return ".".join(os.path.basename(path).split(".")[:-1])

    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    read_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/guppy/subsampled/11-29/r94_ec_rad2.30x-30kb.fasta"

    # Other read sets that have been used with this script:
    # read_fasta_path = "/home/ryan/data/Nanopore/ecoli/runnie/v2/rad2_pass_runnie_0_1_10_11_12_13_v2.fa"
    # read_fasta_path = "/home/ryan/software/shasta/output/run_2019_3_23_14_29_ecoli_wg_guppy_NO_BAYES/Assembly.fasta"
    # read_fasta_path = "/home/ryan/software/shasta/output/run_2019_3_23_15_40_ecoli_wg_guppy_BAYES/Assembly.fasta"
    # read_fasta_path = "/home/ryan/data/Nanopore/ecoli/runnie/rad2_pass_runnie_0_v2.fa"

    # ---- TEST DATA ----
    # ref_fasta_path = "/home/ryan/code/runlength_analysis/data/synthetic_runlength_test_2019_3_25_13_14_17_762846_ref.fasta"
    # read_fasta_path = "/home/ryan/code/runlength_analysis/data/synthetic_runlength_test_2019_3_25_13_14_17_762846_reads.fasta"
    # -------------------

    # Timestamped output directory under "output/".
    output_dir = os.path.join(
        "output/",
        "runlength_matrix_from_sequence_" + FileManager.get_datetime_string(),
    )
    FileManager.ensure_directory_exists(output_dir)

    runlength_ref_fasta_path = os.path.join(
        output_dir, strip_last_extension(ref_fasta_path) + "_rle.fasta")
    runlength_read_fasta_path = os.path.join(
        output_dir, strip_last_extension(read_fasta_path) + "_rle.fasta")

    sys.stderr.write("RL encoding fasta...\n")
    runlength_ref_sequences = runlength_encode_fasta(fasta_sequence_path=ref_fasta_path)
    runlength_read_sequences = runlength_encode_fasta(fasta_sequence_path=read_fasta_path)

    sys.stderr.write("Aligning RLE fasta...\n")
    read_vs_ref_bam_path = align_as_RLE(
        runlength_reference_path=runlength_ref_fasta_path,
        runlength_ref_sequences=runlength_ref_sequences,
        runlength_read_path=runlength_read_fasta_path,
        runlength_read_sequences=runlength_read_sequences,
        output_dir=output_dir,
    )

    bam_handler = BamHandler(read_vs_ref_bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    # Work on the first contig reported for the encoded reference.
    chromosome_name = fasta_handler.get_contig_names()[0]
    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)
    print(chromosome_length)

    window_start = 100000
    sequences, lengths = get_read_segments(
        fasta_handler=fasta_handler,
        bam_handler=bam_handler,
        chromosome_name=chromosome_name,
        pileup_start=window_start,
        pileup_end=window_start + 100,
        runlength_ref_sequences=runlength_ref_sequences,
        read_data=runlength_read_sequences,
    )

    for read_key in sequences:
        print(read_key)
        print(sequences[read_key][:10])
        print(lengths[read_key][:10])