def save_directional_frequency_matrices_as_delimited_text(
        output_dir, frequency_matrices, delimiter=",", log_normalize=False, plot=False):
    """
    Write one matrix per (direction, base) pair to a single delimited text file.

    The output file is named "probability_matrices_directional_<datetime>.csv" when log_normalize is set,
    otherwise "frequency_matrices_directional_<datetime>.csv". Each matrix is preceded by a header line
    ">BASE_F" or ">BASE_R" and followed by a blank line.

    :param output_dir: directory the file is written into (must already exist)
    :param frequency_matrices: array indexed as [reversal, base_index, :, :] with reversal in {0, 1} and
    base_index in range(4)
    :param delimiter: string placed between the values of a row
    :param log_normalize: if True, each matrix is passed through normalize(matrix, pseudocount=15) and values are
    written as floats; otherwise values are written as ints
    :param plot: if True, show each matrix with pyplot before writing it
    :return:
    """
    if log_normalize:
        filename = "probability_matrices_directional_" + FileManager.get_datetime_string() + ".csv"
    else:
        filename = "frequency_matrices_directional_" + FileManager.get_datetime_string() + ".csv"

    reversal_suffixes = ["F", "R"]
    output_path = os.path.join(output_dir, filename)

    # The context manager guarantees the handle is closed even if normalize/plotting raises mid-way
    with open(output_path, "w") as file:
        for reversal in [0, 1]:
            for base_index in range(4):
                base = INDEX_TO_BASE[base_index]
                suffix = reversal_suffixes[reversal]

                matrix = numpy.squeeze(frequency_matrices[reversal, base_index, :, :])

                # Callable used to convert each cell before it is stringified
                value_type = int

                if log_normalize:
                    matrix = normalize(matrix, pseudocount=15)
                    value_type = float

                if plot:
                    pyplot.imshow(matrix)
                    pyplot.show()
                    pyplot.close()

                matrix_name = "_".join([base, suffix])
                file.write(">" + matrix_name + "\n")

                for r in range(matrix.shape[0]):
                    row = [str(value_type(x)) for x in matrix[r]]
                    file.write(delimiter.join(row) + "\n")

                file.write("\n")
def main(max_threads=None):
    """
    Process a runlength file with a worker pool, then merge the per-task fasta outputs into one file.

    Workers are driven through arg_unpacker/arg_iterator. Afterwards, every ".fasta" file found in the run's output
    directory is concatenated into "<runlength file prefix>.fasta" and the individual files are deleted.

    :param max_threads: number of worker processes; defaults to cpu_count - 2 (at least 1)
    :return:
    """
    # runlength_path = "/home/ryan/data/Nanopore/ecoli/runnie/out/rad2_pass_runnie_0.out"
    runlength_path = "/home/ryan/data/Nanopore/ecoli/runnie/out/rad2_pass_runnie_0_1_10_11_12_13.out"

    run_directory_name = "runlength_matrix_from_assembly_contigs_" + FileManager.get_datetime_string()
    output_dir = os.path.join("output/version_comparison/mode/", run_directory_name)
    FileManager.ensure_directory_exists(output_dir)

    handler = RunlengthHandler(runlength_path)

    if max_threads is None:
        max_threads = max(1, multiprocessing.cpu_count()-2)

    with multiprocessing.Pool(processes=max_threads) as pool:
        results = pool.imap(arg_unpacker, arg_iterator(handler=handler, output_dir=output_dir))

        # Consume the lazy iterator, printing a running count of finished tasks
        for n_done, _ in enumerate(results):
            sys.stdout.write("\r%d" % n_done)

    print()
    print("Concatenating files...")

    # Collected before the merged file exists, so the merged file is not part of the delete list
    partial_paths = FileManager.get_all_file_paths_by_type(parent_directory_path=output_dir,
                                                           file_extension=".fasta")

    input_prefix = os.path.basename(runlength_path).split(".")[0]
    concatenated_file_path = os.path.join(output_dir, input_prefix + ".fasta")

    print("Saving to file: %s" % concatenated_file_path)

    FileManager.concatenate_files(file_paths=partial_paths, output_file_path=concatenated_file_path)
    FileManager.delete_files(partial_paths)
def run_batch_training_from_tuples():
    """
    Gather every ".pkl" file under the configured directories, train a joint model on them, and pickle the result.

    The trained distribution is saved as "distribution_<datetime>" in the hard-coded distribution output directory.

    :return:
    """
    # chr_paths = ["/home/ryan/code/nanopore_assembly/output/joint_runlength_base_model/2018_10_20_13_47_40_980920/",
    #              "/home/ryan/code/nanopore_assembly/output/joint_runlength_base_model/2018_10_20_13_47_42_138805/",
    #              "/home/ryan/code/nanopore_assembly/output/joint_runlength_base_model/2018_10_20_13_47_43_176010/",
    #              "/home/ryan/code/nanopore_assembly/output/joint_runlength_base_model/2018_10_20_13_47_44_574894/",
    #              "/home/ryan/code/nanopore_assembly/output/joint_runlength_base_model/2018_10_20_13_47_46_366545/",
    #              "/home/ryan/code/nanopore_assembly/output/joint_runlength_base_model/2018_10_20_13_47_47_822627/"]

    chr_paths = ["output/joint_runlength_base_model/2018_11_12_14_23_56_638745/"]

    trainer = JointClassifierTrainer()

    # Flatten the per-directory listings into a single list of pickle paths
    pickle_paths = list()
    for directory in chr_paths:
        pickle_paths += FileManager.get_all_file_paths_by_type(parent_directory_path=directory,
                                                               file_extension=".pkl")

    counts = trainer.get_counts_from_tuples(paths=pickle_paths)
    distribution = trainer.train_model(counts)

    distribution_output_dir = "/home/ryan/code/nanopore_assembly/output/joint_runlength_base_model/distribution/"
    distribution_filename = "distribution_" + FileManager.get_datetime_string()

    print("\nSAVING: ", os.path.join(distribution_output_dir, distribution_filename))

    FileManager.save_object_pickle(object=distribution,
                                   filename=distribution_filename,
                                   output_dir=distribution_output_dir)
def main():
    """
    Make a synthetic reference and a set of observations and save them under "data/".

    Two files sharing one datetime stamp are written:
      - synthetic_coverage_data_marginpolish_<datetime>.tsv: one tab-separated row per item in the observations
        returned by generate_sequences
      - synthetic_coverage_data_marginpolish_<datetime>_ref.fasta: the reference sequence under the header ">ref_0"

    :return:
    """
    output_dir = "data/"
    FileManager.ensure_directory_exists(output_dir)

    n_coverage = 2
    ref_max_runlength = 50
    read_max_runlength = 50

    ref_sequence, observations = generate_sequences(ref_max_runlength=ref_max_runlength,
                                                    read_max_runlength=read_max_runlength,
                                                    n_coverage=n_coverage,
                                                    scale_coverage=True)

    # Computed once so both output files carry the same timestamp
    datetime_string = FileManager.get_datetime_string()

    filename = "synthetic_coverage_data_marginpolish_" + datetime_string + ".tsv"
    output_path = os.path.join(output_dir, filename)

    # Context manager closes the handle even if a row fails to serialize
    with open(output_path, "w") as file:
        writer = csv.writer(file, delimiter="\t")
        for line in observations:
            writer.writerow(line)

    filename = "synthetic_coverage_data_marginpolish_" + datetime_string + "_ref.fasta"
    output_path = os.path.join(output_dir, filename)

    with open(output_path, "w") as file:
        file.write(">ref_0\n")
        file.write(ref_sequence)
def train_joint_model_from_tuples(tuples_path):
    """
    Load training tuples from a path, train a model on them, and pickle the resulting distribution.

    :param tuples_path: location passed to load_training_tuples (loaded with cutoff=16)
    :return:
    """
    training_tuples = load_training_tuples(tuples_path, cutoff=16)
    print("training tuples loaded: ", len(training_tuples))

    distribution = train_model(data=training_tuples)

    save_directory = "/home/ryan/code/nanopore_assembly/output/joint_runlength_base_model/distribution/"
    save_name = "distribution_" + FileManager.get_datetime_string()

    FileManager.save_object_pickle(object=distribution,
                                   filename=save_name,
                                   output_dir=save_directory)
def process_bam(bam_path, reference_path, output_dir=None):
    """
    Parse the alignments in a bam contig by contig and export the inserts, deletes and mismatches found to csv
    :param bam_path: path to a bam containing contigs aligned to a true reference
    :param reference_path: the true reference that contigs were aligned to
    :param output_dir: parent directory for results (default "variants/"); a "variants_<datetime>" subdirectory is
    created inside it
    :return:
    """
    print("\n" + bam_path)

    parent_dir = "variants/" if output_dir is None else output_dir

    # Every run gets its own timestamped subdirectory
    output_dir = os.path.join(parent_dir, "variants_" + FileManager.get_datetime_string())
    FileManager.ensure_directory_exists(output_dir)

    bam_handler = BamHandler(bam_file_path=bam_path)
    fasta_handler = FastaHandler(reference_path)

    chromosome_names = sort_chromosome_names(names=fasta_handler.get_contig_names(), prefix="chr")

    print("ref contig names:", chromosome_names)

    for chromosome_name in chromosome_names:
        print("Parsing alignments for ref contig:", chromosome_name)

        # Fetch reads across the full length of the contig
        contig_end = fasta_handler.get_chr_sequence_length(chromosome_name)
        reads = bam_handler.get_reads(chromosome_name=chromosome_name, start=0, stop=contig_end)

        inserts, deletes, mismatches = parse_reads(reads=reads,
                                                   fasta_handler=fasta_handler,
                                                   chromosome_name=chromosome_name)

        export_variants_to_csv(output_dir=output_dir,
                               chromosome_name=chromosome_name,
                               mismatches=mismatches,
                               inserts=inserts,
                               deletes=deletes,
                               merge=True)
def main():
    """
    Split a precomputed list of read names into train/test partitions, extract the matching reads from the runnie
    and fastq files, then re-run the name intersection on each extracted pair as a check.

    :return:
    """
    output_dir = "output/" + "read_names_" + FileManager.get_datetime_string()
    output_path = os.path.join(output_dir, "read_names.txt")
    FileManager.ensure_directory_exists(output_dir)

    # STEP 1
    # Find union of read names within runnie and fastq files
    fastq_path = "/home/ryan/data/Nanopore/ecoli/guppy/r94_ec_guppy_rad2.fastq"
    runnie_path = "/home/ryan/data/Nanopore/ecoli/runnie/out/rad2_pass_all.out"

    # name_intersection_path = find_intersection_of_runnie_and_fastq(output_path=output_path,
    #                                                                fastq_path=fastq_path,
    #                                                                runnie_path=runnie_path)

    # STEP 2
    # Split sequence names into train/test partition
    name_intersection_path = "/home/ryan/code/runlength_analysis/output/read_names_2019_3_26_11_50_guppy_runnie_intersection/read_names.txt"
    names = read_names_from_file(name_intersection_path)
    names_train, names_test = partition_names(names)

    # STEP 3
    # Extract names and write to files (runnie first, then fastq, for each partition)
    subset_paths = list()
    for partition_label, partition_names_subset in (("train", names_train), ("test", names_test)):
        runnie_subset_path = extract_runnie_reads_by_name(runnie_path=runnie_path,
                                                          output_dir=output_dir,
                                                          output_filename_suffix=partition_label,
                                                          names=partition_names_subset)

        fastq_subset_path = extract_fastq_reads_by_name(fastq_path=fastq_path,
                                                        output_dir=output_dir,
                                                        output_filename_suffix=partition_label,
                                                        names=partition_names_subset)

        subset_paths.append((fastq_subset_path, runnie_subset_path))

    # STEP 4
    # Verify
    # NOTE(review): both calls receive the same output_path; if the helper writes there, the test result
    # presumably replaces the train result - confirm this is intended
    for fastq_subset_path, runnie_subset_path in subset_paths:
        name_intersection_path = find_intersection_of_runnie_and_fastq(output_path=output_path,
                                                                       fastq_path=fastq_subset_path,
                                                                       runnie_path=runnie_subset_path)
def run_generate_tuples_from_pileups():
    """
    Build one argument list per pileup directory and run generate_training_data over them in a process pool.

    For each directory, the ".npz" files are wrapped in a DataLoader and paired with a ConsensusCaller, a
    GapFilterer, a timestamped output directory and a filename suffix (the last path component).

    :return:
    """
    max_threads = 6

    # NC_003279.8     Caenorhabditis elegans chromosome I
    # NC_003280.10     Caenorhabditis elegans chromosome II
    # NC_003281.10     Caenorhabditis elegans chromosome III
    # NC_003282.8     Caenorhabditis elegans chromosome IV
    # NC_003283.11    Caenorhabditis elegans chromosome V
    # NC_003284.9     Caenorhabditis elegans chromosome X
    # NC_001328.1     Caenorhabditis elegans mitochondrion, complete genome

    # data_path = ["/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003279.8",
    #              "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003280.10",
    #              "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003281.10",
    #              "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003282.8",
    #              "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003283.11",
    #              "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003284.9"]

    data_path = ["/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-11-12-14-8-24-0-316/gi"]

    batch_size = 1
    args = list()

    for directory in data_path:
        gap_filterer = GapFilterer()

        npz_paths = FileManager.get_all_file_paths_by_type(parent_directory_path=directory,
                                                           file_extension=".npz")

        data_loader = DataLoader(npz_paths, batch_size=batch_size, parse_batches=False)

        consensus_caller = ConsensusCaller(sequence_to_index=sequence_to_index,
                                           sequence_to_float=sequence_to_float)

        output_dir = "output/joint_runlength_base_model/" + FileManager.get_datetime_string()

        # Last path component identifies the source directory in the output filenames
        filename_suffix = directory.split("/")[-1]
        print(filename_suffix)

        args.append([data_loader, batch_size, consensus_caller, output_dir, filename_suffix, gap_filterer])

        # Drop the local reference (the args entry still holds the object) and collect garbage
        gap_filterer = None
        gc.collect()

    # Never start more workers than there are jobs
    n_threads = min(len(args), max_threads)

    for job_arguments in args:
        print(job_arguments)

    print(n_threads)

    with Pool(processes=n_threads) as pool:
        pool.starmap(generate_training_data, args)
def plot_joint_distribution(distribution, save=False):
    """
    Collapse a joint distribution into one (true length x observed length) matrix per base and plot each one.

    Keys of the distribution are ((observed_base, observed_repeat), (true_base, true_repeat)). Entries whose true
    base is a gap "-" are credited to the observed base, or to all four bases when the observed base is also "-".
    All other entries are credited to the true base. Repeat lengths from 0 to 50 inclusive are considered.

    :param distribution: mapping from the key described above to a probability
    :param save: if True, save the per-base matrices via save_numpy_matrices
    :return:
    """
    max_runlength = 50
    matrix_size = max_runlength + 1

    base_distributions = defaultdict(lambda: numpy.zeros([matrix_size, matrix_size]))

    print(len(distribution))

    symbols = ["A", "G", "T", "C", "-"]
    nucleotides = ["A", "G", "T", "C"]

    for true_base in symbols:
        for observed_base in symbols:
            for observed_repeat in range(matrix_size):
                for true_repeat in range(matrix_size):
                    key = ((observed_base, observed_repeat), (true_base, true_repeat))

                    if key not in distribution:
                        continue

                    probability = distribution[key]

                    # Rows index the true length, columns the observed length
                    if true_base != "-":
                        base_distributions[true_base][true_repeat, observed_repeat] += probability

                    elif observed_base != "-":
                        base_distributions[observed_base][true_repeat, observed_repeat] += probability

                    else:
                        for split_base in nucleotides:
                            base_distributions[split_base][true_repeat, observed_repeat] += probability

    # base_distributions["A"][25, 0] += 999999

    for base, base_matrix in base_distributions.items():
        axes = pyplot.axes()

        # NOTE(review): the normalized matrix is computed but the raw matrix is what gets plotted - confirm intent
        base_distribution = normalize_frequency_matrix(base_matrix, log_scale=True)

        pyplot.title(base + ":" + base + " Log probabilities")
        pyplot.imshow(numpy.log10(base_matrix))
        axes.set_xlabel("Observed length")
        axes.set_ylabel("True length")
        pyplot.show()
        pyplot.close()

    if save:
        output_dir = "/home/ryan/code/nanopore_assembly/models/parameters/"
        filename = "runlength_frequency_matrices_per_base_" + FileManager.get_datetime_string()

        print("SAVING: ", output_dir + filename)

        save_numpy_matrices(output_dir=output_dir, filename=filename, matrices=base_distributions)
def write_chromosomal_summary_data_to_csv(summary_headers, summary_data, output_dir, sample_name=None):
    """
    Write a header row followed by one csv row per summary entry to "aggregate_summary_<sample_name>.csv".

    :param summary_headers: sequence of column names, written as the first row
    :param summary_data: iterable of rows
    :param output_dir: directory the csv is written into (must already exist)
    :param sample_name: label used in the filename; a datetime string is used when omitted
    :return:
    """
    label = FileManager.get_datetime_string() if sample_name is None else sample_name

    file_path = os.path.join(output_dir, "aggregate_summary_" + label + ".csv")

    print("Saving aggregate data to: %s" % os.path.abspath(file_path))

    with open(file_path, "w") as file:
        writer = csv.writer(file)
        writer.writerow(summary_headers)
        writer.writerows(summary_data)
def write_joint_distribution_to_file(distribution, output_dir):
    """
    Dump a joint distribution to a tab-separated file, one key per row in sorted key order.

    Each row holds the four key components pair[0][0], pair[0][1], pair[1][0], pair[1][1] followed by the value
    stored for that key.

    :param distribution: mapping keyed by pairs of 2-element sequences
    :param output_dir: directory to write into (created if missing)
    :return: path of the "joint_distribution_<datetime>.tsv" file that was written
    """
    FileManager.ensure_directory_exists(output_dir)

    filename = "_".join(["joint_distribution", FileManager.get_datetime_string()]) + ".tsv"
    path = os.path.join(output_dir, filename)

    with open(path, 'w') as file:
        writer = csv.writer(file, delimiter="\t")

        for pair in sorted(distribution):
            first, second = pair[0], pair[1]
            writer.writerow([first[0], first[1], second[0], second[1], distribution[pair]])

    return path
def main():
    """
    Align runlength reads to an RLE-encoded reference and plot the pileup for one fixed window (6000-6050) of the
    first contig.

    Reads are loaded from the runlength file, the reference is runlength encoded, both are aligned with
    align_as_RLE, and the aligned segments for the window are converted to float arrays (sequence, scale, shape,
    mode) that are handed to plot_runlength_pileup.

    :return:
    """
    # ref_fasta_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_ref.fasta"
    # runlength_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_runnie.out"

    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    runlength_path = "/home/ryan/code/runlength_analysis/data/runnie_subset_test_flipflop_regional_0to10k.out"

    pileup_start = 6000
    pileup_end = 6050

    output_parent_dir = "output/"
    output_dir = "runlength_pileup_test_" + FileManager.get_datetime_string()
    output_dir = os.path.join(output_parent_dir, output_dir)
    FileManager.ensure_directory_exists(output_dir)

    # Output fasta names reuse the input basenames with the final extension replaced by "_rle.fasta"
    ref_fasta_filename_prefix = ".".join(os.path.basename(ref_fasta_path).split(".")[:-1])
    runlength_ref_fasta_filename = ref_fasta_filename_prefix + "_rle.fasta"
    runlength_ref_fasta_path = os.path.join(output_dir, runlength_ref_fasta_filename)

    assembly_fasta_filename_prefix = ".".join(os.path.basename(runlength_path).split(".")[:-1])
    runlength_assembly_fasta_filename = assembly_fasta_filename_prefix + "_rle.fasta"
    runlength_assembly_fasta_path = os.path.join(output_dir, runlength_assembly_fasta_filename)

    handler = RunlengthHandler(runlength_path)

    reads = handler.iterate_file(sequence_cutoff=sys.maxsize, print_status=True)

    read_data = dict()
    for read in reads:
        read_data[read.id] = read

    print("\nRLE encoding reference sequence...")

    runlength_ref_sequences = runlength_encode_fasta(fasta_sequence_path=ref_fasta_path)

    assembly_vs_ref_bam_path = align_as_RLE(runlength_reference_path=runlength_ref_fasta_path,
                                            runlength_ref_sequences=runlength_ref_sequences,
                                            runlength_read_path=runlength_assembly_fasta_path,
                                            runlength_read_sequences=read_data,
                                            output_dir=output_dir)

    bam_handler = BamHandler(assembly_vs_ref_bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    contig_names = fasta_handler.get_contig_names()
    chromosome_name = contig_names[0]

    # NOTE(review): the length is not used below; the lookup is kept in case it is relied on as a sanity check
    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    aligned_ref_sequence, aligned_ref_lengths, aligned_sequences, aligned_scales, aligned_shapes, reversal_statuses = \
        get_aligned_segments(fasta_handler=fasta_handler,
                             bam_handler=bam_handler,
                             chromosome_name=chromosome_name,
                             pileup_start=pileup_start,
                             pileup_end=pileup_end,
                             runlength_ref_sequences=runlength_ref_sequences,
                             read_data=read_data)

    sequence_encoding = list()
    scale_encoding = list()
    shape_encoding = list()
    modes_encoding = list()

    print(len(aligned_sequences.keys()))

    print("REF\t", "".join(aligned_ref_sequence))
    for read_id in aligned_sequences.keys():
        print("READ\t%s\t%s" % (read_id, "".join(aligned_sequences[read_id])))

        sequence_encoding.append(list(map(get_encoding, aligned_sequences[read_id])))
        scale_encoding.append(aligned_scales[read_id])
        shape_encoding.append(aligned_shapes[read_id])
        modes_encoding.append(list(map(map_parameters_to_mode,
                                       zip(aligned_scales[read_id], aligned_shapes[read_id]))))

    # The numpy.float alias was removed in NumPy 1.24; the builtin float is the same dtype on every version
    sequence_encoding = -numpy.array(sequence_encoding, dtype=float)
    scale_encoding = numpy.array(scale_encoding, dtype=float)
    shape_encoding = numpy.array(shape_encoding, dtype=float)
    modes_encoding = numpy.array(modes_encoding, dtype=float)

    plot_runlength_pileup(sequences=sequence_encoding,
                          scales=scale_encoding,
                          shapes=shape_encoding,
                          modes=modes_encoding)
def main():
    """
    Predict consensus run lengths from runnie pileups in two ways and compare both against the reference.

    Reads are aligned to an RLE-encoded reference, the first contig is split into 1000-long windows, and for
    windows[10:20] a consensus is called with a RunlengthClassifier on modal lengths and with a
    WeibullRunlengthClassifier on scale/shape parameters. Confusion matrices against the reference lengths are
    accumulated across windows, accuracies are printed, and both matrices are plotted and saved as png files in
    the run's output directory.

    :return:
    """
    # ref_fasta_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_ref.fasta"
    # runlength_path = "/home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_runnie.out"

    # ref_fasta_path = "/home/ryan/code/runlength_analysis/data/synthetic_runnie_test_2019_4_8_14_33_30_333396_ref.fasta"
    # runlength_path = "/home/ryan/code/runlength_analysis/data/synthetic_runnie_test_2019_4_8_14_33_30_333396_runnie.out"

    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    runlength_path = "/home/ryan/code/runlength_analysis/data/runnie_subset_test_flipflop_regional_0to10k.out"

    # WG ecoli 60x
    matrix_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_from_runnie_WG_train_60x_guppy_2019_4_23/probability_matrices_2019_4_23_15_9_14_837893.csv"
    raw_matrix_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_from_runnie_WG_train_60x_guppy_2019_4_23/frequency_matrices_2019_4_23_15_9_14_833128.csv"

    output_parent_dir = "output/"
    output_dir = "runlength_prediction_from_runnie_output_" + FileManager.get_datetime_string()
    output_dir = os.path.join(output_parent_dir, output_dir)
    FileManager.ensure_directory_exists(output_dir)

    # Output fasta names reuse the input basenames with the final extension replaced by "_rle.fasta"
    ref_fasta_filename_prefix = ".".join(os.path.basename(ref_fasta_path).split(".")[:-1])
    runlength_ref_fasta_filename = ref_fasta_filename_prefix + "_rle.fasta"
    runlength_ref_fasta_path = os.path.join(output_dir, runlength_ref_fasta_filename)

    assembly_fasta_filename_prefix = ".".join(os.path.basename(runlength_path).split(".")[:-1])
    runlength_assembly_fasta_filename = assembly_fasta_filename_prefix + "_rle.fasta"
    runlength_assembly_fasta_path = os.path.join(output_dir, runlength_assembly_fasta_filename)

    handler = RunlengthHandler(runlength_path)

    reads = handler.iterate_file(sequence_cutoff=sys.maxsize, print_status=True)

    read_data = dict()
    for read in reads:
        read_data[read.id] = read

    print("\nRLE encoding reference sequence...")

    runlength_ref_sequences = runlength_encode_fasta(fasta_sequence_path=ref_fasta_path)

    assembly_vs_ref_bam_path = align_as_RLE(runlength_reference_path=runlength_ref_fasta_path,
                                            runlength_ref_sequences=runlength_ref_sequences,
                                            runlength_read_path=runlength_assembly_fasta_path,
                                            runlength_read_sequences=read_data,
                                            output_dir=output_dir)

    bam_handler = BamHandler(assembly_vs_ref_bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    contig_names = fasta_handler.get_contig_names()
    chromosome_name = contig_names[0]
    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    windows = chunk_chromosome_coordinates(chromosome_length=chromosome_length, chunk_size=1000)

    # Called with empty inputs to obtain the accumulators that per-window matrices are added to
    total_confusion = get_runlength_confusion([], [], 10)
    total_confusion_weibull = get_runlength_confusion([], [], 10)

    length_classifier = RunlengthClassifier(matrix_path)
    # length_classifier_weibull = WeibullRunlengthClassifier(matrix_path)
    length_classifier_weibull = WeibullRunlengthClassifier(raw_matrix_path, normalize_matrix=True, pseudocount=0.05)

    print("reading BAM")
    for pileup_start, pileup_end in windows[10:20]:
        sys.stderr.write("\r%s" % pileup_start)
        aligned_ref_sequence, aligned_ref_lengths, aligned_sequences, aligned_scales, aligned_shapes, reversal_statuses = \
            get_aligned_segments(fasta_handler=fasta_handler,
                                 bam_handler=bam_handler,
                                 chromosome_name=chromosome_name,
                                 pileup_start=pileup_start,
                                 pileup_end=pileup_end,
                                 runlength_ref_sequences=runlength_ref_sequences,
                                 read_data=read_data)

        sequence_encoding = list()
        scale_encoding = list()
        shape_encoding = list()
        modes_encoding = list()
        reversal_encoding = list()

        # No reads here?
        if len(aligned_sequences) == 0:
            continue

        # Deliberate best-effort: a failing window is reported and skipped so the remaining windows still run
        try:
            # print("REF\t", "".join(aligned_ref_sequence))
            for read_id in aligned_sequences.keys():
                # print("READ\t%s\t%s" % (read_id,"".join(aligned_sequences[read_id])))
                sequence_encoding.append(list(map(get_encoding, aligned_sequences[read_id])))
                scale_encoding.append(aligned_scales[read_id])
                shape_encoding.append(aligned_shapes[read_id])
                modes_encoding.append(list(map(map_parameters_to_mode,
                                               zip(aligned_scales[read_id], aligned_shapes[read_id]))))
                reversal_encoding.append(reversal_statuses[read_id])

            ref_sequence_encoding = [list(map(get_encoding, aligned_ref_sequence))]
            ref_lengths_encoding = [aligned_ref_lengths]

            # numpy.int / numpy.float / numpy.bool were removed in NumPy 1.24; the builtins they aliased select
            # the same dtypes on every NumPy version
            ref_sequence_encoding = numpy.atleast_2d(numpy.array(ref_sequence_encoding, dtype=int))
            ref_length_encoding = numpy.atleast_2d(numpy.array(ref_lengths_encoding, dtype=int))
            sequence_encoding = numpy.atleast_2d(numpy.array(sequence_encoding, dtype=int))
            scale_encoding = numpy.atleast_2d(numpy.array(scale_encoding, dtype=float))
            shape_encoding = numpy.atleast_2d(numpy.array(shape_encoding, dtype=float))
            modes_encoding = numpy.atleast_2d(numpy.array(modes_encoding, dtype=int))
            reversal_encoding = numpy.array(reversal_encoding, dtype=bool)

            consensus_sequence, consensus_lengths = \
                get_consensus_from_modal_pileup_encoding(length_classifier=length_classifier,
                                                         sequence_encoding=sequence_encoding,
                                                         length_encoding=modes_encoding,
                                                         reversal_encoding=reversal_encoding)

            weibull_consensus_sequence, weibull_consensus_lengths = \
                get_consensus_from_weibull_pileup_encoding(length_classifier=length_classifier_weibull,
                                                           sequence_encoding=sequence_encoding,
                                                           scale_encoding=scale_encoding,
                                                           shape_encoding=shape_encoding,
                                                           reversal_encoding=reversal_encoding)

            plot_runlength_pileup(
                sequences=-sequence_encoding,
                scales=scale_encoding,
                shapes=shape_encoding,
                modes=modes_encoding,
                ref_sequence=-ref_sequence_encoding,
                ref_lengths=ref_length_encoding,
                predicted_sequence=-numpy.atleast_2d(numpy.array(weibull_consensus_sequence, dtype=int)),
                predicted_lengths=numpy.atleast_2d(numpy.array(weibull_consensus_lengths, dtype=int)))

            print()
            print("PREDICTED\t", weibull_consensus_lengths[:10])
            print("TRUE\t\t", aligned_ref_lengths[:10])

            confusion = get_runlength_confusion(true_lengths=aligned_ref_lengths,
                                                predicted_lengths=consensus_lengths,
                                                max_length=10)

            confusion_weibull = get_runlength_confusion(true_lengths=aligned_ref_lengths,
                                                        predicted_lengths=weibull_consensus_lengths,
                                                        max_length=10)

            total_confusion += confusion
            total_confusion_weibull += confusion_weibull

        except Exception as e:
            print(e)
            continue

    print()

    accuracy = get_accuracy_from_confusion_matrix(total_confusion)
    print("Modal: ", accuracy)

    accuracy = get_accuracy_from_confusion_matrix(total_confusion_weibull)
    print("Full: ", accuracy)

    # Same plot recipe for both matrices; only the filename and the data differ
    confusion_plots = [("confusion.png", total_confusion),
                       ("confusion_weibull.png", total_confusion_weibull)]

    for plot_filename, confusion_matrix in confusion_plots:
        plot_path = os.path.join(output_dir, plot_filename)

        figure = pyplot.figure()
        axes = pyplot.axes()
        axes.set_xlabel("Predicted")
        axes.set_ylabel("True")

        pyplot.imshow(numpy.log10(confusion_matrix))
        pyplot.show()
        figure.savefig(plot_path)
        pyplot.close()
def main():
    """
    Predict consensus run lengths from RLE read pileups with and without the Bayesian classifier step and compare
    both against the reference.

    Reference and reads are runlength encoded and aligned, the first contig is split into 1000-long windows, and
    for windows[:10] a consensus is called twice with get_consensus_from_runlength_pileup_encoding (default, and
    bayesian=False). Confusion matrices against the reference lengths are accumulated, accuracies are printed, and
    both matrices are plotted and saved as png files in the run's output directory.

    :return:
    """
    # ref_fasta_path = "/home/ryan/code/runlength_analysis/data/synthetic_runlength_test_2019_3_25_13_8_0_341509_ref.fasta"
    # read_fasta_path = "/home/ryan/code/runlength_analysis/data/synthetic_runlength_test_2019_3_25_13_8_0_341509_reads.fasta"
    # matrix_path = "/home/ryan/code/runnie_parser/output/runlength_matrix_from_assembly_contigs_2019_3_19_13_29_14_657613/probability_matrices_2019_3_19_13_29_19_362916.csv"

    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    read_fasta_path = "/home/ryan/code/runlength_analysis/data/sequence_subset_ecoli_guppy-runnie_60x_test.fastq"
    matrix_path = "/home/ryan/code/runlength_analysis/output/runlength_matrix_from_sequence_2019_4_5_15_29_28_403950/probability_matrices_2019_4_5_15_35_57_920301.csv"

    output_parent_dir = "output/"
    output_dir = "runlength_matrix_from_sequence_" + FileManager.get_datetime_string()
    output_dir = os.path.join(output_parent_dir, output_dir)
    FileManager.ensure_directory_exists(output_dir)

    # Output fasta names reuse the input basenames with the final extension replaced by "_rle.fasta"
    ref_fasta_filename_prefix = ".".join(os.path.basename(ref_fasta_path).split(".")[:-1])
    runlength_ref_fasta_filename = ref_fasta_filename_prefix + "_rle.fasta"
    runlength_ref_fasta_path = os.path.join(output_dir, runlength_ref_fasta_filename)

    read_fasta_filename_prefix = ".".join(os.path.basename(read_fasta_path).split(".")[:-1])
    runlength_read_fasta_filename = read_fasta_filename_prefix + "_rle.fasta"
    runlength_read_fasta_path = os.path.join(output_dir, runlength_read_fasta_filename)

    runlength_ref_sequences = runlength_encode_fasta(fasta_sequence_path=ref_fasta_path)
    runlength_read_sequences = runlength_encode_fasta(fasta_sequence_path=read_fasta_path)

    read_vs_ref_bam_path = align_as_RLE(runlength_reference_path=runlength_ref_fasta_path,
                                        runlength_ref_sequences=runlength_ref_sequences,
                                        runlength_read_path=runlength_read_fasta_path,
                                        runlength_read_sequences=runlength_read_sequences,
                                        output_dir=output_dir)

    bam_handler = BamHandler(read_vs_ref_bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    contig_names = fasta_handler.get_contig_names()
    chromosome_name = contig_names[0]
    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    windows = chunk_chromosome_coordinates(chromosome_length=chromosome_length, chunk_size=1000)

    # Initialize empty confusion matrices
    total_confusion = get_runlength_confusion([], [], 10)
    total_modal_confusion = get_runlength_confusion([], [], 10)

    length_classifier = RunlengthClassifier(matrix_path)

    print("reading BAM")
    for pileup_start, pileup_end in windows[:10]:
        print("window", pileup_start, pileup_end)

        sys.stderr.write("\r%s" % pileup_start)
        aligned_ref_sequence, aligned_ref_lengths, aligned_sequences, aligned_lengths, reversal_statuses = \
            get_aligned_segments(fasta_handler=fasta_handler,
                                 bam_handler=bam_handler,
                                 chromosome_name=chromosome_name,
                                 pileup_start=pileup_start,
                                 pileup_end=pileup_end,
                                 runlength_ref_sequences=runlength_ref_sequences,
                                 read_data=runlength_read_sequences)

        sequence_encoding = list()
        length_encoding = list()
        reversal_encoding = list()

        # No reads here?
        if len(aligned_sequences) == 0:
            continue

        # print("REF\t", "".join(aligned_ref_sequence))
        for read_id in aligned_sequences.keys():
            # print("READ\t","".join(aligned_sequences[read_id]))
            sequence_encoding.append(list(map(get_encoding, aligned_sequences[read_id])))
            length_encoding.append(aligned_lengths[read_id])
            reversal_encoding.append(reversal_statuses[read_id])

        ref_sequence_encoding = [list(map(get_encoding, aligned_ref_sequence))]
        ref_lengths_encoding = [aligned_ref_lengths]

        # numpy.int / numpy.float / numpy.bool were removed in NumPy 1.24; the builtins they aliased select the
        # same dtypes on every NumPy version
        ref_sequence_encoding = numpy.array(ref_sequence_encoding, dtype=int)
        ref_length_encoding = numpy.array(ref_lengths_encoding, dtype=int)
        sequence_encoding = numpy.array(sequence_encoding, dtype=int)
        length_encoding = numpy.array(length_encoding, dtype=float)
        reversal_encoding = numpy.array(reversal_encoding, dtype=bool)

        ref_sequence_encoding = numpy.atleast_2d(ref_sequence_encoding)
        ref_length_encoding = numpy.atleast_2d(ref_length_encoding)
        sequence_encoding = numpy.atleast_2d(sequence_encoding)
        length_encoding = numpy.atleast_2d(length_encoding)

        # plot_runlength_pileup(sequences=-sequence_encoding,
        #                       lengths=length_encoding,
        #                       ref_sequence=-ref_sequence_encoding,
        #                       ref_lengths=ref_length_encoding)

        consensus_sequence, consensus_lengths = \
            get_consensus_from_runlength_pileup_encoding(length_classifier=length_classifier,
                                                         sequence_encoding=sequence_encoding,
                                                         length_encoding=length_encoding,
                                                         reversal_encoding=reversal_encoding)

        modal_consensus_sequence, modal_consensus_lengths = \
            get_consensus_from_runlength_pileup_encoding(length_classifier=length_classifier,
                                                         sequence_encoding=sequence_encoding,
                                                         length_encoding=length_encoding,
                                                         reversal_encoding=reversal_encoding,
                                                         bayesian=False)

        print()
        print("PREDICTED\t", consensus_lengths[:10])
        print("TRUE\t\t", aligned_ref_lengths[:10])

        confusion = get_runlength_confusion(true_lengths=aligned_ref_lengths,
                                            predicted_lengths=consensus_lengths,
                                            max_length=10)

        total_confusion += confusion

        modal_confusion = get_runlength_confusion(true_lengths=aligned_ref_lengths,
                                                  predicted_lengths=modal_consensus_lengths,
                                                  max_length=10)

        total_modal_confusion += modal_confusion

        # except Exception as e:
        #     print(e)
        #     continue

    print()

    accuracy = get_accuracy_from_confusion_matrix(total_confusion)
    print("Bayes:", accuracy)

    accuracy = get_accuracy_from_confusion_matrix(total_modal_confusion)
    print("No Bayes", accuracy)

    # Same plot recipe for both matrices; only the filename and the data differ
    confusion_plots = [("confusion.png", total_confusion),
                       ("modal_confusion.png", total_modal_confusion)]

    for plot_filename, confusion_matrix in confusion_plots:
        plot_path = os.path.join(output_dir, plot_filename)

        figure = pyplot.figure()
        axes = pyplot.axes()
        axes.set_xlabel("Predicted")
        axes.set_ylabel("True")

        pyplot.imshow(numpy.log10(confusion_matrix))
        pyplot.show()
        figure.savefig(plot_path)
        pyplot.close()
def generate_ngx_plot(assembly_contigs, input_dir, genome_size=None, y_max=180, title="NGx", figure=None, axes=None):
    """
    Draw one NGx step curve per assembly and save the figure as png and pdf under "output/".

    For every assembly (visited in sorted path order) each contig contributes a horizontal step at height
    contig[LENGTH] whose width is contig[LENGTH] / genome_size; the curve is closed by dropping to 0 after the
    last contig. Color, alpha and zorder are picked by matching the sample names below against the path
    (case-insensitive substring; the last matching sample wins). Unmatched paths fall back to the first style.

    :param assembly_contigs: mapping from assembly path to a sequence of contig records indexable by LENGTH
    :param input_dir: its last path component is used as the prefix of the saved filename
    :param genome_size: denominator for cumulative coverage; defaults to 3.23 * 1000**3 with a warning
    :param y_max: only replaced by 180 (with a warning) when None; the axis scaling that consumed it is disabled
    :param title: axes title
    :param figure: existing figure to draw into and save; created when omitted
    :param axes: existing axes to draw on; created when omitted
    :return:
    """
    samples = ["03492", "03098", "02723", "02080", "02055", "01243", "01109",
               "00733", "24385", "24149", "24143", "CHM13", "hg38_no_alts"]

    colors = [(175 / 256.0, 48 / 256.0, 51 / 256.0),    # red
              (224 / 256.0, 99 / 256.0, 58 / 256.0),    # orange
              (215 / 256.0, 219 / 256.0, 84 / 256.0),   # yellow
              (110 / 256.0, 170 / 256.0, 100 / 256.0),  # light green
              (80 / 256.0, 180 / 256.0, 150 / 256.0),   # green
              (100 / 256.0, 189 / 256.0, 197 / 256.0),  # green-blue
              (0 / 256.0, 170 / 256.0, 231 / 256.0),    # turquoise
              (51 / 256.0, 87 / 256.0, 182 / 256.0),    # blue
              (37 / 256.0, 36 / 256.0, 93 / 256.0),     # indigo
              (95 / 256.0, 51 / 256.0, 139 / 256.0),    # purple
              (200 / 256.0, 53 / 256.0, 93 / 256.0),    # pink
              (224 / 256.0, 99 / 256.0, 58 / 256.0),
              (110 / 256.0, 170 / 256.0, 100 / 256.0)]

    alphas = [0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 1.0, 0.3, 0.3, 0.3, 1.0, 1.0]
    zorders = [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1]

    # Optional display-name overrides keyed by sample name
    labels = {}

    # Alternative sample sets were previously selected by swapping the four lists above, e.g.
    # ["shasta", "wtdbg2", "canu", "flye"], ["shasta", "hifi"], or
    # ["assembly_GM24385", "assembly_HG00733", "scaffold_GM24385", "scaffold_HG00733"]

    if genome_size is None:
        print("WARNING: genome_size unspecified, using human as default")
        genome_size = 3.23 * 1000**3

    if y_max is None:
        print("WARNING: y_max unspecified, using 180Mbp as default")
        y_max = 180

    # Accept any combination of figure/axes: previously supplying only one of them crashed later on None
    if figure is None and axes is None:
        figure = pyplot.figure()
        axes = pyplot.axes()
    elif axes is None:
        axes = figure.gca()
    elif figure is None:
        figure = axes.figure

    legend_names = list()

    for path, contigs in sorted(assembly_contigs.items(), key=lambda x: x[0]):
        print("Plotting assembly: %s" % path)

        sample_matched = False
        for name in samples:
            if name.lower() in path.lower():
                sample_index = samples.index(name)
                color = colors[sample_index]
                alpha = alphas[sample_index]
                zorder = zorders[sample_index]
                sample_name = name
                sample_matched = True

        if not sample_matched:
            print("ERROR: color not found for %s" % path)
            sample_index = 0
            color = colors[sample_index]
            alpha = alphas[sample_index]
            zorder = zorders[sample_index]
            sample_name = os.path.basename(path).split(".")[0]

        label = labels.get(sample_name, sample_name)

        x1 = 0
        y_prev = None

        x_coords = list()
        y_coords = list()

        for contig in contigs:
            y = contig[LENGTH]
            width = contig[LENGTH] / genome_size
            x2 = x1 + width

            # Vertical segment joining the previous step to this one
            if y_prev is not None:
                x_coords.extend([x1, x1])
                y_coords.extend([y_prev, y])

            # Horizontal segment for this contig
            x_coords.extend([x1, x2])
            y_coords.extend([y, y])

            x1 = x2
            y_prev = y

        # Close the curve at 0; an assembly with no contigs has no coordinates and is plotted as an empty line
        if y_coords and y_coords[-1] != 0:
            y_coords.append(0)
            x_coords.append(x_coords[-1])

        dashes = [1, 0, 1, 0]

        if "hifi" in path.lower():
            label = "Canu CCS"

        if "shasta" in path:
            label = "Shasta Nanopore"

        # NOTE(review): labels are de-duplicated but every assembly adds a line, so legend entries presumably
        # drift out of step with the lines when two assemblies share a label - confirm against matplotlib
        if label not in legend_names:
            legend_names.append(label)

        axes.plot(x_coords, y_coords, color=color, alpha=alpha, zorder=zorder, dashes=dashes, linewidth=0.6)

    axes.legend(legend_names)

    axes.axvline(0.5, linestyle="--", alpha=0.3, linewidth=0.7, zorder=-1)

    # max_size = y_max
    #
    # step_size = 20
    # if step_size >= y_max:
    #     step_size = 1
    #
    # scale = 1_000_000
    #
    # axes.set_xlim([0,1])
    # axes.set_ylim([0,max_size*scale])
    # axes.set_yticks(numpy.arange(0,max_size+step_size,step_size)*scale)
    # axes.set_yticklabels(numpy.arange(0,max_size+step_size,step_size))

    axes.set_title(title)
    axes.set_ylabel("Contig/scaffold size (Mbp)")
    axes.set_xlabel("Cumulative coverage")

    FileManager.ensure_directory_exists("output")

    output_dir = "output/"
    filename = input_dir.rstrip("/").split("/")[-1] + "_" + FileManager.get_datetime_string()
    file_path = os.path.abspath(os.path.join(output_dir, filename))

    print("SAVING FIGURE: %s" % file_path)
    figure.savefig(file_path + ".png", dpi=300)
    figure.savefig(file_path + ".pdf", dpi=300)

    pyplot.close()
def save_directional_frequency_matrices_as_delimited_text(
        output_dir, frequency_matrices, chromosome_name=None, delimiter=",",
        log_normalize=False, plot=False, pseudocount=1e-12, diagonal_bias=0,
        default_type=int):
    """
    Write the directional (strand x base) matrices to a single delimited text file.

    The file is created inside `output_dir` and named
    "probability_matrices_directional_..." when `log_normalize` is set, otherwise
    "frequency_matrices_directional_...", followed by the optional chromosome name,
    a timestamp from FileManager.get_datetime_string() and ".csv".

    Each matrix is written as a ">BASE_SUFFIX likelihood" header line, one
    delimited line per matrix row, and a blank separator line.

    :param output_dir: directory the file is written into (must already exist)
    :param frequency_matrices: array indexed as [reversal, base_index, :, :] where
        reversal 0/1 is labelled "F"/"R" and base_index is mapped through INDEX_TO_BASE
    :param chromosome_name: optional name inserted into the filename
    :param delimiter: string placed between the values of a row
    :param log_normalize: if True, each matrix is passed through normalize() and
        values are written as floats
    :param plot: if True, each matrix is shown with pyplot.imshow before writing
    :param pseudocount: forwarded to normalize()
    :param diagonal_bias: forwarded to normalize()
    :param default_type: callable applied to every value when not normalizing
    :return: None
    """
    name_suffix = "" if chromosome_name is None else chromosome_name + "_"

    if log_normalize:
        filename = "probability_matrices_directional_" + name_suffix + FileManager.get_datetime_string() + ".csv"
    else:
        filename = "frequency_matrices_directional_" + name_suffix + FileManager.get_datetime_string() + ".csv"

    reversal_suffixes = ["F", "R"]
    output_path = os.path.join(output_dir, filename)

    # The context manager guarantees the handle is closed even if normalization,
    # plotting or a value cast raises part-way through the file.
    with open(output_path, "w") as file:
        print("SAVING: %s" % output_path)

        for reversal, suffix in enumerate(reversal_suffixes):
            for base_index in range(4):
                base = INDEX_TO_BASE[base_index]
                matrix = numpy.squeeze(frequency_matrices[reversal, base_index, :, :])

                # `cast` converts each cell before it is stringified
                cast = default_type

                if log_normalize:
                    matrix = normalize(matrix, pseudocount=pseudocount, diagonal_bias=diagonal_bias)
                    cast = float

                if plot:
                    pyplot.imshow(matrix)
                    pyplot.show()
                    pyplot.close()

                # NOTE: the " likelihood" tag is written whether or not log_normalize
                # is set; kept as-is so existing readers of this format are unaffected.
                matrix_name = "_".join([base, suffix])
                file.write(">" + matrix_name + " likelihood\n")

                for matrix_row in matrix:
                    file.write(delimiter.join(str(cast(x)) for x in matrix_row) + "\n")

                file.write("\n")
def save_nondirectional_frequency_matrices_as_delimited_text(
        output_dir, frequency_matrices, chromosome_name=None, delimiter=",",
        log_normalize=False, pseudocount=1e-12, diagonal_bias=0, plot=False,
        default_type=int, filename=None):
    """
    Write the four per-base matrices to a single delimited text file.

    When `filename` is not given, the name is "probability_matrices_..." if
    `log_normalize` is set, otherwise "frequency_matrices_...", followed by the
    optional chromosome name, a timestamp from FileManager.get_datetime_string()
    and ".csv".

    Each matrix is written as a ">BASE" header line (">BASE likelihood" when
    normalizing), one delimited line per matrix row, and a blank separator line.

    :param output_dir: directory the file is written into (must already exist)
    :param frequency_matrices: array indexed as [base_index, :, :], with base_index
        mapped through INDEX_TO_BASE
    :param chromosome_name: optional name inserted into the generated filename
    :param delimiter: string placed between the values of a row
    :param log_normalize: if True, each matrix is passed through normalize() and
        values are written as floats
    :param pseudocount: forwarded to normalize()
    :param diagonal_bias: forwarded to normalize()
    :param plot: if True, each matrix is shown with pyplot.imshow before writing
    :param default_type: callable applied to every value when not normalizing
    :param filename: explicit output filename; used verbatim when provided
    :return: None
    """
    if filename is None:
        name_suffix = "" if chromosome_name is None else chromosome_name + "_"

        if log_normalize:
            filename = "probability_matrices_" + name_suffix + FileManager.get_datetime_string() + ".csv"
        else:
            filename = "frequency_matrices_" + name_suffix + FileManager.get_datetime_string() + ".csv"

    output_path = os.path.join(output_dir, filename)

    # The context manager guarantees the handle is closed even if normalization,
    # plotting or a value cast raises part-way through the file.
    with open(output_path, "w") as file:
        for base_index in range(4):
            base = INDEX_TO_BASE[base_index]
            matrix = numpy.squeeze(frequency_matrices[base_index, :, :])

            # `cast` converts each cell before it is stringified
            cast = default_type

            if log_normalize:
                matrix = normalize(matrix, pseudocount=pseudocount, diagonal_bias=diagonal_bias)
                cast = float

            if plot:
                pyplot.imshow(matrix)
                pyplot.show()
                pyplot.close()

            matrix_name = base
            if log_normalize:
                matrix_name += " likelihood"

            file.write(">" + matrix_name + "\n")

            for matrix_row in matrix:
                file.write(delimiter.join(str(cast(x)) for x in matrix_row) + "\n")

            file.write("\n")
def main():
    """
    Run-length encode a reference and a read FASTA, align them in RLE space, and
    print the first ten entries of the sequence and length data that
    get_read_segments() returns for positions 100000-100100 of the first contig.
    """
    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    read_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/guppy/subsampled/11-29/r94_ec_rad2.30x-30kb.fasta"

    # Alternative read inputs used previously:
    #   /home/ryan/data/Nanopore/ecoli/runnie/v2/rad2_pass_runnie_0_1_10_11_12_13_v2.fa
    #   /home/ryan/software/shasta/output/run_2019_3_23_14_29_ecoli_wg_guppy_NO_BAYES/Assembly.fasta
    #   /home/ryan/software/shasta/output/run_2019_3_23_15_40_ecoli_wg_guppy_BAYES/Assembly.fasta
    #   /home/ryan/data/Nanopore/ecoli/runnie/rad2_pass_runnie_0_v2.fa
    #
    # Synthetic test data (ref / reads):
    #   /home/ryan/code/runlength_analysis/data/synthetic_runlength_test_2019_3_25_13_14_17_762846_ref.fasta
    #   /home/ryan/code/runlength_analysis/data/synthetic_runlength_test_2019_3_25_13_14_17_762846_reads.fasta

    def rle_fasta_path(directory, fasta_path):
        # <basename without its last extension>_rle.fasta, placed in `directory`
        prefix = ".".join(os.path.basename(fasta_path).split(".")[:-1])
        return os.path.join(directory, prefix + "_rle.fasta")

    output_dir = os.path.join("output/", "runlength_matrix_from_sequence_" + FileManager.get_datetime_string())
    FileManager.ensure_directory_exists(output_dir)

    runlength_ref_fasta_path = rle_fasta_path(output_dir, ref_fasta_path)
    runlength_read_fasta_path = rle_fasta_path(output_dir, read_fasta_path)

    sys.stderr.write("RL encoding fasta...\n")
    runlength_ref_sequences = runlength_encode_fasta(fasta_sequence_path=ref_fasta_path)
    runlength_read_sequences = runlength_encode_fasta(fasta_sequence_path=read_fasta_path)

    sys.stderr.write("Aligning RLE fasta...\n")
    read_vs_ref_bam_path = align_as_RLE(runlength_reference_path=runlength_ref_fasta_path,
                                        runlength_ref_sequences=runlength_ref_sequences,
                                        runlength_read_path=runlength_read_fasta_path,
                                        runlength_read_sequences=runlength_read_sequences,
                                        output_dir=output_dir)

    bam_handler = BamHandler(read_vs_ref_bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    # Only the first contig of the RLE reference is inspected
    chromosome_name = fasta_handler.get_contig_names()[0]
    print(fasta_handler.get_chr_sequence_length(chromosome_name))

    window_start = 100000
    sequences, lengths = get_read_segments(fasta_handler=fasta_handler,
                                           bam_handler=bam_handler,
                                           chromosome_name=chromosome_name,
                                           pileup_start=window_start,
                                           pileup_end=window_start + 100,
                                           runlength_ref_sequences=runlength_ref_sequences,
                                           read_data=runlength_read_sequences)

    for key in sequences:
        print(key)
        print(sequences[key][:10])
        print(lengths[key][:10])
def run_base_frequency_matrix_generation_from_tuples(filter_mismatch=False):
    """
    Accumulate per-base (true runlength x observed runlength) count matrices from
    pickled (observed, true) tuple files, then print, save and plot them.

    :param filter_mismatch: accepted but not used by this function
    :return: None
    """
    max_runlength = 50
    bases = ["A", "G", "T", "C"]

    # Earlier runs read several directories under
    # /home/ryan/code/nanopore_assembly/output/joint_runlength_base_model/
    # (the 2018_10_15_21_52_* and 2018_10_20_13_47_* batches).
    directories = ["output/joint_runlength_base_model/2018_11_12_14_23_56_638745/"]

    all_paths = list()
    for directory in directories:
        all_paths.extend(FileManager.get_all_file_paths_by_type(parent_directory_path=directory,
                                                                file_extension=".pkl"))

    print(len(all_paths))

    # Side length is max_runlength+1 so that 0 is a valid runlength index
    size = max_runlength + 1
    frequency_matrices = {base: numpy.zeros([size, size]) for base in bases}

    print("loaded paths: ", len(all_paths))

    cutoff = sys.maxsize

    for p, path in enumerate(all_paths):
        # NOTE: pickle.load can execute arbitrary code; only use on trusted files
        with open(path, 'rb') as pickle_file:
            print(p)
            records = pickle.load(pickle_file)

            for record in records:
                observed_base, observed_length = record[0]
                true_base, true_length = record[1]

                # Lengths beyond the matrix bounds are clamped into the last bin
                observed_length = min(observed_length, max_runlength)
                true_length = min(true_length, max_runlength)

                if true_base == "-":
                    if observed_base == "-":
                        # both gapped: count towards every base
                        for split_base in bases:
                            frequency_matrices[split_base][true_length, observed_length] += 1
                        continue

                    # true side gapped: attribute the count to the observed base
                    true_base = observed_base

                # [y,x] convention: rows are true length, columns are observed length
                frequency_matrices[true_base][true_length, observed_length] += 1

        if p == cutoff:
            break

    for base in bases:
        print(base)
        print(frequency_matrices[base])

    output_dir = "/home/ryan/code/nanopore_assembly/models/parameters/"
    filename = "runlength_frequency_matrices_per_base_" + FileManager.get_datetime_string()

    print("SAVING: ", output_dir+filename)

    save_numpy_matrices(output_dir=output_dir, filename=filename, matrices=frequency_matrices)

    plot_frequency_matrices(frequency_matrices)
def main(output_dir="data/"):
    """
    Generate a synthetic reference FASTA and a matching synthetic runnie-style
    output file, both written into `output_dir`.

    The reference is built from `n_repeats` rounds; in each round every base
    contributes one run of each length ref_max_runlength..1, with the base order
    shuffled per step so that a step never starts with the base that ended the
    previous one. For every read, each reference run is written as a
    tab-separated line of base, hex(shape), hex(scale), where (scale, shape) is
    drawn at random from the parameters read_weibull_params() provides for that
    run length.

    :param output_dir: directory for both output files; created if missing
    :return: None
    """
    filename_prefix = "synthetic_runnie_test_" + FileManager.get_datetime_string()
    runlength_reference_path = os.path.join(output_dir, filename_prefix + "_ref.fasta")
    runnie_output_path = os.path.join(output_dir, filename_prefix + "_runnie.out")

    # Fail-safe: without this, a missing directory only surfaces after all the
    # data has been generated, when the first file is opened for writing.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    modal_parameters = read_weibull_params()

    n_repeats = 30
    coverage = 12
    ref_max_runlength = 8
    base_pool = ["A", "T", "G", "C"]
    ref_sequence_name = "synthetic_ref_0"

    ref_sequence = list()
    ref_lengths = list()
    ref_bases = list()
    read_output_lines = list()

    for _ in range(n_repeats):
        # Each base gets one run of every length; pop() consumes the longest first
        ref_runlengths = {b: list(range(1, ref_max_runlength + 1)) for b in base_pool}

        for _ in range(ref_max_runlength):
            bases = copy(base_pool)
            random.shuffle(bases)

            # Re-shuffle until the first base differs from the previous run's base,
            # otherwise two adjacent runs would merge into one
            while ref_bases and bases[0] == ref_bases[-1]:
                random.shuffle(bases)

            for base in bases:
                length = ref_runlengths[base].pop()

                ref_sequence.extend([base] * length)
                ref_lengths.append(length)
                ref_bases.append(base)

    ref_sequence = "".join(ref_sequence)

    for c in range(coverage):
        read_output_lines.append("# synthetic_read_%d" % c)

        for base, runlength in zip(ref_bases, ref_lengths):
            scale, shape = random.choice(modal_parameters[runlength])

            # Column order is base, shape, scale
            line = "\t".join([str(base), shape.hex(), scale.hex()])

            read_output_lines.append(line)
            print(line)

    print(ref_sequence)

    print("saving file:", runlength_reference_path)
    with open(runlength_reference_path, "w") as file:
        file.write(">" + ref_sequence_name + "\n")
        file.write(ref_sequence + "\n")

    print("saving file:", runnie_output_path)
    with open(runnie_output_path, "w") as file:
        for line in read_output_lines:
            file.write(line + "\n")
def main():
    """
    Convert the first reads of a runnie output file to RLE sequences, align them
    to the run-length encoded reference, build run-length frequency matrices from
    the alignment, and save and plot every resulting matrix.
    """
    # Synthetic test inputs used previously:
    #   /home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_ref.fasta
    #   /home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_runnie.out
    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    runlength_path = "/home/ryan/code/runlength_analysis/output/guppy_vs_runnie_ecoli_rad2_train_test_sequences/runnie_subset_train_60x_10kb.out"

    def rle_fasta_path(directory, source_path):
        # <basename without its last extension>_rle.fasta, placed in `directory`
        prefix = ".".join(os.path.basename(source_path).split(".")[:-1])
        return os.path.join(directory, prefix + "_rle.fasta")

    output_dir = os.path.join("output/", "runlength_matrix_from_assembly_contigs_" + FileManager.get_datetime_string())
    FileManager.ensure_directory_exists(output_dir)

    runlength_ref_fasta_path = rle_fasta_path(output_dir, ref_fasta_path)
    runlength_assembly_fasta_path = rle_fasta_path(output_dir, runlength_path)

    handler = RunlengthHandler(runlength_path)

    # read id -> [sequence, lengths]
    read_runlength_sequences = dict()
    for read in handler.iterate_file(sequence_cutoff=12):
        sequence, lengths = RunlengthHandler.convert_runnie_data_to_rle_sequence(read.data)
        read_runlength_sequences[read.id] = [sequence, lengths]

    runlength_ref_sequences = runlength_encode_fasta(fasta_sequence_path=ref_fasta_path)

    assembly_vs_ref_bam_path = align_as_RLE(runlength_reference_path=runlength_ref_fasta_path,
                                            runlength_ref_sequences=runlength_ref_sequences,
                                            runlength_assembly_path=runlength_assembly_fasta_path,
                                            runlength_assembly_sequences=read_runlength_sequences,
                                            output_dir=output_dir)

    chromosomal_matrices = generate_runlength_frequency_matrix(
        runlength_ref_sequence_path=runlength_ref_fasta_path,
        assembly_vs_ref_bam_path=assembly_vs_ref_bam_path,
        runlength_ref_sequences=runlength_ref_sequences,
        runlength_assembly_sequences=read_runlength_sequences)

    for matrix in chromosomal_matrices:
        # Raw counts first, then the normalized version, for each matrix layout
        for log_normalize in (False, True):
            save_directional_frequency_matrices_as_delimited_text(output_dir=output_dir,
                                                                  frequency_matrices=matrix,
                                                                  log_normalize=log_normalize,
                                                                  plot=False)

        nondirectional_matrix = sum_complementary_matrices(matrix)

        for log_normalize in (False, True):
            save_nondirectional_frequency_matrices_as_delimited_text(output_dir=output_dir,
                                                                     frequency_matrices=nondirectional_matrix,
                                                                     log_normalize=log_normalize,
                                                                     plot=False)

        plot_directional_residuals(matrix)

        for normalize_matrices in (False, True):
            plot_base_matrices(matrix, test_spot=False, normalize_matrices=normalize_matrices)
def main():
    """
    Load every read from a runnie output file, align the reads to the run-length
    encoded reference, and print the first ten entries of the sequence, scale and
    shape data that get_read_segments() returns for positions 100000-100100 of
    the first contig.
    """
    # Synthetic test inputs used previously:
    #   /home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_ref.fasta
    #   /home/ryan/code/runnie_parser/data/synthetic_runnie_test_2019_3_18_11_56_2_830712_runnie.out
    ref_fasta_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"
    runlength_path = "/home/ryan/data/Nanopore/ecoli/runnie/out/test/rad2_pass_runnie_4_5_6_7.out"

    def rle_fasta_path(directory, source_path):
        # <basename without its last extension>_rle.fasta, placed in `directory`
        prefix = ".".join(os.path.basename(source_path).split(".")[:-1])
        return os.path.join(directory, prefix + "_rle.fasta")

    output_dir = os.path.join("output/", "runlength_matrix_from_runnie_output_" + FileManager.get_datetime_string())
    FileManager.ensure_directory_exists(output_dir)

    runlength_ref_fasta_path = rle_fasta_path(output_dir, ref_fasta_path)
    runlength_read_fasta_path = rle_fasta_path(output_dir, runlength_path)

    handler = RunlengthHandler(runlength_path)
    reads = handler.iterate_file(sequence_cutoff=sys.maxsize, print_status=True)

    # read id -> read object as yielded by the handler
    read_data = {read.id: read for read in reads}

    print("\nRLE encoding reference sequence...")
    runlength_ref_sequences = runlength_encode_fasta(fasta_sequence_path=ref_fasta_path)

    assembly_vs_ref_bam_path = align_as_RLE(runlength_reference_path=runlength_ref_fasta_path,
                                            runlength_ref_sequences=runlength_ref_sequences,
                                            runlength_read_path=runlength_read_fasta_path,
                                            runlength_read_sequences=read_data,
                                            output_dir=output_dir)

    bam_handler = BamHandler(assembly_vs_ref_bam_path)
    fasta_handler = FastaHandler(runlength_ref_fasta_path)

    # Only the first contig of the RLE reference is inspected
    chromosome_name = fasta_handler.get_contig_names()[0]
    chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)

    window_start = 100000
    sequences, scales, shapes = get_read_segments(fasta_handler=fasta_handler,
                                                  bam_handler=bam_handler,
                                                  chromosome_name=chromosome_name,
                                                  pileup_start=window_start,
                                                  pileup_end=window_start + 100,
                                                  runlength_ref_sequences=runlength_ref_sequences,
                                                  read_data=read_data)

    for key in sequences:
        print(key)
        print(sequences[key][:10])
        print(scales[key][:10])
        print(shapes[key][:10])
def main(output_dir="data/"):
    """
    Generate a synthetic reference FASTA and a matching synthetic reads FASTA,
    both written into `output_dir`.

    The reference is built from `n_repeats` rounds; in each round every base
    contributes one run of each length ref_max_runlength..1, with the base order
    shuffled per step so that a step never starts with the base that ended the
    previous one. Every read copies the reference runs but lengthens each run by
    a fixed per-base offset (A:+0, T:+1, G:+2, C:+3). When `reverse_complement`
    is enabled, each read is followed by the result of
    complement_sequence(..., reverse=True) applied to it.

    :param output_dir: directory for both output files; created if missing
    :return: None
    """
    filename_prefix = "synthetic_runlength_test_" + FileManager.get_datetime_string()
    runlength_reference_path = os.path.join(output_dir, filename_prefix + "_ref.fasta")
    runlength_reads_path = os.path.join(output_dir, filename_prefix + "_reads.fasta")

    # Fail-safe: without this, a missing directory only surfaces after all the
    # data has been generated, when the first file is opened for writing.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    reverse_complement = True

    n_repeats = 12
    coverage = 12
    ref_max_runlength = 8
    base_pool = ["A", "T", "G", "C"]
    base_length_offsets = {"A":0, "T":1, "G":2, "C":3}
    ref_sequence_name = "synthetic_ref_0"

    ref_sequence = list()
    ref_lengths = list()
    ref_bases = list()
    read_output_lines = list()

    for _ in range(n_repeats):
        # Each base gets one run of every length; pop() consumes the longest first
        ref_runlengths = {b: list(range(1, ref_max_runlength + 1)) for b in base_pool}

        for _ in range(ref_max_runlength):
            bases = copy(base_pool)
            random.shuffle(bases)

            # Re-shuffle until the first base differs from the previous run's base,
            # otherwise two adjacent runs would merge into one
            while ref_bases and bases[0] == ref_bases[-1]:
                random.shuffle(bases)

            for base in bases:
                length = ref_runlengths[base].pop()

                ref_sequence.extend([base]*length)
                ref_lengths.append(length)
                ref_bases.append(base)

    ref_sequence = "".join(ref_sequence)

    for c in range(coverage):
        read_output_lines.append(">synthetic_read_%d"%c)

        sequence = "".join(base*(length + base_length_offsets[base])
                           for base, length in zip(ref_bases, ref_lengths))
        read_output_lines.append(sequence)

        if reverse_complement:
            read_output_lines.append(">synthetic_read_reverse_%d" % c)

            sequence = complement_sequence(sequence=sequence, reverse=True)
            read_output_lines.append("".join(sequence))

    print("saving file:", runlength_reference_path)
    with open(runlength_reference_path, "w") as file:
        file.write(">"+ref_sequence_name+"\n")
        file.write(ref_sequence + "\n")

    print("saving file:", runlength_reads_path)
    with open(runlength_reads_path, "w") as file:
        for line in read_output_lines:
            file.write(line + "\n")
def main():
    """
    For every contig of the reference (except "NC_001328.1"), split the contig
    interior into fixed-size windows and run select_windows() over them in a
    process pool, one batch of pooled arguments at a time.
    """
    # Other datasets used previously (dev machine):
    #   Nanopore GUPPY human:
    #     bam:       /home/ryan/data/Nanopore/Human/BAM/Guppy/rel5-guppy-0.3.0-chunk10k.sorted.bam
    #     reference: /home/ryan/data/GIAB/GRCh38_WG.fa
    #     vcf:       /home/ryan/data/GIAB/NA12878_GRCh38_PG.vcf.gz
    #     bed:       /home/ryan/data/GIAB/NA12878_GRCh38_confident.bed
    #   Nanopore GUPPY C. elegans:
    #     bam:       /home/ryan/data/Nanopore/celegans/all_chips_20k_Boreal_minimap2.sorted.filtered2820.bam
    #     reference: /home/ryan/data/Nanopore/celegans/GCF_000002985.6_WBcel235_genomic.fasta
    #     (NC_003279.8 = celegans chr1, NC_003283.11 = celegans chr5)

    # ---- Nanopore GUPPY - E. Coli - (dev machine) ----
    bam_file_path = "/home/ryan/data/Nanopore/ecoli/miten/r9_ecoli_reads_vs_ref.bam"
    reference_file_path = "/home/ryan/data/Nanopore/ecoli/miten/refEcoli.fasta"

    max_threads = 30
    window_size = 10000
    min_size = 20
    max_size = 80
    margin = 1000000    # bases excluded at each end of every contig

    name_handler = FastaHandler(reference_file_path)
    contig_names = name_handler.get_contig_names()
    name_handler.close()

    for chromosome_name in contig_names:
        if chromosome_name == "NC_001328.1":     # mitochondrial
            continue

        print("STARTING:", chromosome_name)

        # The handler is reopened per contig and closed once the sequence is in memory
        fasta_handler = FastaHandler(reference_file_path)
        chromosome_length = fasta_handler.get_chr_sequence_length(chromosome_name)
        reference_sequence = fasta_handler.get_sequence(chromosome_name=chromosome_name,
                                                        start=0,
                                                        stop=chromosome_length)
        fasta_handler.close()

        region = [0 + margin, chromosome_length - margin]

        # Shared integer handed to the workers through the argument pools
        manager = multiprocessing.Manager()
        counter = manager.Value('i', 0)

        region_windows = chunk_region(region=region, size=window_size)
        n_chunks = len(region_windows)

        print("subregions: ", n_chunks)

        output_dir = "output/window_selection/" + "_".join([str(chromosome_name),
                                                            str(region[0]),
                                                            str(region[1]),
                                                            FileManager.get_datetime_string()])
        print(output_dir)

        pooled_args = generate_argument_pools(pool_size=max_threads,
                                              bam_file_path=bam_file_path,
                                              chromosome_name=chromosome_name,
                                              region_windows=region_windows,
                                              reference_sequence=reference_sequence,
                                              min_size=min_size,
                                              max_size=max_size,
                                              output_dir=output_dir,
                                              counter=counter,
                                              n_chunks=n_chunks)

        for arg_pool in pooled_args:
            # A fresh pool is started per batch, after a forced garbage collection
            gc.collect()
            with Pool(processes=max_threads) as pool:
                pool.starmap(select_windows, arg_pool)

        print()