def setUpClass(cls):
    """Locate the DNA and RNA fixture reads and create working copies.

    Stores the repository root in ``cls.HOME``, the fixture paths in
    ``cls.dna_file`` / ``cls.rna_file`` and the objects returned by
    ``Fast5.create_copy`` in ``cls.dna_handle`` / ``cls.rna_handle``.
    """
    super(EventDetectTests, cls).setUpClass()
    # The root is this file's absolute path minus its last four components.
    path_parts = os.path.abspath(__file__).split("/")
    cls.HOME = '/'.join(path_parts[:-4])
    cls.dna_file = os.path.join(
        cls.HOME,
        "tests/minion_test_reads/canonical_ecoli_R9/miten_PC_20160820_FNFAD20259_MN17223_mux_scan_AMS_158_R9_WGA_Ecoli_08_20_16_83098_ch138_read23_strand.fast5")
    cls.rna_file = os.path.join(
        cls.HOME,
        "tests/minion_test_reads/RNA_edge_cases/DEAMERNANOPORE_20170922_FAH26525_MN16450_sequencing_run_MA_821_R94_NA12878_mRNA_09_22_17_67136_read_61_ch_151_strand.fast5")
    # Open both fixtures first, then copy them, so the tests work on
    # "test_dna.fast5" / "test_rna.fast5" instead of the originals.
    source_dna = Fast5(cls.dna_file, 'r+')
    source_rna = Fast5(cls.rna_file, 'r+')
    cls.dna_handle = source_dna.create_copy("test_dna.fast5")
    cls.rna_handle = source_rna.create_copy("test_rna.fast5")
def test_rna_reads(self):
    """Align two RNA reads with embedding and check the stored kmers.

    For the first read both ``path_kmer`` and ``reference_kmer`` must equal
    the reversed 5-mer fetched from the reference at ``reference_index``.
    For the second read ``reference_kmer`` must equal that reversed 5-mer
    while ``path_kmer`` must equal its reverse complement.
    """
    first_read = "DEAMERNANOPORE_20170922_FAH26525_MN16450_sequencing_run_MA_821_R94_NA12878_mRNA_09_22_17_67136_read_36_ch_218_strand.fast5"
    second_read = "DEAMERNANOPORE_20170922_FAH26525_MN16450_sequencing_run_MA_821_R94_NA12878_mRNA_09_22_17_67136_read_61_ch_151_strand.fast5"
    with tempfile.TemporaryDirectory() as tempdir:
        template_model = os.path.join(
            self.HOME, "models/testModelR9p4_5mer_acgt_RNA.model")
        args = create_signalAlignment_args(
            alignment_file=self.rna_bam,
            bwa_reference=self.rna_reference,
            forward_reference=self.rna_reference,
            in_templateHmm=template_model,
            path_to_bin=self.path_to_bin,
            destination=tempdir,
            embed=True,
            delete_tmp=False)

        # First read: path and reference kmers agree.
        in_rna_file = os.path.join(self.test_dir_rna, first_read)
        final_args = merge_dicts([args, dict(in_fast5=in_rna_file)])
        SignalAlignment(**final_args).run()
        fh = pysam.FastaFile(self.rna_reference)
        f5fh = Fast5(in_rna_file)
        for event in f5fh.get_signalalign_events():
            ref_start = event["reference_index"]
            kmer = fh.fetch(reference="rna_fake",
                            start=ref_start,
                            end=ref_start + 5)[::-1]
            self.assertEqual(event["path_kmer"].decode(), kmer)
            self.assertEqual(event["reference_kmer"].decode(), kmer)

        # Second read: path kmer is the reverse complement of the reference kmer.
        in_rna_file = os.path.join(self.test_dir_rna, second_read)
        final_args = merge_dicts([args, dict(in_fast5=in_rna_file)])
        SignalAlignment(**final_args).run()
        rev_c = ReverseComplement()
        f5fh = Fast5(in_rna_file)
        for event in f5fh.get_signalalign_events():
            ref_start = event["reference_index"]
            kmer = fh.fetch(reference="rna_fake",
                            start=ref_start,
                            end=ref_start + 5)[::-1]
            rev_kmer = rev_c.reverse_complement(kmer)
            self.assertEqual(event["path_kmer"].decode(), rev_kmer)
            self.assertEqual(event["reference_kmer"].decode(), kmer)
def create_minknow_events_from_fast5(fast5_path, window_lengths=(3, 6), thresholds=(1.4, 9.0), peak_height=0.2):
    """Run minknow event detection on the raw signal of a fast5 file.

    The events carry ('start', 'length', 'mean', 'stdv', 'model_state', 'move',
    'p_model_state') fields; 'model_state', 'move' and 'p_model_state' are
    all empty.

    :param fast5_path: path to fast5 file
    :param window_lengths: Length 2 list of window lengths across raw data
        from which `t_stats` are derived
    :param thresholds: Length 2 list of thresholds on t-statistics
    :param peak_height: Absolute height a peak in signal must rise below
        previous and following minima to be considered relevant
    :return: tuple of (event table, Fast5 handle); the handle is opened 'r+'
        and is returned without being closed
    :raises AssertionError: if `fast5_path` is not an existing file
    """
    assert os.path.isfile(fast5_path), "File does not exist: {}".format(
        fast5_path)
    f5fh = Fast5(fast5_path, read='r+')
    raw_signal = f5fh.get_read(raw=True, scale=True)
    sample_rate = f5fh.sample_rate
    read_start = f5fh.raw_attributes['start_time']
    detection_kwargs = dict(window_lengths=window_lengths,
                            thresholds=thresholds,
                            peak_height=peak_height)
    events = create_minknow_event_table(raw_signal, sample_rate, read_start,
                                        **detection_kwargs)
    return events, f5fh
def setUpClass(cls):
    """Open the canonical E. coli fixture read and keep a working copy.

    Stores the repository root in ``cls.HOME`` and the object returned by
    ``Fast5.create_copy("test.fast5")`` in ``cls.fast5handle``.
    """
    super(Fast5Test, cls).setUpClass()
    # The root is this file's absolute path minus its last four components.
    path_parts = os.path.abspath(__file__).split("/")
    cls.HOME = '/'.join(path_parts[:-4])
    fixture_path = os.path.join(
        cls.HOME,
        "tests/minion_test_reads/canonical_ecoli_R9/miten_PC_20160820_FNFAD20259_MN17223_mux_scan_AMS_158_R9_WGA_Ecoli_08_20_16_83098_ch138_read23_strand.fast5"
    )
    source_handle = Fast5(fixture_path, 'r+')
    cls.fast5handle = source_handle.create_copy("test.fast5")
def remove_sa_analyses(fast5):
    """Remove signalalign analyses from a fast5 file.

    Every key under "Analyses" whose name contains "SignalAlign" is deleted.

    :param fast5: path to the fast5 file to clean
    :return: number of analyses that were deleted
    :raises AssertionError: if `fast5` does not exist
    """
    # The message needs a placeholder, otherwise the offending path is
    # silently dropped from the error.
    assert os.path.exists(fast5), "Fast5 path does not exist: {}".format(fast5)
    fh = Fast5(fast5, read='r+')
    try:
        # Collect the names first so nothing is deleted while iterating.
        targets = [key for key in fh["Analyses"].keys() if "SignalAlign" in key]
        for analysis in targets:
            fh.delete(os.path.join("Analyses", analysis))
        return len(targets)
    finally:
        # Release the file even when a delete fails part-way through.
        fh.close()
def run_alignment_comparison(self, src_file_path, dna=True):
    """Run kmeralign on a copy of a read and compare it with stored events.

    The source file is copied into ``self.tmp_directory``, kmeralign is run
    against the copy, and the events written under ``self.UNIT_TEST_NAME``
    are compared with the previously stored events.

    :param src_file_path: path to the fast5 file to copy and align
    :param dna: use the DNA template model and default basecall events when
        True, otherwise the RNA model and the resegmented basecall events
    """
    # Work on a copy inside the temporary directory.
    file_name = os.path.basename(src_file_path)
    file_path = os.path.join(self.tmp_directory, file_name)
    shutil.copy(src_file_path, file_path)

    # Pre-alignment work: sequence and destination come from the file itself.
    with closing(Fast5(file_path, read='r+')) as fast5_handle:
        fastq = fast5_handle.get_fastq(analysis="Basecall_1D",
                                       section="template")
        nuc_sequence = fastq.split()[2]
        dest = fast5_handle.get_analysis_events_path_new(self.UNIT_TEST_NAME)
        # todo verify we don't need this
        # fast5_handle.ensure_path(dest)

    # Run kmeralign.
    if dna:
        model_file = self.dna_template_model_file
    else:
        model_file = self.rna_model_file
    status = self.run_kmeralign_exe(file_path, nuc_sequence, model_file, dest)
    self.assertEqual(status, 0, "error aligning file {}".format(file_name))

    with closing(Fast5(file_path, read='r')) as fast5_handle:
        new_events = fast5_handle.get_custom_analysis_events(
            self.UNIT_TEST_NAME)
        old_events = (
            fast5_handle.get_custom_analysis_events(
                fast5_handle.__default_basecall_1d_analysis__)
            if dna else fast5_handle.get_resegment_basecall())

        read_kind = "DNA" if dna else "RNA"
        print("\nComparing {} events from {}".format(read_kind, file_name))
        start_diff_avg, mean_diff_avg, std_diff_avg = self.compare_event_alignment(
            old_events, new_events)
        self.assertLess(mean_diff_avg, 1.0,
                        "Aligned event means are too varied")
        self.assertLess(std_diff_avg, 1.0,
                        "Aligned event stds are too varied")
        self.assertLess(start_diff_avg, 0.001,
                        "Aligned event start times are too varied")
def open(self):
    """Open the underlying fast5 file unless it is already open.

    On success the handle is stored in ``self.fastFive`` and ``self.is_open``
    is set. On any exception the read is closed and the error is logged.

    :return: True when the file is (or already was) open, False on failure
    """
    if not self.is_open:
        try:
            self.fastFive = Fast5(self.filename, 'r+')
            self.is_open = True
        except Exception as e:
            # Clean up first, then report the failure.
            self.close()
            message = "[NanoporeRead:open] ERROR opening {filename}, {e}".format(
                filename=self.filename, e=e)
            self.logError(message)
            return False
    return True
def test_embed_with_both(self):
    """Align a 2D pUC read with ``output_format="both"`` and embedding.

    Checks the first embedded MEA event's ``raw_start``, the first entry of
    the embedded SAM data and that exactly two entries end up in the
    working folder.
    """
    read_name = "makeson_PC_20160807_FNFAD20242_MN17284_sequencing_run_MA_470_R9_pUC_g_PCR_BC_08_07_16_93165_ch1_read176_strand.fast5"
    signal_file_reads = os.path.join(self.HOME,
                                     "tests/minion_test_reads/pUC/")
    template_model = os.path.join(
        self.HOME, "models/testModelR9_5mer_acegt_template.model")
    complement_model = os.path.join(
        self.HOME, "models/testModelR9_5mer_acegt_complement.model")
    puc_reference = os.path.join(self.HOME,
                                 "tests/test_sequences/pUC19_SspI.fa")
    signal_file_guide_alignment = os.path.join(
        self.HOME, "tests/minion_test_reads/pUC/puc.bam")

    with tempfile.TemporaryDirectory() as tempdir:
        new_dir = os.path.join(tempdir, "new_dir")
        if os.path.exists(new_dir):
            shutil.rmtree(new_dir)
        working_folder = FolderHandler()
        working_folder.open_folder(os.path.join(tempdir, "test_dir"))
        # Align a copy of the reads so the fixtures are never modified.
        shutil.copytree(signal_file_reads, new_dir)
        read_path = os.path.join(new_dir, read_name)

        args = create_signalAlignment_args(
            alignment_file=signal_file_guide_alignment,
            bwa_reference=puc_reference,
            forward_reference=puc_reference,
            in_templateHmm=template_model,
            path_to_bin=self.path_to_bin,
            destination=working_folder.path,
            embed=True,
            output_format="both",
            filter_reads=0,
            twoD_chemistry=True,
            in_complementHmm=complement_model,
            delete_tmp=True)
        final_args = merge_dicts([args, dict(in_fast5=read_path)])
        SignalAlignment(**final_args).run()

        f5fh = Fast5(read_path)
        mea = f5fh.get_signalalign_events(mea=True)
        sam = f5fh.get_signalalign_events(sam=True)
        self.assertEqual(mea[0]["raw_start"], 2879)
        self.assertEqual(sam[0], "0")
        self.assertEqual(len(os.listdir(working_folder.path)), 2)
def setUpClass(cls):
    """Load the RNA and DNA models and open the 1D DNA fixture read.

    Sets ``cls.HOME``, the two model file paths, the ``HmmModel`` objects
    built from them, ``cls.dna_fast5_path`` and ``cls.dna_handle``.
    """
    super(BandedAlignmentTests, cls).setUpClass()
    # The root is this file's absolute path minus its last four components.
    path_parts = os.path.abspath(__file__).split("/")
    cls.HOME = '/'.join(path_parts[:-4])

    cls.rna_model_file = os.path.join(
        cls.HOME, "models/testModelR9p4_5mer_acgt_RNA.model")
    cls.dna_template_model_file = os.path.join(
        cls.HOME, "models/testModelR9p4_5mer_acegt_template.model")
    cls.rna_model = HmmModel(model_file=cls.rna_model_file)
    cls.dna_model = HmmModel(model_file=cls.dna_template_model_file)

    cls.dna_fast5_path = os.path.join(
        cls.HOME,
        "tests/minion_test_reads/1D/LomanLabz_PC_20161025_FNFAB42699_MN17633_sequencing_run_20161025_E_coli_native_450bps_82361_ch112_read108_strand.fast5"
    )
    cls.dna_handle = Fast5(cls.dna_fast5_path)
def resegment_reads(fast5_path, params, speedy=False, overwrite=False):
    """Re-segment and create anchor alignment from previously base-called fast5 file.

    New events are detected on the raw signal, anchored to the existing
    basecall events and written back together with a fastq built from the
    new event table. The analysis name handed to the Fast5 setters is the
    template "ReSegmentBasecall_00{}".

    :param fast5_path: path to fast5 file
    :param params: event detection parameters
    :param speedy: boolean option for speedyStatSplit or minknow
    :param overwrite: overwrite a previous event re-segmented event table
    :return: the Fast5 handle, opened 'r+' and not closed by this function
    :raises AssertionError: if `fast5_path` is not an existing file
    """
    assert os.path.isfile(fast5_path), "File does not exist: {}".format(fast5_path)
    name = "ReSegmentBasecall_00{}"
    f5fh = Fast5(fast5_path, read='r+')

    # Previous event detection plus the read-level metadata.
    old_event_table = f5fh.get_basecall_data()
    read_id = bytes.decode(f5fh.raw_attributes['read_id'])
    sampling_freq = f5fh.sample_rate
    start_time = f5fh.raw_attributes['start_time']

    # Pick the event detection algorithm and remember which one was used.
    signal = f5fh.get_read(raw=True, scale=True)
    if speedy:
        detector, detector_label = create_speedy_event_table, "speedy_stat_split"
    else:
        detector, detector_label = create_minknow_event_table, "minknow_event_detect"
    event_table = detector(signal, sampling_freq, start_time, **params)
    params = merge_dicts([params, {"event_detection": detector_label}])

    version_info = {"nanotensor version": "0.2.0",
                    "time_stamp": TimeStamp().posix_date()}
    attributes = merge_dicts([params, version_info, f5fh.raw_attributes])

    if f5fh.is_read_rna():
        old_event_table = index_to_time(old_event_table,
                                        sampling_freq=sampling_freq,
                                        start_time=start_time)
    # Anchor the new events to the old ones and store the table.
    new_event_table = create_anchor_kmers(new_events=event_table,
                                          old_events=old_event_table)
    f5fh.set_new_event_table(name, new_event_table, attributes,
                             overwrite=overwrite)

    # Build the fastq for the new sequence.
    sequence = sequence_from_events(new_event_table)
    if f5fh.is_read_rna():
        sequence = ReverseComplement().reverse(sequence)
        sequence = sequence.replace("T", "U")
    quality_scores = '!' * len(sequence)
    fastq = create_fastq_line(read_id + " :", sequence, quality_scores)
    f5fh.set_fastq(name, fastq)
    return f5fh
def embed_eventalign_events(fast5_dir, reference, output_dir, threads=1, overwrite=False):
    """Call eventalign and embed the resulting events into each fast5 file.

    :param fast5_dir: forwarded to `get_eventalign_events`
    :param reference: forwarded to `get_eventalign_events`
    :param output_dir: forwarded to `get_eventalign_events`
    :param threads: forwarded to `get_eventalign_events`
    :param overwrite: forwarded to `get_eventalign_events`; the embedded
        table itself is always written with ``overwrite=True``
    :return: True once every yielded read has been processed
    """
    event_generator = get_eventalign_events(fast5_dir,
                                            reference,
                                            output_dir,
                                            threads=threads,
                                            overwrite=overwrite)
    attributes = None
    for template, complement, fast5path in event_generator:
        print(fast5path)
        print("template", template)
        if template or complement:
            handle = Fast5(fast5path, read='r+')
            try:
                handle.set_eventalign_table(template=template,
                                            complement=complement,
                                            meta=attributes,
                                            overwrite=True)
            finally:
                # One handle is opened per read; close it so a long run does
                # not accumulate open files, even if embedding fails.
                handle.close()
        else:
            print("{} did not align".format(fast5path))
    return True
def mea_alignment_from_signal_align(fast5_path, events=None):
    """Get the maximum expected alignment from a nanopore read fast5 file which has signalalign data.

    :param fast5_path: path to fast5 file; only read when `events` is None
    :param events: directly pass events in via a numpy array
    :return: event table built from the best path through the alignment
    :raises AssertionError: if `events` is None and `fast5_path` is not a file
    """
    if events is None:
        assert os.path.isfile(fast5_path)
        events = Fast5(fast5_path).get_signalalign_events()

    mea_params = get_mea_params_from_events(events)
    posterior_matrix, shortest_ref_per_event, event_matrix = mea_params
    # Compute the MEA alignment, then turn it into raw index values.
    alignments = maximum_expected_accuracy_alignment(posterior_matrix,
                                                     shortest_ref_per_event)
    best_path = get_indexes_from_best_path(alignments)
    return get_events_from_path(event_matrix, best_path)
def generate_events_and_alignment(
        fast5_path,
        nucleotide_sequence,
        nucleotide_qualities=None,
        event_detection_params=None,
        event_detection_strategy=None,
        save_to_fast5=True,
        overwrite=False,
        analysis_identifier=Fast5.__default_basecall_1d_analysis__,
):
    """Detect events in a fast5 file and optionally save them with a fastq.

    :param fast5_path: path to fast5 file
    :param nucleotide_sequence: sequence written into the fastq record
    :param nucleotide_qualities: quality string for the fastq record; "*" is
        used when None
    :param event_detection_params: keyword arguments for the event detector;
        the strategy's defaults are used when None
    :param event_detection_strategy: one of EVENT_DETECT_SPEEDY,
        EVENT_DETECT_MINKNOW or EVENT_DETECT_SCRAPPIE; minknow when None
    :param save_to_fast5: write the event table and fastq back to the file
    :param overwrite: forwarded to `save_event_table_and_fastq`
    :param analysis_identifier: forwarded to `save_event_table_and_fastq`
    :return: tuple of (success, event_table, saved_location); `success` is
        currently always False because the alignment step is not implemented
        and `saved_location` is None when nothing was saved
    :raises AssertionError: if `fast5_path` is not an existing file
    :raises Exception: if `event_detection_strategy` is not recognised
    """
    assert os.path.isfile(fast5_path), "File does not exist: {}".format(
        fast5_path)

    f5fh = Fast5(fast5_path, read='r+')
    # Everything that touches the handle runs under try/finally so the file
    # is released on failure too (e.g. the unknown-strategy error below).
    try:
        read_id = bytes.decode(f5fh.raw_attributes['read_id'])
        sampling_freq = f5fh.sample_rate
        start_time = f5fh.raw_attributes['start_time']
        success = False

        # event detection prep
        if event_detection_strategy is None:
            event_detection_strategy = EVENT_DETECT_MINKNOW
        if event_detection_params is None:
            event_detection_params = get_default_event_detection_params(
                event_detection_strategy)

        # detect events
        if event_detection_strategy == EVENT_DETECT_SPEEDY:
            signal = f5fh.get_read(raw=True, scale=True)
            event_table = create_speedy_event_table(signal, sampling_freq,
                                                    start_time,
                                                    **event_detection_params)
            event_detection_params = merge_dicts(
                [event_detection_params,
                 {"event_detection": "speedy_stat_split"}])
        elif event_detection_strategy == EVENT_DETECT_MINKNOW:
            signal = f5fh.get_read(raw=True, scale=True)
            event_table = create_minknow_event_table(signal, sampling_freq,
                                                     start_time,
                                                     **event_detection_params)
            event_detection_params = merge_dicts(
                [event_detection_params,
                 {"event_detection": "minknow_event_detect"}])
        elif event_detection_strategy == EVENT_DETECT_SCRAPPIE:
            event_table = create_scrappie_event_table(fast5_path,
                                                      sampling_freq)
            event_detection_params = merge_dicts(
                [event_detection_params,
                 {"event_detection": "scrappie_event_detect"}])
        else:
            raise Exception(
                "PROGRAMMER ERROR: unknown resegment strat {}: expected {}".format(
                    event_detection_strategy, [
                        EVENT_DETECT_SPEEDY, EVENT_DETECT_MINKNOW,
                        EVENT_DETECT_SCRAPPIE
                    ]))

        # gather attributes
        keys = ["nanotensor version", "time_stamp"]
        values = ["0.2.0", TimeStamp().posix_date()]
        attributes = merge_dicts(
            [event_detection_params,
             dict(zip(keys, values)),
             f5fh.raw_attributes])

        # do the alignment
        # todo do_alignment(events, nucleotide_sequence)
        # success = evaluate_success()

        # save to fast5 (if appropriate)
        saved_location = None
        if save_to_fast5:
            fastq = create_fastq_line(
                read_id, nucleotide_sequence,
                "*" if nucleotide_qualities is None else nucleotide_qualities)
            saved_location = save_event_table_and_fastq(
                f5fh,
                event_table,
                fastq,
                attributes=attributes,
                overwrite=overwrite,
                analysis_identifier=analysis_identifier)

        return success, event_table, saved_location
    finally:
        f5fh.close()
def resegment_reads(fast5_path, params=None, speedy=False, overwrite=True, analysis_path="ReSegmentBasecall_000"):
    """Re-segment and create anchor alignment from previously base-called fast5 file.

    :param fast5_path: path to fast5 file
    :param params: event detection parameters; the defaults for the chosen
        algorithm are used when None
    :param speedy: boolean option for speedyStatSplit or minknow
    :param overwrite: overwrite a previous event re-segmented event table
    :param analysis_path: name of key where events table will be placed
        (Analyses/'name'/Events)
    :return: the Fast5 handle, opened 'r+' and not closed by this function, or
        None (with the handle closed) when the file has no basecall data
    :raises AssertionError: if `fast5_path` is not an existing file
    """
    assert os.path.isfile(fast5_path), "File does not exist: {}".format(
        fast5_path)

    # Sanity check: nothing to anchor against without basecall data.
    f5fh = Fast5(fast5_path, read='r+')
    if not f5fh.has_basecall_data():
        f5fh.close()
        return None

    # Previous event detection plus the read-level metadata.
    old_event_table = f5fh.get_basecall_data()
    read_id = bytes.decode(f5fh.raw_attributes['read_id'])
    sampling_freq = f5fh.sample_rate
    start_time = f5fh.raw_attributes['start_time']

    if params is None:
        strategy = EVENT_DETECT_SPEEDY if speedy else EVENT_DETECT_MINKNOW
        params = get_default_event_detection_params(strategy)

    # Pick the event detection algorithm and remember which one was used.
    signal = f5fh.get_read(raw=True, scale=True)
    if speedy:
        detector, detector_label = create_speedy_event_table, "speedy_stat_split"
    else:
        detector, detector_label = create_minknow_event_table, "minknow_event_detect"
    event_table = detector(signal, sampling_freq, start_time, **params)
    params = merge_dicts([params, {"event_detection": detector_label}])

    # Metadata stored alongside the new event table.
    version_info = {"nanotensor version": "0.2.0",
                    "time_stamp": TimeStamp().posix_date()}
    attributes = merge_dicts([params, version_info, f5fh.raw_attributes])

    # Do the resegmentation.
    if f5fh.is_read_rna():
        old_event_table = index_to_time(old_event_table,
                                        sampling_freq=sampling_freq,
                                        start_time=start_time)
    new_event_table = create_anchor_kmers(new_events=event_table,
                                          old_events=old_event_table)

    # Destination in the fast5.
    # todo find latest location? ie: save_event_table_and_fastq(..)
    destination = f5fh._join_path(f5fh.__base_analysis__, analysis_path)
    f5fh.set_event_table(destination, new_event_table, attributes,
                         overwrite=overwrite)

    # Build and store the fastq for the new sequence.
    sequence = sequence_from_events(new_event_table)
    if f5fh.is_read_rna():
        sequence = ReverseComplement().reverse(sequence)
        sequence = sequence.replace("T", "U")
    quality_scores = '!' * len(sequence)
    fastq = create_fastq_line(read_id + " :", sequence, quality_scores)
    f5fh.set_fastq(destination, fastq, overwrite=overwrite)
    return f5fh
def test_embed(self):
    """Align the same E. coli read with embedding from two read folders.

    The read is aligned once from the folder without event data and once
    from the folder with events. Each run must give the same first MEA
    ``raw_start``, the same first SAM entry and a single output file with
    the expected name.
    """
    read_name = "LomanLabz_PC_20161025_FNFAB42699_MN17633_sequencing_run_20161025_E_coli_native_450bps_82361_ch6_read347_strand.fast5"
    template_model = os.path.join(
        self.HOME, "models/testModelR9p4_5mer_acegt_template.model")
    ecoli_reference = os.path.join(
        self.HOME, "tests/test_sequences/E.coli_K12.fasta")
    signal_file_guide_alignment = os.path.join(
        self.HOME, "tests/minion_test_reads/oneD_alignments.sam")

    # First pass: DNA without events; second pass: DNA WITH events.
    read_folders = ("tests/minion_test_reads/no_event_data_1D_ecoli",
                    "tests/minion_test_reads/1D")
    for read_folder in read_folders:
        signal_file_reads = os.path.join(self.HOME, read_folder)
        with tempfile.TemporaryDirectory() as tempdir:
            new_dir = os.path.join(tempdir, "new_dir")
            working_folder = FolderHandler()
            working_folder.open_folder(os.path.join(tempdir, "test_dir"))
            # Align a copy of the reads so the fixtures are never modified.
            shutil.copytree(signal_file_reads, new_dir)
            read_path = os.path.join(new_dir, read_name)

            args = create_signalAlignment_args(
                alignment_file=signal_file_guide_alignment,
                bwa_reference=ecoli_reference,
                forward_reference=ecoli_reference,
                in_templateHmm=template_model,
                path_to_bin=self.path_to_bin,
                destination=working_folder.path,
                embed=True)
            final_args = merge_dicts([args, dict(in_fast5=read_path)])
            SignalAlignment(**final_args).run()

            f5fh = Fast5(read_path)
            mea = f5fh.get_signalalign_events(mea=True)
            sam = f5fh.get_signalalign_events(sam=True)
            self.assertEqual(mea[0]["raw_start"], 153)
            self.assertEqual(sam[0], "9")
            self.assertEqual(len(os.listdir(working_folder.path)), 1)
            self.assertEqual(
                sorted(os.listdir(working_folder.path))[0],
                "9e4d14b1-8167-44ef-9fdb-5c29dd0763fd.sm.backward.tsv")