def test_processReferenceFasta_positions(self):
    """Exercise processReferenceFasta with a positions file, with no options, and with both.

    Three calls are made against ``self.reference``:

    * ``positions_file`` only -- the first record of the returned forward and
      backward references must equal the hard-coded sequences below.
    * neither ``motifs`` nor ``positions_file`` -- the forward path must be
      ``self.reference`` itself and the backward reference must be ``None``.
    * ``motifs`` together with ``positions_file`` -- a ``RuntimeError`` is
      expected.
    """
    # Each expected sequence is written as two adjacent literals; Python joins
    # them at compile time into a single string.
    expected_forward_with_positions = (
        "ABTATTGAAGCATTTATCAGGGTTATTGTCTCATGAGCGGATACATATTTGAATGTATTTAGAAAAATAAACAAATAGGGGTTCCGCGCACATTTCCCCGAAAAGTGCCACCTGACGTCTAAGAAACCATTATTATCATGACATTAACCTATAAAAATAGGCGTATCACGAGGCCCTTTCGTCTCGCGCGTTTCGGTGATGACGGTGAAAACCTCTGACACATGCAGCTCCCGGAGACGGTCACAGCTTGTCTGTAAGCGGATGCCGGGAGCAGACAAGCCCGTCAGGGCGCGTCAGCGGGTGTTGGCGGGTGTCGGGGCTGGCTTAACTATGCGGCATCAGAGCAGATTGTACTGAGAGTGCACCATATGCGGTGTGAAATACCGCACAGATGCGTAAGGAGAAAATACCGCATCAGGCGCCATTCGCCATTCAGGCTGCGCAACTGTTGGGAAGGGCGATCGGTGCGGGCCTCTTCGCTATTACGCCAGCTGGCGAAAGGGGGATGTGCTGCAAGGCGATTAAGTTGGGTAACGCCAGGGTTTTCCCAGTCACGACGTTGTAAAACGACGGCCAGTGAATTCGAGCTCGGTACCCGGGGATCCTCTAGAGTCGACCTGCAGGCATGCAAGCTTGGCGTAATCATGGTCATAGCTGTTTCCTGTGTGAAATTGTTATCCGCTCACAATTCCACACAACATACGAGCCGGAAGCATAAAGTGTAAAGCCTGGGGTGCCTAATGAGTGAGCTAACTCACATTAATTGCGTTGCGCTCACTGCCCGCTTTCCAGTCGGGAAACCTGTCGTGCCAGCTGCATTAATGAATCGGCCAACGCGCGGGGAGAGGCGGTTTGCGTATTGGGCGCTCTTCCGCTTCCTCGCTCACTGACTCGCTGCGCTCGGTCGTTCGGCTGCGGCGAGCGGTATCAGCTCACTCAAAGGCGGTAATACGGTTATCCACAGAATCAGGGGATAACGCAGGAAAGAACATGTGAGCAAAAGGCCAGCAAAAGGCCAGGAACCGTAAAAAGGCCGCGTTGCTGGCGTTTTTCCATAGGCTCCGCCCCCCTGACGAGCATCACAAAAATCGACGCTCAAGTCAGAGGTGGCGAAACCCGACAGGACTATAAAGATACCAGGCGTTTCCCCCTGGAAGCTCCCTCGTGCGCTCTCCTGTTCCGACCCTGCCGCTTACCGGATACCTGTCCGCCTTTCTCCCTTCGGGAAGCGTGGCGCTTTCTCATAGCTCACGCTGTAGGTATCTCAGTTCGGTGTAGGTCGTTCGCTCCAAGCTGGGCTGTGTGCACGAACCCCCCGTTCAGCCCGACCGCTGCGCCTTATCCGGTAACTATCGTCTTGAGTCCAACCCGGTAAGACACGACTTATCGCCACTGGCAGCAGCCACTGGTAACAGGATTAGCAGAGCGAGGTATGTAGGCGGTGCTACAGAGTTCTTGAAGTGGTGGCCTAACTACGGCTACACTAGAAGAACAGTATTTGGTATCTGCGCTCTGCTGAAGCCAGTTACCTTCGGAAAAAGAGTTGGTAGCTCTTGATCCGGCAAACAAACCACCGCTGGTAGCGGTGGTTTT"
        "TTTGTTTGCAAGCAGCAGATTACGCGCAGAAAAAAAGGATCTCAAGAAGATCCTTTGATCTTTTCTACGGGGTCTGACGCTCAGTGGAACGAAAACTCACGTTAAGGGATTTTGGTCATGAGATTATCAAAAAGGATCTTCACCTAGATCCTTTTAAATTAAAAATGAAGTTTTAAATCAATCTAAAGTATATATGAGTAAACTTGGTCTGACAGTTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTATTTCGTTCATCCATAGTTGCCTGACTCCCCGTCGTGTAGATAACTACGATACGGGAGGGCTTACCATCTGGCCCCAGTGCTGCAATGATACCGCGAGACCCACGCTCACCGGCTCCAGATTTATCAGCAATAAACCAGCCAGCCGGAAGGGCCGAGCGCAGAAGTGGTCCTGCAACTTTATCCGCCTCCATCCAGTCTATTAATTGTTGCCGGGAAGCTAGAGTAAGTAGTTCGCCAGTTAATAGTTTGCGCAACGTTGTTGCCATTGCTACAGGCATCGTGGTGTCACGCTCGTCGTTTGGTATGGCTTCATTCAGCTCCGGTTCCCAACGATCAAGGCGAGTTACATGATCCCCCATGTTGTGCAAAAAAGCGGTTAGCTCCTTCGGTCCTCCGATCGTTGTCAGAAGTAAGTTGGCCGCAGTGTTATCACTCATGGTTATGGCAGCACTGCATAATTCTCTTACTGTCATGCCATCCGTAAGATGCTTTTCTGTGACTGGTGAGTACTCAACCAAGTCATTCTGAGAATAGTGTATGCGGCGACCGAGTTGCTCTTGCCCGGCGTCAATACGGGATAATACCGCGCCACATAGCAGAACTTTAAAAGTGCTCATCATTGGAAAACGTTCTTCGGGGCGAAAACTCTCAAGGATCTTACCGCTGTTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAAT"
    )
    expected_backward_with_positions = (
        "TBATAACTTCGTAAATAGTCCCAATAACAGAGTACTCGCCTATGTATAAACTTACATAAATCTTTTTATTTGTTTATCCCCAAGGCGCGTGTAAAGGGGCTTTTCACGGTGGACTGCAGATTCTTTGGTAATAATAGTACTGTAATTGGATATTTTTATCCGCATAGTGCTCCGGGAAAGCAGAGCGCGCAAAGCCACTACTGCCACTTTTGGAGACTGTGTACGTCGAGGGCCTCTGCCAGTGTCGAACAGACATTCGCCTACGGCCCTCGTCTGTTCGGGCAGTCCCGCGCAGTCGCCCACAACCGCCCACAGCCCCGACCGAATTGATACGCCGTAGTCTCGTCTAACATGACTCTCACGTGGTATACGCCACACTTTATGGCGTGTCTACGCATTCCTCTTTTATGGCGTAGTCCGCGGTAAGCGGTAAGTCCGACGCGTTGACAACCCTTCCCGCTAGCCACGCCCGGAGAAGCGATAATGCGGTCGACCGCTTTCCCCCTACACGACGTTCCGCTAATTCAACCCATTGCGGTCCCAAAAGGGTCAGTGCTGCAACATTTTGCTGCCGGTCACTTAAGCTCGAGCCATGGGCCCCTAGGAGATCTCAGCTGGACGTCCGTACGTTCGAACCGCATTAGTACCAGTATCGACAAAGGACACACTTTAACAATAGGCGAGTGTTAAGGTGTGTTGTATGCTCGGCCTTCGTATTTCACATTTCGGACCCCACGGATTACTCACTCGATTGAGTGTAATTAACGCAACGCGAGTGACGGGCGAAAGGTCAGCCCTTTGGACAGCACGGTCGACGTAATTACTTAGCCGGTTGCGCGCCCCTCTCCGCCAAACGCATAACCCGCGAGAAGGCGAAGGAGCGAGTGACTGAGCGACGCGAGCCAGCAAGCCGACGCCGCTCGCCATAGTCGAGTGAGTTTCCGCCATTATGCCAATAGGTGTCTTAGTCCCCTATTGCGTCCTTTCTTGTACACTCGTTTTCCGGTCGTTTTCCGGTCCTTGGCATTTTTCCGGCGCAACGACCGCAAAAAGGTATCCGAGGCGGGGGGACTGCTCGTAGTGTTTTTAGCTGCGAGTTCAGTCTCCACCGCTTTGGGCTGTCCTGATATTTCTATGGTCCGCAAAGGGGGACCTTCGAGGGAGCACGCGAGAGGACAAGGCTGGGACGGCGAATGGCCTATGGACAGGCGGAAAGAGGGAAGCCCTTCGCACCGCGAAAGAGTATCGAGTGCGACATCCATAGAGTCAAGCCACATCCAGCAAGCGAGGTTCGACCCGACACACGTGCTTGGGGGGCAAGTCGGGCTGGCGACGCGGAATAGGCCATTGATAGCAGAACTCAGGTTGGGCCATTCTGTGCTGAATAGCGGTGACCGTCGTCGGTGACCATTGTCCTAATCGTCTCGCTCCATACATCCGCCACGATGTCTCAAGAACTTCACCACCGGATTGATGCCGATGTGATCTTCTTGTCATAAACCATAGACGCGAGACGACTTCGGTCAATGGAAGCCTTTTTCTCAACCATCGAGAACTAGGCCGTTTGTTTGGTGGCGACCATCGCCACCAAAAAAACAAACGTTCGTCGTCTAATGCGCGTCTTTTTTTCCTAGAGTTCTTCTAGGAAACTAGAAAAGATGCCCCAGACTGCGAGTCACCTTGCTTTTGAGTGCAATTCCCTAAAACCAGTACTCTAATAGTTTTTCCTAGAAGTGGATCTAGGAAAATTTAATTTTTACTTCAAAATTTAGTTAGATTTCATATATACTCATTTGAACCAGACTGTCAATGGTTACGAATTAGTCACTCCGTGGATAGAGTCGCTAGACAGATAAAGCAAGTAGGTATCAACGGACTGAGGGGCAGCACATCTATTGATGCTATGCCCTCCCGAATGGTAGACCGGGGTCACGACGTTACTATGGCGCTCTGGGTGCGAGTGGCCGAGGTCTAAATAGTCGTTATTTGGTCGGTCGGC"
        "CTTCCCGGCTCGCGTCTTCACCAGGACGTTGAAATAGGCGGAGGTAGGTCAGATAATTAACAACGGCCCTTCGATCTCATTCATCAAGCGGTCAATTATCAAACGCGTTGCAACAACGGTAACGATGTCCGTAGCACCACAGTGCGAGCAGCAAACCATACCGAAGTAAGTCGAGGCCAAGGGTTGCTAGTTCCGCTCAATGTACTAGGGGGTACAACACGTTTTTTCGCCAATCGAGGAAGCCAGGAGGCTAGCAACAGTCTTCATTCAACCGGCGTCACAATAGTGAGTACCAATACCGTCGTGACGTATTAAGAGAATGACAGTACGGTAGGCATTCTACGAAAAGACACTGACCACTCATGAGTTGGTTCAGTAAGACTCTTATCACATACGCCGCTGGCTCAACGAGAACGGGCCGCAGTTATGCCCTATTATGGCGCGGTGTATCGTCTTGAAATTTTCACGAGTAGTAACCTTTTGCAAGAAGCCCCGCTTTTGAGAGTTCCTAGAATGGCGACAACTCTAGGTCAAGCTACATTGGGTGAGCACGTGGGTTGACTAGAAGTCGTAGAAAATGAAAGTGGTCGCAAAGACCCACTCGTTTTTGTCCTTCCGTTTTACGGCGTTTTTTCCCTTATTCCCGCTGTGCCTTTACAACTTATGAGTATGAGAAGGAAAAAGTTA"
    )
    expected_forward_plain = (
        "ATTATTGAAGCATTTATCAGGGTTATTGTCTCATGAGCGGATACATATTTGAATGTATTTAGAAAAATAAACAAATAGGGGTTCCGCGCACATTTCCCCGAAAAGTGCCACCTGACGTCTAAGAAACCATTATTATCATGACATTAACCTATAAAAATAGGCGTATCACGAGGCCCTTTCGTCTCGCGCGTTTCGGTGATGACGGTGAAAACCTCTGACACATGCAGCTCCCGGAGACGGTCACAGCTTGTCTGTAAGCGGATGCCGGGAGCAGACAAGCCCGTCAGGGCGCGTCAGCGGGTGTTGGCGGGTGTCGGGGCTGGCTTAACTATGCGGCATCAGAGCAGATTGTACTGAGAGTGCACCATATGCGGTGTGAAATACCGCACAGATGCGTAAGGAGAAAATACCGCATCAGGCGCCATTCGCCATTCAGGCTGCGCAACTGTTGGGAAGGGCGATCGGTGCGGGCCTCTTCGCTATTACGCCAGCTGGCGAAAGGGGGATGTGCTGCAAGGCGATTAAGTTGGGTAACGCCAGGGTTTTCCCAGTCACGACGTTGTAAAACGACGGCCAGTGAATTCGAGCTCGGTACCCGGGGATCCTCTAGAGTCGACCTGCAGGCATGCAAGCTTGGCGTAATCATGGTCATAGCTGTTTCCTGTGTGAAATTGTTATCCGCTCACAATTCCACACAACATACGAGCCGGAAGCATAAAGTGTAAAGCCTGGGGTGCCTAATGAGTGAGCTAACTCACATTAATTGCGTTGCGCTCACTGCCCGCTTTCCAGTCGGGAAACCTGTCGTGCCAGCTGCATTAATGAATCGGCCAACGCGCGGGGAGAGGCGGTTTGCGTATTGGGCGCTCTTCCGCTTCCTCGCTCACTGACTCGCTGCGCTCGGTCGTTCGGCTGCGGCGAGCGGTATCAGCTCACTCAAAGGCGGTAATACGGTTATCCACAGAATCAGGGGATAACGCAGGAAAGAACATGTGAGCAAAAGGCCAGCAAAAGGCCAGGAACCGTAAAAAGGCCGCGTTGCTGGCGTTTTTCCATAGGCTCCGCCCCCCTGACGAGCATCACAAAAATCGACGCTCAAGTCAGAGGTGGCGAA"
        "ACCCGACAGGACTATAAAGATACCAGGCGTTTCCCCCTGGAAGCTCCCTCGTGCGCTCTCCTGTTCCGACCCTGCCGCTTACCGGATACCTGTCCGCCTTTCTCCCTTCGGGAAGCGTGGCGCTTTCTCATAGCTCACGCTGTAGGTATCTCAGTTCGGTGTAGGTCGTTCGCTCCAAGCTGGGCTGTGTGCACGAACCCCCCGTTCAGCCCGACCGCTGCGCCTTATCCGGTAACTATCGTCTTGAGTCCAACCCGGTAAGACACGACTTATCGCCACTGGCAGCAGCCACTGGTAACAGGATTAGCAGAGCGAGGTATGTAGGCGGTGCTACAGAGTTCTTGAAGTGGTGGCCTAACTACGGCTACACTAGAAGAACAGTATTTGGTATCTGCGCTCTGCTGAAGCCAGTTACCTTCGGAAAAAGAGTTGGTAGCTCTTGATCCGGCAAACAAACCACCGCTGGTAGCGGTGGTTTTTTTGTTTGCAAGCAGCAGATTACGCGCAGAAAAAAAGGATCTCAAGAAGATCCTTTGATCTTTTCTACGGGGTCTGACGCTCAGTGGAACGAAAACTCACGTTAAGGGATTTTGGTCATGAGATTATCAAAAAGGATCTTCACCTAGATCCTTTTAAATTAAAAATGAAGTTTTAAATCAATCTAAAGTATATATGAGTAAACTTGGTCTGACAGTTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTATTTCGTTCATCCATAGTTGCCTGACTCCCCGTCGTGTAGATAACTACGATACGGGAGGGCTTACCATCTGGCCCCAGTGCTGCAATGATACCGCGAGACCCACGCTCACCGGCTCCAGATTTATCAGCAATAAACCAGCCAGCCGGAAGGGCCGAGCGCAGAAGTGGTCCTGCAACTTTATCCGCCTCCATCCAGTCTATTAATTGTTGCCGGGAAGCTAGAGTAAGTAGTTCGCCAGTTAATAGTTTGCGCAACGTTGTTGCCATTGCTACAGGCATCGTGGTGTCACGCTCGTCGTTTGGTATGGCTTCATTCAGCTCCGGTTCCCAACGATCAAGGCGAGTTACATGATCCCCCATGTTGTGCAAAAAAGCGGTTAGCTCCTTCGGTCCTCCGATCGTTGTCAGAAGTAAGTTGGCCGCAGTGTTATCACTCATGGTTATGGCAGCACTGCATAATTCTCTTACTGTCATGCCATCCGTAAGATGCTTTTCTGTGACTGGTGAGTACTCAACCAAGTCATTCTGAGAATAGTGTATGCGGCGACCGAGTTGCTCTTGCCCGGCGTCAATACGGGATAATACCGCGCCACATAGCAGAACTTTAAAAGTGCTCATCATTGGAAAACGTTCTTCGGGGCGAAAACTCTCAAGGATCTTACCGCTGTTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAAT"
    )

    with tempfile.TemporaryDirectory() as tempdir:
        work_folder = FolderHandler()
        work_folder.open_folder(os.path.join(tempdir, "test_outdir"))

        # Case 1: ambiguity positions only.
        forward_ref, backward_ref = processReferenceFasta(
            self.reference,
            work_folder,
            motifs=None,
            positions_file=self.ambiguity_positions_file)
        _, _, forward_seq = next(read_fasta(forward_ref))
        self.assertEqual(forward_seq, expected_forward_with_positions)
        _, _, backward_seq = next(read_fasta(backward_ref))
        self.assertEqual(backward_seq, expected_backward_with_positions)

        # Case 2: no motifs and no positions file.
        forward_ref, backward_ref = processReferenceFasta(
            self.reference,
            work_folder,
            motifs=None,
            positions_file=None)
        _, _, forward_seq = next(read_fasta(forward_ref))
        self.assertEqual(forward_seq, expected_forward_plain)
        self.assertIsNone(backward_ref)
        self.assertEqual(forward_ref, self.reference)

        # Case 3: motifs and a positions file given together must be rejected.
        with self.assertRaises(RuntimeError):
            processReferenceFasta(
                self.reference,
                work_folder,
                motifs="something",
                positions_file=self.ambiguity_positions_file)
def test_trim_num_files_in_sample(self):
    """Check that trim_num_files_in_sample keeps the selected reads within a base budget.

    The helper is called once with ``False`` and once with ``True`` as its
    third argument; the returned files are measured with ``get_1d_length`` and
    ``get_2d_length`` respectively, and the total must not exceed ``n_bases``.
    A budget of a single base is expected to raise ``AssertionError``.
    """
    with tempfile.TemporaryDirectory() as tempdir:
        working_folder = FolderHandler()
        working_folder.open_folder(os.path.join(tempdir, "test_dir"))
        sample_kwargs = create_sa_sample_args(
            fast5_dirs=[self.fast5_dir],
            name="some_name",
            fw_reference=self.ecoli_reference)
        sample = SignalAlignSample(working_folder=working_folder, **sample_kwargs)
        n_bases = 10000

        # Third argument False: total is measured with the 1D length helper.
        selected = trim_num_files_in_sample(sample, n_bases, False, verbose=False)
        total_bases = sum(get_1d_length(path) for path in selected)
        self.assertLessEqual(total_bases, n_bases)

        # Third argument True: total is measured with the 2D length helper.
        selected = trim_num_files_in_sample(sample, n_bases, True, verbose=False)
        total_bases = sum(get_2d_length(path) for path in selected)
        self.assertLessEqual(total_bases, n_bases)

        with self.assertRaises(AssertionError):
            trim_num_files_in_sample(sample, 1, False, verbose=False)
def test_CreateHdpTrainingData(self):
    """Write an HDP training file from two assignment files and compare it to the stored fixture.

    The output must contain 3182 lines and, ignoring line order, be identical
    to ``self.test_hdp_training_data``.
    """
    with tempfile.TemporaryDirectory() as tempdir:
        # Populate the temporary directory with a single fast5 read.
        fast5_copy = os.path.join(tempdir, "test.fast5")
        copyfile(self.fast5_paths[0], fast5_copy)
        # Destination of the HDP training table.
        hdp_out = os.path.join(tempdir, "test.hdp.tsv")

        sample_kwargs = create_sa_sample_args(
            fast5_dirs=[tempdir],
            name="some_name",
            fw_reference=self.ecoli_reference,
            bwa_reference=self.ecoli_reference,
            number_of_kmer_assignments=1,
            probability_threshold=0,
            kmers_from_reference=False)
        working_folder = FolderHandler()
        working_folder.open_folder(os.path.join(tempdir, "test_dir"))
        sample = SignalAlignSample(working_folder=working_folder, **sample_kwargs)
        # The same assignment file is deliberately listed twice.
        sample.analysis_files = [self.assignment_file, self.assignment_file]

        training_data = CreateHdpTrainingData(
            [sample], hdp_out, template=True, complement=False, verbose=False)
        out_path = training_data.write_hdp_training_file()

        self.assertEqual(count_lines_in_file(out_path), 3182)
        with open(out_path, 'r') as produced_fh, \
                open(self.test_hdp_training_data, 'r') as expected_fh:
            produced_lines = sorted(produced_fh.readlines())
            expected_lines = sorted(expected_fh.readlines())
            self.assertEqual(produced_lines, expected_lines)
def test_mea_alignment_close_to_guide(self):
    """Check that MEA-flagged events stay within ``threshold`` of the guide alignment.

    Every ``*.fast5`` file under ``tests/minion_test_reads/1D`` is copied into a
    temporary directory and passed to ``get_all_event_summaries``. For each
    read, the largest ``ABS_SA_ALIGNMENT_DIFF`` among event summaries whose
    ``MEA`` entry is truthy must not exceed ``threshold``.

    The temporary directory is managed by a ``with`` block so it is removed
    even when an assertion fails.
    """
    from signalalign.validateSignalAlignment import get_all_event_summaries, ABS_SA_ALIGNMENT_DIFF, MEA
    from signalalign.utils.fileHandlers import FolderHandler
    from signalalign.signalAlignment import create_signalAlignment_args
    import shutil
    import tempfile
    import glob

    ecoli_reference = os.path.join(MeaTest.HOME, "tests/test_sequences/E.coli_K12.fasta")
    fast5_dir = os.path.join(MeaTest.HOME, "tests/minion_test_reads/1D")
    template_hmm = os.path.join(MeaTest.HOME, "models/testModelR9_acgt_template.model")
    path_to_bin = os.path.join(MeaTest.HOME, 'bin')
    threshold = 11

    # make directory to put temporary files and output location
    with tempfile.TemporaryDirectory() as output_root:
        temp_root = FolderHandler()
        temp_fast5_dir = temp_root.open_folder(os.path.join(output_root, "temp_fast5"))
        temp_signal_align_dir = os.path.join(output_root, "temp_signalAlign")
        if os.path.isdir(temp_signal_align_dir):
            shutil.rmtree(temp_signal_align_dir)
        # assertFalse instead of a bare assert so the check survives ``python -O``.
        self.assertFalse(os.path.isdir(temp_signal_align_dir))
        temp_root.open_folder(temp_signal_align_dir)

        # get input files
        orig_fast5s = glob.glob(os.path.join(fast5_dir, "*.fast5"))
        self.assertTrue(len(orig_fast5s) > 0, "Incorrect fast5 location: {}".format(fast5_dir))
        fast5s = []
        for source_path in orig_fast5s:
            dest = os.path.join(temp_fast5_dir, os.path.basename(source_path))
            shutil.copy(source_path, dest)
            fast5s.append(dest)

        # get alignment args
        alignment_args = create_signalAlignment_args(bwa_reference=ecoli_reference,
                                                     in_templateHmm=template_hmm,
                                                     destination=temp_signal_align_dir,
                                                     forward_reference=ecoli_reference,
                                                     path_to_bin=path_to_bin,
                                                     constraint_trim=0,
                                                     traceBackDiagonals=100,
                                                     diagonal_expansion=0,
                                                     embed=True)

        # get summaries
        all_event_summaries = get_all_event_summaries(fast5s, alignment_args,
                                                      aln_dist_threshold=threshold,
                                                      generate_plot=False, verbose=False)

        for fast5, event_summaries in all_event_summaries.items():
            f5_name = os.path.basename(fast5)
            # max() raises ValueError when a read has no MEA-flagged summary,
            # which surfaces as a test error exactly as before.
            max_mea_aln_diff = max(summary[ABS_SA_ALIGNMENT_DIFF]
                                   for summary in event_summaries if summary[MEA])
            self.assertTrue(max_mea_aln_diff <= threshold,
                            "MEA produced alignment greater than {} positions from guide alignment for {}".format(
                                max_mea_aln_diff, f5_name))
def test_processReferenceFasta_positions(self):
    """Check motif substitution by processReferenceFasta.

    With ``motifs=[["AC", "EC"]]`` the first record of both returned references
    must contain no ``"AC"``; the first ``"EC"`` must sit at index 42 in the
    forward reference and index 5 in the backward reference.
    """
    with tempfile.TemporaryDirectory() as tempdir:
        work_folder = FolderHandler()
        work_folder.open_folder(os.path.join(tempdir, "test_outdir"))
        forward_ref, backward_ref = processReferenceFasta(
            self.reference,
            work_folder,
            motifs=[["AC", "EC"]],
            positions_file=None,
            name="")

        # (reference path, expected index of the first substituted motif)
        expectations = ((forward_ref, 42), (backward_ref, 5))
        for ref_path, first_ec_index in expectations:
            _, _, seq = next(read_fasta(ref_path))
            self.assertEqual(seq.find("AC"), -1)
            self.assertEqual(seq.find("EC"), first_ec_index)
def test_embed_with_both(self):
    """Run SignalAlignment on one pUC read with ``embed=True`` and ``output_format="both"``.

    Afterwards the fast5 copy is reopened and the embedded events are read
    back: the first MEA event must have ``raw_start`` 2879, the first element
    of the SAM value must be ``"0"``, and the working folder must hold exactly
    two entries.
    """
    fast5_name = "makeson_PC_20160807_FNFAD20242_MN17284_sequencing_run_MA_470_R9_pUC_g_PCR_BC_08_07_16_93165_ch1_read176_strand.fast5"
    signal_file_reads = os.path.join(self.HOME, "tests/minion_test_reads/pUC/")
    template_model = os.path.join(self.HOME, "models/testModelR9_5mer_acegt_template.model")
    complement_model = os.path.join(self.HOME, "models/testModelR9_5mer_acegt_complement.model")
    puc_reference = os.path.join(self.HOME, "tests/test_sequences/pUC19_SspI.fa")
    signal_file_guide_alignment = os.path.join(self.HOME, "tests/minion_test_reads/pUC/puc.bam")

    with tempfile.TemporaryDirectory() as tempdir:
        new_dir = os.path.join(tempdir, "new_dir")
        if os.path.exists(new_dir):
            shutil.rmtree(new_dir)
        working_folder = FolderHandler()
        working_folder.open_folder(os.path.join(tempdir, "test_dir"))
        # Work on a copy of the reads so embedding does not touch the fixtures.
        shutil.copytree(signal_file_reads, new_dir)
        read_path = os.path.join(new_dir, fast5_name)

        base_args = create_signalAlignment_args(
            alignment_file=signal_file_guide_alignment,
            bwa_reference=puc_reference,
            forward_reference=puc_reference,
            in_templateHmm=template_model,
            path_to_bin=self.path_to_bin,
            destination=working_folder.path,
            embed=True,
            output_format="both",
            filter_reads=0,
            twoD_chemistry=True,
            in_complementHmm=complement_model,
            delete_tmp=True)
        final_args = merge_dicts([base_args, dict(in_fast5=read_path)])

        SignalAlignment(**final_args).run()

        f5fh = Fast5(read_path)
        mea = f5fh.get_signalalign_events(mea=True)
        sam = f5fh.get_signalalign_events(sam=True)
        self.assertEqual(mea[0]["raw_start"], 2879)
        self.assertEqual(sam[0], "0")
        self.assertEqual(len(os.listdir(working_folder.path)), 2)
def test_multithread_signal_alignment_samples(self):
    """Align two samples that share one fast5 directory and check their analysis files.

    Both samples must come back with exactly ``num_files`` analysis files, each
    of which exists on disk. Adding a third sample that reuses the name
    ``"some_name"`` is expected to raise ``AssertionError``.
    """
    with tempfile.TemporaryDirectory() as tempdir:
        working_folder = FolderHandler()
        test_fast5 = os.path.join(
            tempdir,
            "miten_PC_20160820_FNFAD20259_MN17223_mux_scan_AMS_158_R9_WGA_Ecoli_08_20_16_83098_ch138_read23_strand.fast5")
        num_files = 1
        copyfile(self.fast5_paths[0], test_fast5)
        working_folder.open_folder(os.path.join(tempdir, "test_dir"))

        # create signalalign args
        signal_align_arguments = create_signalAlignment_args(
            in_templateHmm=self.template_hmm,
            destination=working_folder.path,
            path_to_bin=self.path_to_bin)

        # create samples: same options, two different names
        options = create_sa_sample_args(
            fast5_dirs=[tempdir],
            name="some_name",
            fw_reference=self.ecoli_reference,
            bwa_reference=self.ecoli_reference,
            readdb=self.fast5_readdb,
            alignment_file=self.fast5_bam)
        first_sample = SignalAlignSample(working_folder=working_folder, **options)
        options["name"] = "some_name2"
        second_sample = SignalAlignSample(working_folder=working_folder, **options)
        samples = [first_sample, second_sample]

        samples = multithread_signal_alignment_samples(
            samples, signal_align_arguments, 2)
        self.assertSetEqual({sample.name for sample in samples},
                            {'some_name', 'some_name2'})
        for sample in samples:
            if sample.name in ("some_name", "some_name2"):
                self.assertEqual(len(sample.analysis_files), num_files)
            for file_path in sample.analysis_files:
                self.assertTrue(os.path.isfile(file_path))

        # A duplicated sample name must be rejected.
        options["name"] = "some_name"
        samples.append(SignalAlignSample(working_folder=working_folder, **options))
        with self.assertRaises(AssertionError):
            multithread_signal_alignment_samples(samples, signal_align_arguments, 2)
def test_multithread_signal_alignment(self):
    """Run multithread_signal_alignment from an explicit file list, then from a read filter.

    Round 1 passes one fast5 path and expects one output file per input.
    Round 2 passes an empty file list plus a ``filter_reads`` generator and
    expects three output files.
    """
    with tempfile.TemporaryDirectory() as tempdir:
        # round 1: explicit list of fast5 files
        working_folder = FolderHandler()
        working_folder.open_folder(os.path.join(tempdir, "test_dir"))
        assert os.path.isfile(self.template_hmm)
        signal_align_arguments = create_signalAlignment_args(
            bwa_reference=self.ecoli_reference,
            in_templateHmm=self.template_hmm,
            destination=working_folder.path,
            forward_reference=self.ecoli_reference,
            path_to_bin=self.path_to_bin)

        fast5_files = self.fast5_paths[:1]
        with captured_output() as (out, err):
            output_files = multithread_signal_alignment(
                signal_align_arguments,
                fast5_files,
                2,
                forward_reference=self.ecoli_reference)
        self.assertEqual(len(output_files), len(fast5_files))

        # round 2: reads come from the filter generator, not a file list
        working_folder = FolderHandler()
        working_folder.open_folder(os.path.join(tempdir, "test_dir2"))
        assert os.path.isfile(self.template_hmm)
        signal_align_arguments = create_signalAlignment_args(
            bwa_reference=self.ecoli_reference,
            in_templateHmm=self.template_hmm,
            destination=working_folder.path,
            forward_reference=self.ecoli_reference,
            path_to_bin=self.path_to_bin)

        filtered_reads = filter_reads(self.bam, self.readdb, [self.test_dir])
        filter_reads_generator = filter_reads_to_string_wrapper(filtered_reads)
        output_files = multithread_signal_alignment(
            signal_align_arguments,
            [],
            2,
            forward_reference=self.ecoli_reference,
            filter_reads_to_string_wrapper=filter_reads_generator,
            debug=True)
        self.assertEqual(len(output_files), 3)
def test_variant_calling_with_multiple_paths_rna(self):
    """Align the RNA test reads against the fake substituted references with ``degenerate="m6a"``.

    The reads in ``self.test_dir_rna`` are copied to a scratch directory and
    aligned with ``multithread_signal_alignment``; the working folder must end
    up with exactly two entries.
    """
    forward_reference = os.path.join(
        self.HOME,
        "tests/test_sequences/fake_rna_replace/forward.fake_rna_atg.fake_rna_ref.fa")
    backward_reference = os.path.join(
        self.HOME,
        "tests/test_sequences/fake_rna_replace/backward.fake_rna_atg.fake_rna_ref.fa")
    template_model = os.path.join(
        self.HOME, "models/fake_testModelR9p4_5mer_acfgt_RNA.model")

    with tempfile.TemporaryDirectory() as tempdir:
        new_dir = os.path.join(tempdir, "new_dir")
        if os.path.exists(new_dir):
            shutil.rmtree(new_dir)
        working_folder = FolderHandler()
        working_folder.open_folder(os.path.join(tempdir, "test_dir"))
        shutil.copytree(self.test_dir_rna, new_dir)

        args = create_signalAlignment_args(
            alignment_file=self.rna_bam,
            bwa_reference=self.rna_reference,
            forward_reference=forward_reference,
            backward_reference=backward_reference,
            in_templateHmm=template_model,
            path_to_bin=self.path_to_bin,
            destination=working_folder.path,
            embed=False,
            output_format="full",
            filter_reads=0,
            twoD_chemistry=False,
            delete_tmp=True,
            degenerate="m6a",
            check_for_temp_file_existance=False)

        fast5_paths = list_dir(new_dir, ext="fast5")
        multithread_signal_alignment(
            args,
            fast5_paths,
            worker_count=8,
            forward_reference=None,
            debug=True,
            filter_reads_to_string_wrapper=None)

        self.assertEqual(len(os.listdir(working_folder.path)), 2)
def test_SignalAlignSample(self):
    """Construct a SignalAlignSample from duplicated fast5 directories and fofn files.

    The test contains no assertions; it fails only if the constructor raises.
    """
    with tempfile.TemporaryDirectory() as tempdir:
        # create fast5 dir
        fast5_copy = os.path.join(tempdir, "test.fast5")
        copyfile(self.fast5_paths[0], fast5_copy)

        # create fofn: one line holding the path of the copied read
        fofn_path = os.path.join(tempdir, "test.fofn")
        with open(fofn_path, 'w+') as fofn_file:
            fofn_file.write(fast5_copy + "\n")

        # Directory and fofn are each listed twice on purpose.
        sample_kwargs = create_sa_sample_args(
            fast5_dirs=[tempdir, tempdir],
            name="some_name",
            fofns=[fofn_path, fofn_path],
            fw_reference=self.ecoli_reference,
            bwa_reference=self.ecoli_reference)
        working_folder = FolderHandler()
        working_folder.open_folder(os.path.join(tempdir, "test_dir"))
        sample = SignalAlignSample(working_folder=working_folder, **sample_kwargs)
def estimate_params(fast5, binary_path="./estimateNanoporeParams",
                    template_lookup_table="../models/testModelR9p4_acegt_template.model",
                    complement_lookup_table="../models/testModelR9_complement_pop2.model",
                    twoD=False, verbose=False):
    """Run the parameter-estimation binary on one fast5 read and parse its output.

    An ``.npRead`` file and a ``.seq.fasta`` file are prepared for the read in a
    scratch folder named ``npParamEstimation``. The binary is then invoked with
    ``-T``, ``-C`` and ``-q`` options, and its output is parsed as alternating
    ``name value`` tokens.

    The command is passed to ``subprocess`` as an argument list, not a shell
    string, so paths containing spaces or shell metacharacters are forwarded
    verbatim.

    :param fast5: path to the fast5 file; the last six characters of its file
        name (``.fast5``) are dropped to form the read name
    :param binary_path: path to the estimation executable
    :param template_lookup_table: value passed with ``-T``
    :param complement_lookup_table: value passed with ``-C``
    :param twoD: when true, prepare the read with ``get_npRead_2dseq_and_models``;
        otherwise with ``prepareOneD``
    :param verbose: when true, echo the command to stderr before running it
    :return: ``False`` when read preparation reports failure, otherwise a dict
        mapping each decoded name token to its value converted to ``float``
    :raises subprocess.CalledProcessError: when the binary exits non-zero
    """
    temp_folder = FolderHandler()
    temp_folder.open_folder("npParamEstimation")
    read_name = fast5.split("/")[-1][:-6]  # get the name without the '.fast5'

    npRead_path = temp_folder.add_file_path(read_name + ".npRead")
    npRead_fasta = temp_folder.add_file_path(read_name + ".seq.fasta")

    if twoD:
        success, version, complement = get_npRead_2dseq_and_models(fast5=fast5,
                                                                   npRead_path=npRead_path,
                                                                   twod_read_path=npRead_fasta)
    else:
        success, version, complement = prepareOneD(fast5=fast5,
                                                   npRead_path=npRead_path,
                                                   oneD_read_path=npRead_fasta)

    if success is False:
        # NOTE(review): this early return leaves the scratch folder in place,
        # as the original did -- confirm whether cleanup is wanted here.
        return False

    # str() mirrors the coercion the original str.format call applied.
    command = [str(part) for part in (binary_path,
                                      "-T", template_lookup_table,
                                      "-C", complement_lookup_table,
                                      "-q", npRead_path)]

    if verbose:
        print("running command {command}".format(command=" ".join(command)), file=sys.stderr)

    result = subprocess.check_output(command, stderr=subprocess.STDOUT)
    tokens = result.split()
    # Even-indexed tokens are parameter names (bytes), odd-indexed their values.
    param_dict = {name.decode(): float(value)
                  for name, value in zip(tokens[::2], tokens[1::2])}

    # clean up temp folder
    temp_folder.remove_file(npRead_path)
    temp_folder.remove_file(npRead_fasta)
    temp_folder.remove_folder()
    return param_dict
def test_signal_file_and_alignment(self):
    """Align one event-less 1D E. coli read using a SAM guide alignment.

    After ``SignalAlignment.run()`` the working folder must hold exactly one
    entry, named ``9e4d14b1-8167-44ef-9fdb-5c29dd0763fd.sm.backward.tsv``.
    """
    fast5_name = "LomanLabz_PC_20161025_FNFAB42699_MN17633_sequencing_run_20161025_E_coli_native_450bps_82361_ch6_read347_strand.fast5"
    signal_file_reads = os.path.join(self.HOME, "tests/minion_test_reads/no_event_data_1D_ecoli")
    template_model = os.path.join(self.HOME, "models/testModelR9p4_5mer_acegt_template.model")
    ecoli_reference = os.path.join(self.HOME, "tests/test_sequences/E.coli_K12.fasta")
    signal_file_guide_alignment = os.path.join(self.HOME, "tests/minion_test_reads/oneD_alignments.sam")

    with tempfile.TemporaryDirectory() as tempdir:
        new_dir = os.path.join(tempdir, "new_dir")
        working_folder = FolderHandler()
        working_folder.open_folder(os.path.join(tempdir, "test_dir"))
        # Align a copy of the reads, leaving the fixtures untouched.
        shutil.copytree(signal_file_reads, new_dir)

        base_args = create_signalAlignment_args(
            alignment_file=signal_file_guide_alignment,
            bwa_reference=ecoli_reference,
            forward_reference=ecoli_reference,
            in_templateHmm=template_model,
            path_to_bin=self.path_to_bin,
            destination=working_folder.path)
        final_args = merge_dicts(
            [base_args, dict(in_fast5=os.path.join(new_dir, fast5_name))])

        SignalAlignment(**final_args).run()

        produced = sorted(os.listdir(working_folder.path))
        self.assertEqual(len(os.listdir(working_folder.path)), 1)
        self.assertEqual(produced[0], "9e4d14b1-8167-44ef-9fdb-5c29dd0763fd.sm.backward.tsv")
def main(args):
    """Estimate nanopore parameters for up to ``args.nb_files`` fast5 reads.

    The ``args`` parameter is immediately replaced by the result of
    ``parse_args()``. Fast5 files are listed from ``args.files_dir``; when
    there are more than ``args.nb_files`` a random subset of that size is used.
    For each file its name and the result of ``estimate_params(..., twoD=True)``
    are printed. An exception raised for one file is printed and does not stop
    the remaining files.

    Paths are built with ``os.path.join`` so that ``args.out`` and
    ``args.files_dir`` work with or without a trailing separator.

    :return: ``True`` once every selected file has been attempted
    """
    args = parse_args()

    # make directory to put temporary files
    temp_folder = FolderHandler()
    temp_folder.open_folder(os.path.join(args.out, "npParamEstimation"))

    fast5s = [x for x in os.listdir(args.files_dir) if x.endswith(".fast5")]

    if len(fast5s) > args.nb_files:
        shuffle(fast5s)
        fast5s = fast5s[:args.nb_files]

    for fast5 in fast5s:
        print(fast5)
        try:
            params = estimate_params(fast5=os.path.join(args.files_dir, fast5), twoD=True)
            print(params)
        except Exception as e:
            # Deliberately best-effort: report the failure and keep going.
            print(e)

    temp_folder.remove_folder()
    return True
def test_get_sample_kmers(self):
    """Check CreateHdpTrainingData.get_sample_kmers for three sample configurations.

    * ``kmers_from_reference=False``, no motifs: every 6-mer over ``"ATGC"``.
    * ``kmers_from_reference=False`` with the motif ``["ATGC", "ETGC"]``: the
      motif 6-mers united with every 6-mer over ``"ATGC"``.
    * ``kmers_from_reference=True`` with the same motif: the motif 6-mers united
      with the 6-mers (``rev_comp=True``) of every sequence in the E. coli
      reference.
    """
    with tempfile.TemporaryDirectory() as tempdir:
        # create fast5 dir
        fast5_copy = os.path.join(tempdir, "test.fast5")
        copyfile(self.fast5_paths[0], fast5_copy)
        # path handed to CreateHdpTrainingData as its output file
        hdp_out = os.path.join(tempdir, "test.hdp.tsv")

        def kmers_for_sample(**extra_sample_args):
            """Build a fresh sample with the shared settings and return its kmers."""
            sample_kwargs = create_sa_sample_args(
                fast5_dirs=[tempdir],
                name="some_name",
                fw_reference=self.ecoli_reference,
                bwa_reference=self.ecoli_reference,
                number_of_kmer_assignments=1,
                probability_threshold=0,
                **extra_sample_args)
            working_folder = FolderHandler()
            working_folder.open_folder(os.path.join(tempdir, "test_dir"))
            sample = SignalAlignSample(working_folder=working_folder, **sample_kwargs)
            sample.analysis_files = [self.assignment_file, self.assignment_file]
            hdp_data_handle = CreateHdpTrainingData(
                [sample], hdp_out, template=True, complement=False, verbose=False)
            return hdp_data_handle.get_sample_kmers(sample)

        # 1) no motifs, kmers not taken from the reference
        kmers = kmers_for_sample(kmers_from_reference=False)
        self.assertEqual(kmers, set(all_string_permutations("ATGC", length=6)))

        # 2) motif substitution, kmers not taken from the reference
        kmers = kmers_for_sample(kmers_from_reference=False,
                                 motifs=[["ATGC", "ETGC"]])
        motif_kmers = get_motif_kmers(["ATGC", "ETGC"], 6, alphabet="ATGC")
        self.assertEqual(
            kmers, motif_kmers | set(all_string_permutations("ATGC", length=6)))

        # 3) motif substitution, kmers taken from the reference
        kmers = kmers_for_sample(kmers_from_reference=True,
                                 motifs=[["ATGC", "ETGC"]])
        expected_kmers = set()
        for _, _, sequence in read_fasta(self.ecoli_reference):
            expected_kmers |= get_sequence_kmers(sequence, k=6, rev_comp=True)
        self.assertEqual(
            kmers,
            get_motif_kmers(["ATGC", "ETGC"], 6, alphabet="ATGC") | expected_kmers)
def main(args):
    """Iteratively error-correct a reference by aligning reads and calling variants.

    The ``args`` parameter is immediately replaced by the result of
    ``parse_args()``. For each of ``args.cycles`` cycles the function:

    1. writes a degenerate reference set for the current reference and builds a
       BWA index for it;
    2. queues one alignment job per selected fast5 file and runs them in
       ``args.nb_jobs`` worker processes (``aligner``);
    3. for every register in ``range(STEP)``, queues one variant-calling job per
       alignment file, runs them in worker processes (``run_methyl_caller``),
       updates the working sequence from the resulting call file, and deletes
       that register's alignment files;
    4. writes the updated sequence to ``iteration.<cycle>.fa`` in the temporary
       folder and uses that file as the reference for the next cycle.

    The per-cycle FASTA file is written inside a ``with`` block so it is flushed
    and closed before the next cycle reads it.

    Exits with status 1 when ``args.ref`` is not an existing file.
    """
    # parse args
    args = parse_args()

    command_line = " ".join(sys.argv[:])
    print("Command Line: {cmdLine}\n".format(cmdLine=command_line), file=sys.stderr)

    start_message = """ # Starting Empire Error-Correction # Aligning files from: {fileDir} # Aligning to reference: {reference} # Aligning maximum of {nbFiles} files # Using model: {model} # Using banding: {banding} # Aligning to regions in: {regions} # Non-default template HMM: {inThmm} # Non-default complement HMM: {inChmm} # Template HDP: {tHdp} # Complement HDP: {cHdp} """.format(fileDir=args.files_dir, reference=args.ref, nbFiles=args.nb_files,
               banding=args.banded, inThmm=args.in_T_Hmm, inChmm=args.in_C_Hmm,
               model=args.stateMachineType, regions=args.target_regions,
               tHdp=args.templateHDP, cHdp=args.complementHDP)

    print(start_message, file=sys.stdout)

    if not os.path.isfile(args.ref):
        print("Did not find valid reference file", file=sys.stderr)
        sys.exit(1)

    temp_folder = FolderHandler()
    temp_dir_path = temp_folder.open_folder(args.out + "tempFiles_errorCorrection")

    reference_sequence = args.ref

    for cycle in range(0, args.cycles):
        check, reference_sequence_length = write_degenerate_reference_set(
            input_fasta=reference_sequence, out_path=temp_dir_path, step=STEP)
        assert check, "Problem making degenerate reference sequence set"

        # index the reference for bwa
        print("signalAlign - indexing reference", file=sys.stderr)
        bwa_ref_index = get_bwa_index(reference_sequence, temp_dir_path)
        print("signalAlign - indexing reference, done", file=sys.stderr)

        # setup workers for multiprocessing
        workers = args.nb_jobs
        work_queue = Manager().Queue()
        done_queue = Manager().Queue()
        jobs = []

        # list of alignment files
        fast5s = [
            x for x in os.listdir(args.files_dir) if x.endswith(".fast5")
        ]

        # take only some
        if args.nb_files < len(fast5s):
            shuffle(fast5s)
            fast5s = fast5s[:args.nb_files]

        for fast5 in fast5s:
            alignment_args = {
                "forward_reference": None,
                "backward_reference": None,
                "path_to_EC_refs": temp_dir_path,
                "destination": temp_dir_path,
                "stateMachineType": args.stateMachineType,
                "bwa_index": bwa_ref_index,
                "in_templateHmm": args.in_T_Hmm,
                "in_complementHmm": args.in_C_Hmm,
                "in_templateHdp": args.templateHDP,
                "in_complementHdp": args.complementHDP,
                "banded": args.banded,
                "sparse_output": True,
                "in_fast5": args.files_dir + fast5,
                "threshold": args.threshold,
                "diagonal_expansion": args.diag_expansion,
                "constraint_trim": args.constraint_trim,
                "target_regions": None,
                "degenerate": degenerate_enum(args.degenerate),
            }
            #alignment = SignalAlignment(**alignment_args)
            #alignment.run()
            work_queue.put(alignment_args)

        # One 'STOP' sentinel is queued per worker so each process terminates.
        for w in range(workers):
            p = Process(target=aligner, args=(work_queue, done_queue))
            p.start()
            jobs.append(p)
            work_queue.put('STOP')

        for p in jobs:
            p.join()

        done_queue.put('STOP')

        print("\n#  signalAlign - finished alignments\n", file=sys.stderr)
        print("\n#  signalAlign - finished alignments\n", file=sys.stdout)

        # working sequence is a string, that has the reference we're going to update this cycle
        working_sequence = get_first_sequence(reference_sequence)

        # register is the relative position that is being N-ed:
        # ACGTAGACAATA --> NCGTAGNCAATA = register 0
        # ACGTAGACAATA --> ANGTAGANAATA = register 1 ...
        for register in range(0, STEP):
            print("# Starting Variant Calling, register: {}...".format(
                register), file=sys.stdout, end='\n')
            print("# Starting Variant Calling, register: {}...".format(
                register), file=sys.stderr, end='')

            # cull the alignment files for this register
            alns, forward_mask = get_alignments_labels_and_mask(
                path_to_alignments=temp_dir_path + "*.tsv.{}".format(register),
                max=args.nb_files,
                suffix=".{}".format(register))

            # this is the list of positions that we're going to look at, based on this register
            degenerate_positions = {
                'forward': list(range(register, reference_sequence_length, STEP)),
                'backward': list(range(register, reference_sequence_length, STEP))
            }

            # place to put the marginal probs
            variant_call_file = temp_folder.add_file_path(
                "variants.{cycle}.{reg}.calls".format(cycle=cycle, reg=register))

            # arguments for multiprocessing
            for aln, forward_bool in zip(alns, forward_mask):
                call_methyl_args = {
                    "sequence": None,
                    "alignment_file": aln,
                    "forward": forward_bool,
                    "out_file": variant_call_file,
                    "positions": degenerate_positions,
                    "degenerate_type": degenerate_enum(args.degenerate),
                }
                #c = CallMethylation(**call_methyl_args)
                #c.write()
                work_queue.put(call_methyl_args)

            # ``jobs`` still holds the already-joined processes from earlier
            # phases of this cycle; they are joined again below, as before.
            for w in range(workers):
                p = Process(target=run_methyl_caller, args=(work_queue, done_queue))
                p.start()
                jobs.append(p)
                work_queue.put('STOP')

            for p in jobs:
                p.join()

            done_queue.put('STOP')

            # this is where the per-register update happens
            working_sequence = update_reference(variant_call_file, working_sequence, register,
                                                min_depth=0, get_sites=False)

            # remove alignments for this register
            for f in glob.glob(temp_dir_path + "*.tsv.{}".format(register)):
                os.remove(f)
            print("done", file=sys.stdout, end="\n")
            print("done", file=sys.stderr, end="\n")

        # add a file for this cycle
        ref_path = temp_folder.add_file_path(
            "iteration.{cycle}.fa".format(cycle=cycle))
        # write it to a file; the handle is closed before the file is reused
        with open(ref_path, 'w') as ref_handle:
            write_fasta("iteration.{cycle}.fa".format(cycle=cycle), working_sequence, ref_handle)
        # update the path to the reference for the next cycle
        reference_sequence = ref_path

    return
def main(args):
    """Align a batch of .fast5 reads against a reference with signalAlign.

    The ``args`` parameter is overwritten by ``parse_args()`` straight away,
    so whatever the caller passes is ignored. Reads come either from a
    file-of-filenames (``args.fofn``) or from every ``*.fast5`` entry in
    ``args.files_dir``; at most ``args.nb_files`` of them (picked by shuffling)
    are aligned. With ``args.DEBUG`` set each alignment runs in this process,
    otherwise the jobs are queued for ``args.nb_jobs`` worker processes.

    Exits with status 1 when no read source is given or the reference file
    does not exist.
    """
    # options are always re-read; the incoming value is discarded
    args = parse_args()

    invocation = " ".join(sys.argv[:])
    print("Command Line: {cmdLine}\n".format(cmdLine=invocation), file=sys.stderr)

    # run every path-like option through resolvePath, in place
    path_options = (
        "files_dir",
        "ref",
        "out",
        "bwt",
        "in_T_Hmm",
        "in_C_Hmm",
        "templateHDP",
        "complementHDP",
        "fofn",
        "target_regions",
        "ambiguity_positions",
    )
    for option_name in path_options:
        setattr(args, option_name, resolvePath(getattr(args, option_name)))

    banner = """
# Starting Signal Align
# Aligning files from: {fileDir}
# Aligning to reference: {reference}
# Aligning maximum of {nbFiles} files
# Using model: {model}
# Using banding: True
# Aligning to regions in: {regions}
# Non-default template HMM: {inThmm}
# Non-default complement HMM: {inChmm}
# Template HDP: {tHdp}
# Complement HDP: {cHdp}
""".format(
        fileDir=args.files_dir,
        reference=args.ref,
        nbFiles=args.nb_files,
        inThmm=args.in_T_Hmm,
        inChmm=args.in_C_Hmm,
        model=args.stateMachineType,
        regions=args.target_regions,
        tHdp=args.templateHDP,
        cHdp=args.complementHDP,
    )
    print(banner, file=sys.stdout)

    # a read source and an existing reference are mandatory
    if args.files_dir is None and args.fofn is None:
        print("Need to provide directory with .fast5 files of fofn", file=sys.stderr)
        sys.exit(1)

    if not os.path.isfile(args.ref):
        print("Did not find valid reference file, looked for it {here}".format(here=args.ref),
              file=sys.stderr)
        sys.exit(1)

    # working directory for temporary files
    temp_folder = FolderHandler()
    temp_dir_path = temp_folder.open_folder(args.out + "/tempFiles_alignment")

    forward_reference, backward_reference = processReferenceFasta(
        fasta=args.ref,
        motif_key=args.motif_key,
        work_folder=temp_folder,
        sub_char=args.ambig_char,
        positions_file=args.ambiguity_positions,
    )

    # BWA index: build one unless the caller supplied it
    if args.bwt is None:
        print("signalAlign - indexing reference", file=sys.stderr)
        bwa_ref_index = getBwaIndex(args.ref, temp_dir_path)
        print("signalAlign - indexing reference, done", file=sys.stderr)
    else:
        print("[RunSignalAlign]NOTICE - using provided BWT %s" % args.bwt)
        bwa_ref_index = args.bwt

    # multiprocessing plumbing
    workers = args.nb_jobs
    work_queue = Manager().Queue()
    done_queue = Manager().Queue()
    jobs = []

    # collect the read files
    if args.fofn is None:
        fast5s = [args.files_dir + "/" + entry
                  for entry in os.listdir(args.files_dir)
                  if entry.endswith(".fast5")]
    else:
        fast5s = [entry for entry in parseFofn(args.fofn) if entry.endswith(".fast5")]

    # keep a random subset when there are more reads than requested
    nb_files = args.nb_files
    if nb_files < len(fast5s):
        shuffle(fast5s)
        fast5s = fast5s[:nb_files]

    # the remaining work runs from the signalAlign source directory
    os.chdir(signalAlignSourceDir())

    print("[runSignalAlign]:NOTICE: Got {} files to align".format(len(fast5s)), file=sys.stdout)
    for fast5 in fast5s:
        print(fast5)
        alignment_args = dict(
            destination=temp_dir_path,
            stateMachineType=args.stateMachineType,
            bwa_index=bwa_ref_index,
            in_templateHmm=args.in_T_Hmm,
            in_complementHmm=args.in_C_Hmm,
            in_templateHdp=args.templateHDP,
            in_complementHdp=args.complementHDP,
            output_format=args.outFmt,
            in_fast5=fast5,
            threshold=args.threshold,
            diagonal_expansion=args.diag_expansion,
            constraint_trim=args.constraint_trim,
            degenerate=getDegenerateEnum(args.degenerate),
            twoD_chemistry=args.twoD,
            target_regions=args.target_regions,
            embed=args.embed,
            event_table=args.event_table,
            backward_reference=backward_reference,
            forward_reference=forward_reference,
        )
        if not args.DEBUG:
            work_queue.put(alignment_args)
            continue
        # debug mode: align right here instead of handing off to a worker
        alignment = SignalAlignment(**alignment_args)
        alignment.run()

    # one STOP sentinel is queued per worker started
    for _ in range(workers):
        worker = Process(target=aligner, args=(work_queue, done_queue))
        worker.start()
        jobs.append(worker)
        work_queue.put('STOP')

    for worker in jobs:
        worker.join()

    done_queue.put('STOP')
    print("\n# signalAlign - finished alignments\n", file=sys.stderr)
    print("\n# signalAlign - finished alignments\n", file=sys.stdout)
def main(args):
    """Iteratively error-correct a reference (Zayante).

    The ``args`` parameter is overwritten by ``parse_args()`` and is therefore
    ignored. For 8 cycles, and within each cycle for every offset ``it`` in
    ``range(0, STEP)``, the function:

    1. writes forward/backward degenerate references for that offset,
    2. aligns up to ``args.nb_files`` .fast5 reads from ``args.files_dir``
       using ``args.nb_jobs`` worker processes,
    3. runs the variant caller over the resulting ``*.tsv`` alignments at the
       positions ``it, it + STEP, ...``,
    4. writes the updated reference to ``iteration.<cycle>.<it>.fa`` and uses
       that file as the reference for the next iteration,
    5. deletes the alignments of this iteration.

    ``STEP`` starts at 10 and is decremented after every cycle.

    Exits with status 1 when ``args.ref`` is not an existing file.
    """
    # options are always re-read; the incoming value is discarded
    args = parse_args()

    command_line = " ".join(sys.argv[:])
    print("Command Line: {cmdLine}\n".format(cmdLine=command_line), file=sys.stderr)

    start_message = """
# Starting Zayante Error-Correction
# Aligning files from: {fileDir}
# Aligning to reference: {reference}
# Aligning maximum of {nbFiles} files
# Using model: {model}
# Using banding: {banding}
# Aligning to regions in: {regions}
# Non-default template HMM: {inThmm}
# Non-default complement HMM: {inChmm}
# Template HDP: {tHdp}
# Complement HDP: {cHdp}
""".format(fileDir=args.files_dir, reference=args.ref, nbFiles=args.nb_files, banding=args.banded,
           inThmm=args.in_T_Hmm, inChmm=args.in_C_Hmm, model=args.stateMachineType,
           regions=args.target_regions, tHdp=args.templateHDP, cHdp=args.complementHDP)
    print(start_message, file=sys.stdout)

    if not os.path.isfile(args.ref):
        print("Did not find valid reference file", file=sys.stderr)
        sys.exit(1)

    temp_folder = FolderHandler()
    # os.path.join yields the same path as plain concatenation when args.out
    # ends with a separator, and a correct one when it does not
    temp_dir_path = temp_folder.open_folder(os.path.join(args.out, "tempFiles_errorCorrection"))

    reference_sequence = args.ref

    STEP = 10
    for cycle in range(0, 8):
        for it in range(0, STEP):
            # make paths for reference files
            forward_reference = temp_folder.add_file_path(
                "forward_reference.{cycle}.{iter}.txt".format(cycle=cycle, iter=it))
            backward_reference = temp_folder.add_file_path(
                "backward_reference.{cycle}.{iter}.txt".format(cycle=cycle, iter=it))

            # make N-ed reference sequence for this iteration
            deg, reference_sequence_length = make_degenerate_reference(
                reference_sequence, it, forward_reference, backward_reference, step=STEP)
            assert deg, "Problem making degenerate reference for cycle {cycle} iteration {iter}".format(
                cycle=cycle, iter=it)

            # index the reference for bwa
            # NOTE(review): this indexes args.ref on every iteration, not the
            # updated reference_sequence -- confirm that is intended
            print("signalAlign - indexing reference", file=sys.stderr)
            bwa_ref_index = get_bwa_index(args.ref, temp_dir_path)
            print("signalAlign - indexing reference, done", file=sys.stderr)

            # setup workers for multiprocessing
            workers = args.nb_jobs
            work_queue = Manager().Queue()
            done_queue = Manager().Queue()
            jobs = []

            # list of alignment files
            fast5s = [x for x in os.listdir(args.files_dir) if x.endswith(".fast5")]

            # take only some
            if args.nb_files < len(fast5s):
                shuffle(fast5s)
                fast5s = fast5s[:args.nb_files]

            for fast5 in fast5s:
                alignment_args = {
                    "forward_reference": forward_reference,
                    "backward_reference": backward_reference,
                    "path_to_EC_refs": None,
                    "destination": temp_dir_path,
                    "stateMachineType": args.stateMachineType,
                    "bwa_index": bwa_ref_index,
                    "in_templateHmm": args.in_T_Hmm,
                    "in_complementHmm": args.in_C_Hmm,
                    "in_templateHdp": args.templateHDP,
                    "in_complementHdp": args.complementHDP,
                    "banded": args.banded,
                    "sparse_output": True,
                    # joined rather than concatenated so a files_dir without a
                    # trailing separator still produces a valid path
                    "in_fast5": os.path.join(args.files_dir, fast5),
                    "threshold": args.threshold,
                    "diagonal_expansion": args.diag_expansion,
                    "constraint_trim": args.constraint_trim,
                    "target_regions": None,
                    "degenerate": degenerate_enum(args.degenerate),
                }
                work_queue.put(alignment_args)

            # one STOP sentinel is queued per aligner started
            for w in range(workers):
                p = Process(target=aligner, args=(work_queue, done_queue))
                p.start()
                jobs.append(p)
                work_queue.put('STOP')

            for p in jobs:
                p.join()

            done_queue.put('STOP')

            print("\n# signalAlign - finished alignments\n", file=sys.stderr)
            print("\n# signalAlign - finished alignments\n", file=sys.stdout)

            print("\n# Starting Variant Calling\n", file=sys.stdout)
            print("\n# Starting Variant Calling\n", file=sys.stderr)

            # cull the alignment files
            alns, forward_mask = get_alignments_labels_and_mask(temp_dir_path + "*.tsv", args.nb_files)

            # positions examined in this iteration: it, it + STEP, it + 2*STEP, ...
            degenerate_positions = {
                'forward': list(range(it, reference_sequence_length, STEP)),
                'backward': list(range(it, reference_sequence_length, STEP))
            }

            variant_call_file = temp_folder.add_file_path(
                "variants.{cycle}.{iter}.calls".format(cycle=cycle, iter=it))

            for aln, forward_bool in zip(alns, forward_mask):
                call_methyl_args = {
                    "sequence": None,
                    "alignment_file": aln,
                    "forward": forward_bool,
                    "out_file": variant_call_file,
                    "positions": degenerate_positions,
                    "degenerate_type": degenerate_enum(args.degenerate),
                }
                work_queue.put(call_methyl_args)

            # the same queues and job list are reused for the caller processes;
            # the already-finished aligners in `jobs` are simply joined again
            for w in range(workers):
                p = Process(target=run_methyl_caller, args=(work_queue, done_queue))
                p.start()
                jobs.append(p)
                work_queue.put('STOP')

            for p in jobs:
                p.join()

            done_queue.put('STOP')

            print("\n# Finished Variant Calling\n", file=sys.stdout)
            print("\n# Finished Variant Calling\n", file=sys.stderr)

            new_ref = update_reference(variant_call_file, reference_sequence, 0)

            ref_path = temp_folder.add_file_path(
                "iteration.{cycle}.{iter}.fa".format(cycle=cycle, iter=it))

            # close (and flush) the fasta before it is read as the next reference
            with open(ref_path, 'w') as ref_handle:
                write_fasta("iteration.{cycle}.{iter}.fa".format(cycle=cycle, iter=it),
                            new_ref, ref_handle)

            reference_sequence = ref_path

            # remove old alignments
            for f in glob.glob(temp_dir_path + "*.tsv"):
                os.remove(f)
        STEP -= 1
    return
class SignalAlignment(object):
    """Align one .fast5 read to a reference by running ``./signalMachine``.

    ``run`` writes the temporary inputs (nanopore read dump, nucleotide fasta,
    guide-alignment cigar), assembles the command line from the configured
    options and executes it with ``os.system``.
    """

    def __init__(self,
                 in_fast5,
                 reference_map,
                 destination,
                 stateMachineType,
                 bwa_index,
                 in_templateHmm,
                 in_complementHmm,
                 in_templateHdp,
                 in_complementHdp,
                 threshold,
                 diagonal_expansion,
                 constraint_trim,
                 degenerate,
                 twoD_chemistry,
                 target_regions=None,
                 output_format="full"):
        """Store the alignment configuration.

        HMM/HDP paths that are ``None`` or do not point at an existing file
        are stored as ``None``.
        """
        self.in_fast5 = in_fast5  # fast5 file to align
        self.reference_map = reference_map  # map with paths to reference sequences
        self.destination = destination  # place where the alignments go, should already exist
        self.stateMachineType = stateMachineType  # flag for signalMachine
        self.bwa_index = bwa_index  # index of reference sequence
        self.threshold = threshold  # min posterior probability to keep
        self.diagonal_expansion = diagonal_expansion  # alignment algorithm param
        self.constraint_trim = constraint_trim  # alignment algorithm param
        self.output_format = output_format  # smaller output files
        self.degenerate = degenerate  # set of nucleotides for degenerate characters
        self.twoD_chemistry = twoD_chemistry  # flag for 2D sequencing runs
        self.temp_folder = FolderHandler()  # object for holding temporary files (non-toil)
        self.read_name = self.in_fast5.split("/")[-1][:-6]  # get the name without the '.fast5'
        self.target_regions = target_regions
        # value passed to signalMachine's -s flag for each output format
        self.output_formats = {"full": 0, "variantCaller": 1, "assignments": 2}

        self.in_templateHmm = self._existing_file_or_none(in_templateHmm)
        self.in_complementHmm = self._existing_file_or_none(in_complementHmm)
        # similarly for HDPs
        self.in_templateHdp = self._existing_file_or_none(in_templateHdp)
        self.in_complementHdp = self._existing_file_or_none(in_complementHdp)

    @staticmethod
    def _existing_file_or_none(path):
        """Return ``path`` if it names an existing file, otherwise ``None``."""
        if (path is not None) and os.path.isfile(path):
            return path
        return None

    def run(self, get_expectations=False):
        """Prepare the inputs for this read and run signalMachine on it.

        :param get_expectations: when true, build the command that writes
            template/complement expectation files instead of posteriors;
            both HMM files must then be set.
        :return: ``False`` when the read, its guide alignment or the
            configuration is unusable, ``True`` once the command has been
            executed (its exit status is not inspected).
        """
        print("[SignalAlignment.run]INFO: Starting on {read}".format(read=self.in_fast5),
              file=sys.stderr)
        if get_expectations:
            assert self.in_templateHmm is not None and self.in_complementHmm is not None,\
                "Need HMM files for model training"

        # file checks
        if not os.path.isfile(self.in_fast5):
            print("[SignalAlignment.run]ERROR: Did not find .fast5 at{file}".format(file=self.in_fast5))
            return False

        self.openTempFolder("tempFiles_%s" % self.read_name)
        npRead_ = self.addTempFilePath("temp_%s.npRead" % self.read_name)
        npRead = NanoporeRead(fast_five_file=self.in_fast5, twoD=self.twoD_chemistry)
        with open(npRead_, "w") as fH:
            ok = npRead.Write(parent_job=None, out_file=fH, initialize=True)
        if not ok:
            self.failStop("[SignalAlignment.run]File: %s did not pass initial checks" % self.read_name,
                          npRead)
            return False

        read_label = npRead.read_label  # use this to identify the read throughout
        read_fasta_ = self.addTempFilePath("temp_seq_%s.fa" % read_label)
        temp_samfile_ = self.addTempFilePath("temp_sam_file_%s.sam" % read_label)
        cigar_file_ = self.addTempFilePath("temp_cigar_%s.txt" % read_label)

        if self.twoD_chemistry:
            ok, version, pop1_complement = self.prepare_twod(nanopore_read=npRead,
                                                             twod_read_path=read_fasta_)
        else:
            ok, version, _ = self.prepare_oned(nanopore_read=npRead, oned_read_path=read_fasta_)
            pop1_complement = None
        if not ok:
            # Without the nucleotide fasta there is nothing to hand to BWA and
            # no model version to pick defaults from. npRead is not passed on:
            # prepare_twod has already closed it on its failure path.
            self.failStop("[SignalAlignment.run]ERROR could not write nucleotide read for %s"
                          % self.read_name)
            return False

        # add an indicator for the model being used
        if self.stateMachineType == "threeState":
            model_label = ".sm"
            stateMachineType_flag = ""
        elif self.stateMachineType == "threeStateHdp":
            model_label = ".sm3Hdp"
            stateMachineType_flag = "--sm3Hdp "
            if self.twoD_chemistry:
                assert (self.in_templateHdp is not None) and (self.in_complementHdp is not None), \
                    "Need to provide HDPs"
            else:
                assert self.in_templateHdp is not None, "Need to provide Template HDP"
        else:  # make invalid stateMachine control?
            model_label = ".sm"
            stateMachineType_flag = ""

        guide_alignment = generateGuideAlignment(bwa_index=self.bwa_index,
                                                 query=read_fasta_,
                                                 temp_sam_path=temp_samfile_,
                                                 target_regions=self.target_regions)
        ok = guide_alignment.validate(self.reference_map.keys())
        if not ok:
            self.failStop("[SignalAlignment.run]ERROR getting guide alignment", npRead)
            return False

        with open(cigar_file_, "w") as cig_handle:
            cig_handle.write(guide_alignment.cigar + "\n")

        # next section makes the output file name with the format:
        # /directory/for/files/file.model.orientation.tsv
        # (stays empty when the strand is neither "+" nor "-")
        posteriors_file_path = ''
        if guide_alignment.strand in ("+", "-"):
            if self.output_format == "full":
                suffix = ".forward.tsv" if guide_alignment.strand == "+" else ".backward.tsv"
            elif self.output_format == "variantCaller":
                suffix = ".tsv"
            else:
                suffix = ".assignments"
            posteriors_file_path = self.destination + read_label + model_label + suffix

        # Alignment/Expectations routine
        path_to_signalAlign = "./signalMachine"

        # flags

        # input (match) models: fall back to the defaults for this read's version
        if self.in_templateHmm is None:
            self.in_templateHmm = defaultModelFromVersion(strand="template", version=version)
        if self.twoD_chemistry and self.in_complementHmm is None:
            self.in_complementHmm = defaultModelFromVersion(strand="complement", version=version,
                                                            pop1_complement=pop1_complement)

        assert self.in_templateHmm is not None
        if self.twoD_chemistry and self.in_complementHmm is None:
            self.failStop("[SignalAlignment.run]ERROR Need to have complement HMM for 2D analysis",
                          npRead)
            return False

        template_model_flag = "-T {} ".format(self.in_templateHmm)
        if self.twoD_chemistry:
            complement_model_flag = "-C {} ".format(self.in_complementHmm)
        else:
            complement_model_flag = ""

        print("[SignalALignment.run]NOTICE: template model {t} complement model {c}"
              "".format(t=self.in_templateHmm, c=self.in_complementHmm), file=sys.stderr)

        # reference sequences
        reference_entry = self.reference_map[guide_alignment.reference_name]
        assert reference_entry["forward"] is not None
        assert reference_entry["backward"] is not None
        forward_reference = reference_entry["forward"]
        backward_reference = reference_entry["backward"]
        assert os.path.isfile(forward_reference)
        assert os.path.isfile(backward_reference)
        forward_ref_flag = "-f {f_ref} ".format(f_ref=forward_reference)
        backward_ref_flag = "-b {b_ref} ".format(b_ref=backward_reference)

        # input HDPs
        if (self.in_templateHdp is not None) or (self.in_complementHdp is not None):
            hdp_flags = "-v {tHdp_loc} ".format(tHdp_loc=self.in_templateHdp)
            if self.twoD_chemistry and self.in_complementHdp is not None:
                hdp_flags += "-w {cHdp_loc} ".format(cHdp_loc=self.in_complementHdp)
        else:
            hdp_flags = ""

        # threshold
        if self.threshold is not None:
            threshold_flag = "-D {threshold} ".format(threshold=self.threshold)
        else:
            threshold_flag = ""

        # diagonal expansion
        if self.diagonal_expansion is not None:
            diag_expansion_flag = "-x {expansion} ".format(expansion=self.diagonal_expansion)
        else:
            diag_expansion_flag = ""

        # constraint trim
        if self.constraint_trim is not None:
            trim_flag = "-m {trim} ".format(trim=self.constraint_trim)
        else:
            trim_flag = ""

        # output format
        if self.output_format not in self.output_formats:
            self.failStop("[SignalAlignment.run]ERROR illegal output format selected %s"
                          % self.output_format)
            return False
        out_fmt = "-s {fmt} ".format(fmt=self.output_formats[self.output_format])

        # degenerate nucleotide information
        if self.degenerate is not None:
            degenerate_flag = "-o {} ".format(self.degenerate)
        else:
            degenerate_flag = ""

        if self.twoD_chemistry:
            twoD_flag = "--twoD"
        else:
            twoD_flag = ""

        # commands
        if get_expectations:
            template_expectations_file_path = self.destination + read_label + ".template.expectations"
            complement_expectations_file_path = self.destination + read_label + ".complement.expectations"

            command = \
                "{vA} {td} {degen}{sparse}{model}{f_ref}{b_ref} -q {npRead} " \
                "{t_model}{c_model}{thresh}{expansion}{trim} {hdp}-L {readLabel} -p {cigarFile} " \
                "-t {templateExpectations} -c {complementExpectations}"\
                .format(vA=path_to_signalAlign, model=stateMachineType_flag,
                        f_ref=forward_ref_flag, b_ref=backward_ref_flag, cigarFile=cigar_file_,
                        npRead=npRead_, readLabel=read_label, td=twoD_flag,
                        templateExpectations=template_expectations_file_path, hdp=hdp_flags,
                        complementExpectations=complement_expectations_file_path,
                        t_model=template_model_flag, c_model=complement_model_flag,
                        thresh=threshold_flag, expansion=diag_expansion_flag,
                        trim=trim_flag, degen=degenerate_flag, sparse=out_fmt)
        else:
            print("read_label", read_label)
            command = \
                "{vA} {td} {degen}{sparse}{model}{f_ref}{b_ref} -q {npRead} " \
                "{t_model}{c_model}{thresh}{expansion}{trim} -p {cigarFile} " \
                "-u {posteriors} {hdp}-L {readLabel}"\
                .format(vA=path_to_signalAlign, model=stateMachineType_flag, sparse=out_fmt,
                        f_ref=forward_ref_flag, b_ref=backward_ref_flag, cigarFile=cigar_file_,
                        readLabel=read_label, npRead=npRead_, td=twoD_flag,
                        t_model=template_model_flag, c_model=complement_model_flag,
                        posteriors=posteriors_file_path, thresh=threshold_flag,
                        expansion=diag_expansion_flag, trim=trim_flag, hdp=hdp_flags,
                        degen=degenerate_flag)

        # run
        # NOTE(review): the command is a shell string built from file paths;
        # paths containing spaces or shell metacharacters will break it
        print("signalAlign - running command: ", command, end="\n", file=sys.stderr)
        os.system(command)
        self.temp_folder.remove_folder()
        return True

    def prepare_oned(self, nanopore_read, oned_read_path):
        """Write the template read as fasta and close the nanopore read.

        :return: ``(True, version, False)`` on success and
            ``(False, None, False)`` if anything raised along the way.
        """
        try:
            # the context manager closes the fasta even when fastaWrite raises
            with open(oned_read_path, "w") as read_file:
                fastaWrite(fileHandleOrFile=read_file,
                           name=nanopore_read.read_label,
                           seq=nanopore_read.template_read)
            version = nanopore_read.version
            nanopore_read.close()
            return True, version, False
        except Exception:
            # best effort: the caller decides what to do with the failure
            return False, None, False

    def prepare_twod(self, nanopore_read, twod_read_path):
        """Write the 2D alignment-table sequence as fasta and close the read.

        :return: ``(False, None, False)`` when the read has no 2D alignment
            table (the read is closed first), otherwise
            ``(True, version, pop1_complement)`` where ``pop1_complement``
            tells whether the complement model id is
            ``complement_median68pA_pop1.model``.
        """
        # check for table to make 'assembled' 2D alignment table fasta with
        if nanopore_read.has2D_alignment_table is False:
            nanopore_read.close()
            return False, None, False

        # the context manager closes the fasta even when fastaWrite raises
        with open(twod_read_path, "w") as fasta_handle:
            fastaWrite(fileHandleOrFile=fasta_handle,
                       name=nanopore_read.read_label,
                       seq=nanopore_read.alignment_table_sequence)

        pop1_complement = nanopore_read.complement_model_id == "complement_median68pA_pop1.model"
        version = nanopore_read.version
        nanopore_read.close()
        return True, version, pop1_complement

    def openTempFolder(self, temp_dir):
        """Open the temporary folder ``<destination><temp_dir>``."""
        self.temp_folder.open_folder("%s%s" % (self.destination, temp_dir))

    def addTempFilePath(self, path_to_add):
        """Register ``path_to_add`` with the temp folder and return its path."""
        return self.temp_folder.add_file_path(path_to_add)

    def failStop(self, message, nanopore_read=None):
        """Clean up after a failure: drop the temp folder, close the read if given, log ``message``."""
        self.temp_folder.remove_folder()
        if nanopore_read is not None:
            nanopore_read.close()
        print(message, file=sys.stderr)
def main(args):
    """Command-line driver for the single-nucleotide-probability scan.

    Two modes, selected by ``args.validation_file``:

    * set: validate that file (or directory) against ``args.ref`` and return;
    * unset: glob the .fast5 inputs, keep at most ``args.nb_files`` of them
      (picked by shuffling), run
      ``discover_single_nucleotide_probabilities`` and validate the directory
      holding its first output file.

    :param args: argument list handed to ``parse_args``
    :return: 0; the process exits with status 1 instead when the reference
        file is missing
    """
    args = parse_args(args)

    invocation = " ".join(sys.argv[:])
    print("[singleNucleotideProbabilities] Command Line: {cmdLine}\n".format(cmdLine=invocation),
          file=sys.stderr)

    # validation-only mode: check the given location and stop
    validation_target = args.validation_file
    if validation_target is not None:
        if os.path.isfile(validation_target):
            validate_snp_file(validation_target, args.ref,
                              print_sequences=True, print_summary=True)
        elif os.path.isdir(validation_target):
            validate_snp_directory(validation_target, args.ref,
                                   print_summary=False, move_files=False, make_plots=True,
                                   alignment_file_location=args.alignment_file)
        else:
            print("[error] got invalid validation location: {}".format(validation_target))
        return 0

    # run every path-like option through resolvePath, in place
    for option_name in ("files_dir", "fast5_glob", "ref", "out", "in_T_Hmm", "in_C_Hmm",
                        "templateHDP", "complementHDP", "target_regions", "alignment_file"):
        setattr(args, option_name, resolvePath(getattr(args, option_name)))

    # these two must be integers
    args.step_size = int(args.step_size)
    args.kmer_size = int(args.kmer_size)

    # an explicit glob wins; otherwise take every .fast5 in files_dir
    if args.fast5_glob is not None:
        input_glob = args.fast5_glob
    else:
        input_glob = os.path.join(args.files_dir, "*.fast5")

    banner = """
# Single Nucleotide Probabilities
#
# Aligning files matching: {inputGlob}
# Aligning to reference: {reference}
# Aligning maximum of {nbFiles} files
# Using model: {model}
# Using banding: {banding}
# Aligning to regions in: {regions}
# Non-default template HMM: {inThmm}
# Non-default complement HMM: {inChmm}
# Template HDP: {tHdp}
# Complement HDP: {cHdp}
# Kmer size: {kmerSize}
# Step size: {stepSize}
# Alignment File: {alignmentFile}
""".format(
        inputGlob=input_glob,
        reference=args.ref,
        banding=args.banded,
        nbFiles=args.nb_files,
        inThmm=args.in_T_Hmm,
        inChmm=args.in_C_Hmm,
        model=args.stateMachineType,
        regions=args.target_regions,
        tHdp=args.templateHDP,
        cHdp=args.complementHDP,
        kmerSize=args.kmer_size,
        stepSize=args.step_size,
        alignmentFile=args.alignment_file,
    )
    print(banner, file=sys.stdout)

    # make sure the output directory is there
    if not os.path.isdir(args.out):
        os.mkdir(args.out)

    # find the reads and cap their number
    fast5s = glob.glob(input_glob)
    too_many = args.nb_files is not None and args.nb_files < len(fast5s)
    if too_many:
        print("[singleNucleotideProbabilities] pruning {} fast5 files down to configured max {}"
              .format(len(fast5s), args.nb_files))
        shuffle(fast5s)
        fast5s = fast5s[:args.nb_files]

    # the (input) reference must exist
    if not os.path.isfile(args.ref):
        print("[singleNucleotideProbabilities] Did not find valid reference file", file=sys.stderr)
        sys.exit(1)

    # working folder inside the output directory
    temp_folder = FolderHandler()
    temp_dir_path = temp_folder.open_folder(os.path.join(args.out, "tempFiles_errorCorrection"))

    # HMM/HDP model parameters; these stay fixed for the whole run
    alignment_args = dict(
        destination=temp_dir_path,
        stateMachineType=args.stateMachineType,
        bwa_reference=args.ref,
        in_templateHmm=args.in_T_Hmm,
        in_complementHmm=args.in_C_Hmm,
        in_templateHdp=args.templateHDP,
        in_complementHdp=args.complementHDP,
        threshold=args.threshold,
        diagonal_expansion=args.diag_expansion,
        constraint_trim=args.constraint_trim,
        target_regions=None,
        degenerate=getDegenerateEnum("variant"),
        alignment_file=args.alignment_file,
        track_memory_usage=False,
        get_expectations=False,
    )

    # get the sites that have proposed edits
    print("\n\n[singleNucleotideProbabilities] scanning for proposals with %d fast5s" % len(fast5s))
    output_files = discover_single_nucleotide_probabilities(
        args, temp_folder, args.kmer_size, args.ref, fast5s, alignment_args,
        args.nb_jobs, args.step_size, output_directory=args.out)

    print("\n[singleNucleotideProbabilities] got {} output files:".format(len(output_files)))
    # list the first eleven outputs, then elide the rest
    for shown, output_file in enumerate(output_files, 1):
        print("\t{}".format(output_file))
        if shown > 10:
            print("\t...")
            break

    # validate whatever was produced
    if len(output_files) > 0:
        validate_snp_directory(os.path.dirname(output_files[0]), args.ref, print_summary=True)

    print("\n\n[singleNucleotideProbabilities] fin\n")
    return 0
class SignalAlignment(object): def __init__( self, in_fast5, destination, stateMachineType, in_templateHmm, in_complementHmm, in_templateHdp, in_complementHdp, threshold, diagonal_expansion, constraint_trim, degenerate, forward_reference, backward_reference=None, # one of these needs to be set alignment_file=None, bwa_reference=None, # reasonable defaults twoD_chemistry=False, target_regions=None, output_format="full", embed=False, event_table=False, check_for_temp_file_existance=True, track_memory_usage=False, get_expectations=False, path_to_bin=''): self.in_fast5 = in_fast5 # fast5 file to align self.destination = destination # place where the alignments go, should already exist self.stateMachineType = stateMachineType # flag for signalMachine self.bwa_reference = bwa_reference # path to reference sequence to generate guide alignment self.threshold = threshold # min posterior probability to keep self.diagonal_expansion = diagonal_expansion # alignment algorithm param self.constraint_trim = constraint_trim # alignment algorithm param self.output_format = output_format # smaller output files self.degenerate = degenerate # set of nucleotides for degenerate characters self.twoD_chemistry = twoD_chemistry # flag for 2D sequencing runs self.temp_folder = FolderHandler( ) # object for holding temporary files (non-toil) self.read_name = self.in_fast5.split( "/")[-1][:-6] # get the name without the '.fast5' self.target_regions = target_regions self.output_formats = {"full": 0, "variantCaller": 1, "assignments": 2} self.embed = embed # embed the output into the fast5 file self.event_table = event_table # specify which event table to use to generate alignments self.backward_reference = backward_reference # fasta path to backward reference if modified bases are used self.forward_reference = forward_reference # fasta path to forward reference self.alignment_file = alignment_file # guide aligments will be gotten from here if set self.check_for_temp_file_existance = 
check_for_temp_file_existance # don't recreate if files exist self.track_memory_usage = track_memory_usage # has the 'time' program append mem usage stats to output self.max_memory_usage_kb = None self.read_label = None self.get_expectations = get_expectations # option to gather expectations of transitions and emissions self.path_to_signalMachine = os.path.join( path_to_bin, "signalMachine") # path to signalMachine assert os.path.exists( self.path_to_signalMachine), "Path to signalMachine does not exist" assert self.bwa_reference is not None or self.alignment_file is not None, \ "either 'bwa_reference' or 'alignment_file' argument is needed to generate cigar strings" if (in_templateHmm is not None) and os.path.isfile(in_templateHmm): self.in_templateHmm = in_templateHmm else: self.in_templateHmm = None if (in_complementHmm is not None) and os.path.isfile(in_complementHmm): self.in_complementHmm = in_complementHmm else: self.in_complementHmm = None # similarly for HDPs if (in_templateHdp is not None) and os.path.isfile(in_templateHdp): self.in_templateHdp = in_templateHdp else: self.in_templateHdp = None if (in_complementHdp is not None) and os.path.isfile(in_complementHdp): self.in_complementHdp = in_complementHdp else: self.in_complementHdp = None assert os.path.exists(self.destination), \ "Destination path does not exist: {}".format(self.destination) def run(self): print("[SignalAlignment.run] INFO: Starting on {read}".format( read=self.in_fast5)) if self.get_expectations: assert self.in_templateHmm is not None, "Need template HMM files for model training" if self.twoD_chemistry: assert self.in_complementHmm is not None, "Need compement HMM files for model training" if not os.path.isfile(self.in_fast5): print("[SignalAlignment.run] ERROR: Did not find .fast5 at{file}". 
format(file=self.in_fast5)) return False # prep self.openTempFolder("tempFiles_%s" % self.read_name) if self.twoD_chemistry: npRead = NanoporeRead2D(fast_five_file=self.in_fast5, event_table=self.event_table, initialize=True) else: npRead = NanoporeRead(fast_five_file=self.in_fast5, event_table=self.event_table, initialize=True) #todo need to validate / generate events and nucleotide read # read label read_label = npRead.read_label # use this to identify the read throughout self.read_label = read_label # nanopore read (event table, etc) npRead_ = self.addTempFilePath("temp_%s.npRead" % self.read_name) if not (self.check_for_temp_file_existance and os.path.isfile(npRead_)): # TODO is this totally f****d for RNA because of 3'-5' mapping? fH = open(npRead_, "w") ok = npRead.Write(out_file=fH, initialize=True) fH.close() if not ok: self.failStop( "[SignalAlignment.run] File: %s did not pass initial checks" % self.read_name, npRead) return False # nucleotide read read_fasta_ = self.addTempFilePath("temp_seq_%s.fa" % read_label) ok = self.write_nucleotide_read(npRead, read_fasta_) if not ok: print( "[SignalAlignment.run] Failed to write nucleotide read. Continuing execution." 
) # alignment info cigar_file_ = self.addTempFilePath("temp_cigar_%s.txt" % read_label) temp_samfile_ = self.addTempFilePath("temp_sam_file_%s.sam" % read_label) strand = None reference_name = None if not (self.check_for_temp_file_existance and os.path.isfile(cigar_file_)): # need guide alignment to generate cigar file guide_alignment = None # get from alignment file if self.alignment_file is not None: guide_alignment = getGuideAlignmentFromAlignmentFile( self.alignment_file, read_name=read_label) if guide_alignment is None: print( "[SignalAlignment.run] read {} not found in {}".format( read_label, self.alignment_file)) # get from bwa if guide_alignment is None and self.bwa_reference is not None: guide_alignment = generateGuideAlignment( reference_fasta=self.bwa_reference, query=read_fasta_, temp_sam_path=temp_samfile_, target_regions=self.target_regions) if guide_alignment is None: print( "[SignalAlignment.run] read {} could not be aligned with BWA" .format(read_label)) # could not map if guide_alignment is None: self.failStop( "[SignalAlignment.run] ERROR getting guide alignment", npRead) return False # ensure valid if not guide_alignment.validate(): self.failStop( "[SignalAlignment.run] ERROR invalid guide alignment", npRead) return False strand = guide_alignment.strand reference_name = guide_alignment.reference_name # write cigar to file cig_handle = open(cigar_file_, "w") cig_handle.write(guide_alignment.cigar + "\n") cig_handle.close() # otherwise, get strand from file else: strand, reference_name = getInfoFromCigarFile(cigar_file_) # add an indicator for the model being used if self.stateMachineType == "threeState": model_label = ".sm" stateMachineType_flag = "" elif self.stateMachineType == "threeStateHdp": model_label = ".sm3Hdp" stateMachineType_flag = "--sm3Hdp " if self.twoD_chemistry: assert (self.in_templateHdp is not None) and (self.in_complementHdp is not None), "Need to provide HDPs" else: assert self.in_templateHdp is not None, "Need to provide 
Template HDP" else: # make invalid stateMachine control? model_label = ".sm" stateMachineType_flag = "" # next section makes the output file name with the format: /directory/for/files/file.model.orientation.tsv # forward strand if strand == "+": if self.output_format == "full": posteriors_file_path = os.path.join( self.destination, read_label + model_label + ".forward.tsv") elif self.output_format == "variantCaller": posteriors_file_path = os.path.join( self.destination, read_label + model_label + ".tsv") else: posteriors_file_path = os.path.join( self.destination, read_label + model_label + ".assignments.tsv") # backward strand elif strand == "-": if self.output_format == "full": posteriors_file_path = os.path.join( self.destination, read_label + model_label + ".backward.tsv") elif self.output_format == "variantCaller": posteriors_file_path = os.path.join( self.destination, read_label + model_label + ".tsv") else: posteriors_file_path = os.path.join( self.destination, read_label + model_label + ".assignments.tsv") # sanity check else: self.failStop( "[SignalAlignment.run] ERROR Unexpected strand {}".format( strand), npRead) return False # flags # input (match) models if self.in_templateHmm is None: self.in_templateHmm = defaultModelFromVersion( strand="template", version=npRead.version) if self.twoD_chemistry and self.in_complementHmm is None: pop1_complement = npRead.complement_model_id == "complement_median68pA_pop1.model" self.in_complementHmm = defaultModelFromVersion( strand="complement", version=npRead.version, pop1_complement=pop1_complement) assert self.in_templateHmm is not None if self.twoD_chemistry: if self.in_complementHmm is None: self.failStop( "[SignalAlignment.run] ERROR Need to have complement HMM for 2D analysis", npRead) return False template_model_flag = "-T {} ".format(self.in_templateHmm) if self.twoD_chemistry: complement_model_flag = "-C {} ".format(self.in_complementHmm) else: complement_model_flag = "" print( "[SignalAlignment.run] 
NOTICE: template model {t} complement model {c}" "".format(t=self.in_templateHmm, c=self.in_complementHmm)) # reference sequences assert os.path.isfile(self.forward_reference) forward_ref_flag = "-f {f_ref} ".format(f_ref=self.forward_reference) if self.backward_reference: assert os.path.isfile(self.backward_reference) backward_ref_flag = "-b {b_ref} ".format( b_ref=self.backward_reference) else: backward_ref_flag = "" # input HDPs if (self.in_templateHdp is not None) or (self.in_complementHdp is not None): hdp_flags = "-v {tHdp_loc} ".format(tHdp_loc=self.in_templateHdp) if self.twoD_chemistry and self.in_complementHdp is not None: hdp_flags += "-w {cHdp_loc} ".format( cHdp_loc=self.in_complementHdp) else: hdp_flags = "" # threshold if self.threshold is not None: threshold_flag = "-D {threshold} ".format(threshold=self.threshold) else: threshold_flag = "" # diagonal expansion if self.diagonal_expansion is not None: diag_expansion_flag = "-x {expansion} ".format( expansion=self.diagonal_expansion) else: diag_expansion_flag = "" # constraint trim if self.constraint_trim is not None: trim_flag = "-m {trim} ".format(trim=self.constraint_trim) else: trim_flag = "" # output format if self.output_format not in list(self.output_formats.keys()): self.failStop( "[SignalAlignment.run] ERROR illegal output format selected %s" % self.output_format) return False out_fmt = "-s {fmt} ".format( fmt=self.output_formats[self.output_format]) # degenerate nucleotide information if self.degenerate is not None: degenerate_flag = "-o {} ".format(self.degenerate) else: degenerate_flag = "" # twoD flag if self.twoD_chemistry: twoD_flag = "--twoD" else: twoD_flag = "" # commands if self.get_expectations: template_expectations_file_path = os.path.join( self.destination, read_label + ".template.expectations.tsv") complement_expectations_file_path = os.path.join( self.destination, read_label + ".complement.expectations.tsv") command = \ "{vA} {td} {degen}{sparse}{model} -q {npRead} " \ 
"{t_model}{c_model}{thresh}{expansion}{trim} {hdp}-L {readLabel} -p {cigarFile} " \ "-t {templateExpectations} -c {complementExpectations} -n {seq_name} {f_ref_fa} {b_ref_fa}" \ .format(vA=self.path_to_signalMachine, model=stateMachineType_flag, cigarFile=cigar_file_, npRead=npRead_, readLabel=read_label, td=twoD_flag, templateExpectations=template_expectations_file_path, hdp=hdp_flags, complementExpectations=complement_expectations_file_path, t_model=template_model_flag, c_model=complement_model_flag, thresh=threshold_flag, expansion=diag_expansion_flag, trim=trim_flag, degen=degenerate_flag, sparse=out_fmt, seq_name=reference_name, f_ref_fa=forward_ref_flag, b_ref_fa=backward_ref_flag) else: command = \ "{vA} {td} {degen}{sparse}{model} -q {npRead} " \ "{t_model}{c_model}{thresh}{expansion}{trim} -p {cigarFile} " \ "-u {posteriors} {hdp}-L {readLabel} -n {seq_name} {f_ref_fa} {b_ref_fa}" \ .format(vA=self.path_to_signalMachine, model=stateMachineType_flag, sparse=out_fmt, cigarFile=cigar_file_, readLabel=read_label, npRead=npRead_, td=twoD_flag, t_model=template_model_flag, c_model=complement_model_flag, posteriors=posteriors_file_path, thresh=threshold_flag, expansion=diag_expansion_flag, trim=trim_flag, hdp=hdp_flags, degen=degenerate_flag, seq_name=reference_name, f_ref_fa=forward_ref_flag, b_ref_fa=backward_ref_flag) # run print("[SignalAlignment.run] running command: ", command, end="\n") try: command = command.split() if self.track_memory_usage: mem_command = ['/usr/bin/time', '-f', '\\nDEBUG_MAX_MEM:%M\\n'] print( "[SignalAlignment.run] Prepending command to track mem usage: {}" .format(mem_command)) mem_command.extend(command) command = mem_command output = subprocess.check_output(command, stderr=subprocess.STDOUT) output = str(output).split("\\n") for line in output: print("[SignalAlignment.run] {}: {}".format( read_label, line)) if line.startswith("DEBUG_MAX_MEM"): self.max_memory_usage_kb = int(line.split(":")[1]) except Exception as e: print( 
"[SignalAlignment.run] exception ({}) running signalAlign: {}". format(type(e), e)) raise e # save to fast5 file (if appropriate) if self.embed: print("[SignalAlignment.run] embedding into Fast5 ") data = self.read_in_signal_align_tsv(posteriors_file_path, file_type=self.output_format) npRead = NanoporeRead(fast_five_file=self.in_fast5, twoD=self.twoD_chemistry, event_table=self.event_table) npRead.Initialize(None) signal_align_path = npRead.get_latest_basecall_edition( "/Analyses/SignalAlign_00{}", new=False) assert signal_align_path, "There is no path in Fast5 file: {}".format( "/Analyses/SignalAlign_00{}") output_path = npRead._join_path(signal_align_path, self.output_format) npRead.write_data(data, output_path) # Todo add attributes to signalalign output if self.output_format == "full": print( "[SignalAlignment.run] writing maximum expected alignment " ) alignment = mea_alignment_from_signal_align(None, events=data) mae_path = npRead._join_path(signal_align_path, "MEA_alignment_labels") events = npRead.get_template_events() if events: if strand == "-": minus = True else: minus = False labels = match_events_with_signalalign( sa_events=alignment, event_detections=np.asanyarray(npRead.template_events), minus=minus, rna=npRead.is_read_rna()) npRead.write_data(labels, mae_path) sam_string = str() if os.path.isfile(temp_samfile_): with open(temp_samfile_, 'r') as test: for line in test: sam_string += line sam_path = npRead._join_path(signal_align_path, "sam") # print(sam_string) npRead.write_data(data=sam_string, location=sam_path, compression=None) # self.temp_folder.remove_folder() return True def write_nucleotide_read(self, nanopore_read, file_path): try: with open(file_path, "w") as read_file: # get appropriate read if self.twoD_chemistry: # check for table to make 'assembled' 2D alignment table fasta with if not nanopore_read.has2D_alignment_table: nanopore_read.close() return False nucleotide_read = nanopore_read.alignment_table_sequence else: nucleotide_read = 
nanopore_read.template_read # write read fastaWrite(fileHandleOrFile=read_file, name=nanopore_read.read_label, seq=nucleotide_read) return True except Exception as e: print('[SignalAlignment.write_nucleotide_read] {} exception: {}'. format(type(e), str(e)), file=sys.stderr) return False def openTempFolder(self, temp_dir): self.temp_folder.open_folder(os.path.join(self.destination, temp_dir)) def addTempFilePath(self, path_to_add): return self.temp_folder.add_file_path(path_to_add) def failStop(self, message, nanopore_read=None): self.temp_folder.remove_folder() if nanopore_read is not None: nanopore_read.close() print(message) def read_in_signal_align_tsv(self, tsv_path, file_type): """Read in tsv file""" assert file_type in ("full", "assignments", "variantCaller") with open(tsv_path, 'r') as tsvin: if file_type == "full": dtype = [('contig', 'S10'), ('reference_index', int), ('reference_kmer', 'S5'), ('read_file', 'S57'), ('strand', 'S1'), ('event_index', int), ('event_mean', float), ('event_noise', float), ('event_duration', float), ('aligned_kmer', 'S5'), ('scaled_mean_current', float), ('scaled_noise', float), ('posterior_probability', float), ('descaled_event_mean', float), ('ont_model_mean', float), ('path_kmer', 'S5')] elif file_type == "assignments": dtype = [('k-mer', 'S10'), ('read_file', 'S57'), ('descaled_event_mean', float), ('posterior_probability', float)] else: dtype = [('event_index', int), ('reference_position', int), ('base', 'S6'), ('posterior_probability', float), ('strand', 'S1'), ('forward_mapped', int), ('read_file', 'S57')] event_table = np.loadtxt(tsvin, dtype=dtype) def remove_field_name(a, name): names = list(a.dtype.names) if name in names: names.remove(name) b = a[names] return b event_table = remove_field_name(event_table, "read_file") return event_table
class TrainSignalAlign(object):
    """Train the transitions and/or HDP emissions of a signalAlign HMM.

    The class is driven by a single parsed configuration object (``args``)
    whose attributes are read with dot access (``args.output_dir``,
    ``args.hdp_args.hdp_type``, ``args.training.transitions``, ...).
    """
    # global hdp types for specific alphabets and 1D read options
    HDP_TYPES_ACEGOT = [
        ("singleLevelFixed", 0),
        ("singleLevelPrior", 1),
        ("multisetFixed", 2),
        ("multisetPrior", 3),
        ("compFixed", 4),
        ("compPrior", 5),
        ("middleNtsFixed", 6),
        ("middleNtsPrior", 7),
        ("groupMultisetFixed", 8),
        ("groupMultisetPrior", 9),
    ]
    HDP_TYPES_1D = [("singleLevelPrior2", 10), ("multisetPrior2", 11),
                    ("singleLevelFixedCanonical", 14)]
    HDP_TYPES_ACEGT = [
        ("singleLevelPrior2", 10),
        ("multisetPrior2", 11),
    ]
    HDP_TYPES_ACGT = [("singleLevelFixedCanonical", 14)]
    HDP_TYPES_ACEGIT = [
        ("multisetPriorEcoli", 12),
        ("singleLevelPriorEcoli", 13),
    ]

    def __init__(self, args):
        """Initialize all objects the training routine may need.

        :param args: parsed training configuration with dot access to its fields.
            ``args.output_dir`` is rewritten in place to its absolute path.
        :raises AssertionError: if the output directory does not exist or if
            ``_check_config`` rejects the configuration.
        """
        # TODO Need to create docs here
        # executable
        self.buildHdpUtil = None
        # HDP type
        self.int_hdp_type = None
        # load json and create dot dictionary of all the parameters
        self.args = args
        # check output directory
        self.args.output_dir = os.path.abspath(self.args.output_dir)
        assert os.path.exists(self.args.output_dir), "Output directory does not exist. " \
                                                     "output_dir: {}".format(self.args.output_dir)
        self.working_folder = FolderHandler()
        self.working_path = self.working_folder.open_folder(
            os.path.join(self.args.output_dir, "tempFiles_trainModels"))
        # create samples from self.args.samples
        self.samples = self._create_samples()
        # Current model paths
        self.template_hmm_model_path = self.args.template_hmm_model
        self.template_hdp_model_path = self.args.template_hdp_model
        self.complement_hmm_model_path = self.args.complement_hmm_model
        self.complement_hdp_model_path = self.args.complement_hdp_model
        # Current SignalHmm model objects
        self.complement_model = None
        self.template_model = None
        # globals for experiments
        self.path_to_bin = self.args.path_to_bin
        self.debug = self.args.debug
        self.two_d = self.args.two_d
        self.job_count = self.args.job_count
        # state machine type changes for SignalAlignment so it can expect an HDP or not
        self.state_machine_type = "threeState"
        self.kmer_length = None
        self.alphabet = None
        # check config file (also loads the HMM models and sets kmer_length/alphabet)
        self._check_config()

    def _create_samples(self):
        """Create a SignalAlignSample for each entry of ``self.args.samples``.

        Each entry is expanded as keyword arguments, so it must be a mapping.
        """
        return [
            SignalAlignSample(working_folder=self.working_folder, **s)
            for s in self.args.samples
        ]

    def new_working_folder(self, append):
        """Open a fresh working folder to keep each run of analysis separate.

        :param append: suffix (converted with ``str``) added to the folder name
            ``tempFiles_trainModels_`` inside ``args.output_dir``
        """
        self.working_folder = FolderHandler()
        self.working_path = self.working_folder.open_folder(
            os.path.join(self.args.output_dir,
                         "tempFiles_trainModels_" + str(append)))

    def train_hdp(self):
        """Build the HDP emission model(s) by running the buildHdpUtil executable.

        All options are read from ``self.args.hdp_args`` (hdp_type,
        built_alignments, gibbs_samples, burnin_multiplier, thinning, grid_start,
        grid_end, grid_length, base/middle/leaf alpha, beta and gamma) and from
        the instance (model paths, kmer_length, two_d, debug).

        If ``hdp_args.built_alignments`` is set, that file is used as the build
        alignment; otherwise one is written to ``buildAlignment.tsv`` in the
        working folder from ``self.samples``.

        On success the instance's HDP model paths are updated and
        ``state_machine_type`` becomes "threeStateHdp".

        :return: (template_hdp_model_path, complement_hdp_model_path); the
            complement path is None when not training 2D
        :raises AssertionError: if the built alignment file is missing or the
            expected output model files were not created
        """
        if self.args.hdp_args.built_alignments:
            assert os.path.isfile(self.args.hdp_args.built_alignments), \
                "Build alignment file does not exist. {}".format(self.args.hdp_args.built_alignments)
            build_alignment_path = self.args.hdp_args.built_alignments
            num_alignments = count_lines_in_file(build_alignment_path)
        else:
            # template strand is always built; complement only for 2D
            template = True
            complement = bool(self.two_d)
            # create instance
            hdp_data = CreateHdpTrainingData(
                self.samples,
                os.path.join(self.working_path, "buildAlignment.tsv"),
                template=template,
                complement=complement,
                verbose=self.debug)
            # write an hdp training file to path
            build_alignment_path = hdp_data.write_hdp_training_file()
            num_alignments = hdp_data.n_assignments
        verbose_flag = "--verbose " if self.debug is True else ""
        # create the output paths for the models
        template_hdp_location = os.path.join(
            self.working_path,
            "template." + self.args.hdp_args.hdp_type + ".nhdp")
        complement_hdp_location = None
        if self.two_d:
            # no flag for 2D; an empty string keeps a literal "None" out of the command
            one_d = ""
            complement_hdp_location = os.path.join(
                self.working_path,
                "complement." + self.args.hdp_args.hdp_type + ".nhdp")
        else:
            one_d = '--oneD'
        # NOTE(review): for 1D runs "-w" (and "-C" when no complement HMM is configured)
        # still receive the text "None"; kept as-is because how buildHdpUtil treats
        # these options together with --oneD is not visible here - confirm.
        build_initial_hdp_command = "{buildHdpUtil} {verbose}-p {hdpType} -v {tHdpLoc} -w {cHdpLoc} -l {buildAln} " \
                                    "-a {kmerLength} -n {gibbs_samples} -I {burnIn} -t {thin} -s {start} -e {end} " \
                                    "-k {len} {oneD} -C {cL} -T {tL} " \
                                    "-g {Ba} -r {Bb} -j {Ma} -y {Mb} -i {La} -u {Lb} -B {base} -M {middle} -L {leaf} " \
                                    "".format(buildHdpUtil=self.buildHdpUtil,
                                              hdpType=self.int_hdp_type,
                                              tHdpLoc=template_hdp_location,
                                              cHdpLoc=complement_hdp_location,
                                              buildAln=build_alignment_path,
                                              gibbs_samples=self.args.hdp_args.gibbs_samples,
                                              burnIn=int(self.args.hdp_args.burnin_multiplier * num_alignments),
                                              thin=self.args.hdp_args.thinning,
                                              start=self.args.hdp_args.grid_start,
                                              end=self.args.hdp_args.grid_end,
                                              len=self.args.hdp_args.grid_length,
                                              verbose=verbose_flag,
                                              tL=self.template_hmm_model_path,
                                              cL=self.complement_hmm_model_path,
                                              kmerLength=self.kmer_length,
                                              oneD=one_d,
                                              Ba=self.args.hdp_args.base_alpha,
                                              Bb=self.args.hdp_args.base_beta,
                                              Ma=self.args.hdp_args.middle_alpha,
                                              Mb=self.args.hdp_args.middle_beta,
                                              La=self.args.hdp_args.leaf_alpha,
                                              Lb=self.args.hdp_args.leaf_beta,
                                              base=self.args.hdp_args.base_gamma,
                                              middle=self.args.hdp_args.middle_gamma,
                                              leaf=self.args.hdp_args.leaf_gamma)
        print("[[trainModels_buildHdpUtil] Command: {}\n".format(
            build_initial_hdp_command))
        # the command is split on whitespace, so paths containing spaces are not supported
        procs = Popen(build_initial_hdp_command.split(),
                      stdout=sys.stdout,
                      stderr=sys.stderr)
        procs.wait()
        print(
            "[trainModels_buildHdpUtil] - finished training HDP emissions routine"
        )
        # check if the HDP created models
        assert os.path.exists(
            template_hdp_location
        ), "HDP training did not create template hdp model. {}".format(
            template_hdp_location)
        if complement_hdp_location:
            assert os.path.exists(
                complement_hdp_location
            ), "HDP training did not create complement hdp model. {}".format(
                complement_hdp_location)
        # set class parameters
        self.template_hdp_model_path = template_hdp_location
        self.complement_hdp_model_path = complement_hdp_location
        self.state_machine_type = "threeStateHdp"
        return self.template_hdp_model_path, self.complement_hdp_model_path

    def train_normal_hmm(self, transitions=True, emissions=False):
        """Train the HMM from alignment expectations.

        Runs ``args.transitions_args.iterations`` rounds; each round aligns all
        samples with ``get_expectations=True`` and folds the resulting
        ``*.template.expectations.tsv`` (and, for 2D,
        ``*.complement.expectations.tsv``) files into the loaded models.

        :param transitions: passed as ``update_transitions`` to the models
        :param emissions: passed as ``update_emissions`` to the models
        :return: (template_hmm_model_path, complement_hmm_model_path)
        :raises AssertionError: if a round produced no analysis files, or if
            ``transitions_args.test`` is set and the last running likelihood is
            not greater than the previous one
        """
        i = 0
        # start iterating
        while i < self.args.transitions_args.iterations:
            # align all the samples
            self.run_signal_align(
                get_expectations=True,
                trim=self.args.transitions_args.training_bases)
            all_sample_files = merge_lists(
                [sample.analysis_files for sample in self.samples])
            assert len(
                all_sample_files
            ) > 0, "Something failed in multithread signal alignment. We got no sample files"
            # load then normalize the expectations
            template_expectations_files = [
                x for x in all_sample_files
                if x.endswith(".template.expectations.tsv")
            ]
            if len(template_expectations_files) > 0:
                self.template_model.add_and_normalize_expectations(
                    files=template_expectations_files,
                    hmm_file=self.template_hmm_model_path,
                    update_transitions=transitions,
                    update_emissions=emissions)
            if self.two_d:
                complement_expectations_files = [
                    x for x in all_sample_files
                    if x.endswith(".complement.expectations.tsv")
                ]
                if len(complement_expectations_files) > 0:
                    # write to the complement HMM path kept by this class
                    # (the attribute is complement_hmm_model_path)
                    self.complement_model.add_and_normalize_expectations(
                        files=complement_expectations_files,
                        hmm_file=self.complement_hmm_model_path,
                        update_transitions=transitions,
                        update_emissions=emissions)
            # log the running likelihood
            template_likelihoods = self.template_model.running_likelihoods
            if len(template_likelihoods) > 0 and \
                    (self.two_d and len(self.complement_model.running_likelihoods) > 0):
                complement_likelihoods = self.complement_model.running_likelihoods
                print(
                    "[trainModels_transitions] {i}| {t_likelihood}\t{c_likelihood}"
                    .format(t_likelihood=template_likelihoods[-1],
                            c_likelihood=complement_likelihoods[-1],
                            i=i))
                if self.args.transitions_args.test and (len(template_likelihoods) >= 2) and \
                        (len(complement_likelihoods) >= 2):
                    assert (template_likelihoods[-2] < template_likelihoods[-1]) and \
                           (complement_likelihoods[-2] < complement_likelihoods[-1]), \
                        "Testing: Likelihood error, went up"
            elif len(template_likelihoods) > 0:
                print("[trainModels_transitions] {i}| {t_likelihood}".format(
                    t_likelihood=template_likelihoods[-1], i=i))
                if self.args.transitions_args.test and (
                        len(template_likelihoods) >= 2):
                    assert (template_likelihoods[-2] < template_likelihoods[-1]
                            ), "Testing: Likelihood error, went up"
            i += 1
        print(
            "[trainModels_transitions] - finished training transitions routine"
        )
        return self.template_hmm_model_path, self.complement_hmm_model_path

    def expectation_maximization_training(self):
        """Run the training stages selected in ``args.training``.

        If ``training.expectation_maximization`` is set, each of
        ``training.em_iterations`` rounds trains the HMM, re-runs the
        alignment and then trains the HDP. Otherwise the transitions and/or the
        HDP emissions are trained once, according to ``training.transitions``
        and ``training.hdp_emissions``.

        :return: (template_hmm_model_path, complement_hmm_model_path,
            template_hdp_model_path, complement_hdp_model_path)
        :raises AssertionError: if none of the three training options is set
        """
        start = timer()
        if self.args.training.normal_emissions:
            print(
                "[trainModels] Training HMM emission distributions is not currently available."
            )
        if self.args.training.expectation_maximization:
            for i in range(1, self.args.training.em_iterations + 1):
                print(
                    "[trainModels] Training HMM transition distributions. iteration: {}"
                    .format(i))
                # first train the model transitions
                self.train_normal_hmm()
                print(
                    "[trainModels] Running Assignment with new HMM transition distributions. "
                    "iteration: {}".format(i))
                # next get assignments
                self.run_signal_align()
                print(
                    "[trainModels] Training HDP emission distributions. iteration: {}"
                    .format(i))
                # make new hdp
                self.train_hdp()
                print([sample.analysis_files for sample in self.samples])
                print(self.template_hdp_model_path)
                print(self.template_hmm_model_path)
                print(self.complement_hmm_model_path)
                print(self.complement_hdp_model_path)
                # self.new_working_folder(append=str(i))
        elif self.args.training.transitions or self.args.training.hdp_emissions:
            if self.args.training.transitions:
                print("[trainModels] Training HMM transition distributions.")
                # self.train_transitions()
                self.train_normal_hmm()
            if self.args.training.hdp_emissions:
                print("[trainModels] Training HDP emission distributions.")
                # assignments are only needed when no pre-built alignment file is given
                if not self.args.hdp_args.built_alignments:
                    self.run_signal_align()
                self.train_hdp()
        else:
            raise AssertionError(
                "Must set one of the following to True. "
                "training.transitions: {}, training.hdp_emissions: {}, "
                "training.expectation_maximization: "
                "{}.".format(self.args.training.transitions,
                             self.args.training.hdp_emissions,
                             self.args.training.expectation_maximization))
        stop = timer()
        print("[trainModels] Complete")
        print("Training Time = {} seconds".format(stop - start))
        print(self.template_hmm_model_path, self.complement_hmm_model_path,
              self.template_hdp_model_path, self.complement_hdp_model_path)
        return self.template_hmm_model_path, self.complement_hmm_model_path, \
            self.template_hdp_model_path, self.complement_hdp_model_path

    def load_hmm_models(self):
        """Load the HMM models and copy every input model into the working folder.

        After this call the ``*_model_path`` attributes point at the copies in
        the working folder, so training never modifies the input files.
        ``kmer_length`` and ``alphabet`` are taken from the template model. If a
        template HDP path is configured, ``state_machine_type`` is switched to
        "threeStateHdp".

        :raises AssertionError: if a required model path is missing, a copy
            failed, or the template and complement models disagree on kmer
            length or alphabet
        """
        # load template model
        assert self.template_hmm_model_path, "Missing template model %s" % (
            self.template_hmm_model_path)
        self.template_hmm_model_path = os.path.abspath(
            self.template_hmm_model_path)
        self.template_model = HmmModel(self.template_hmm_model_path)
        new_template_hmm = self.working_folder.add_file_path(
            "template_trained.hmm")
        copyfile(self.template_hmm_model_path, new_template_hmm)
        assert os.path.exists(
            new_template_hmm), "Problem copying default model to {}".format(
                new_template_hmm)
        self.template_hmm_model_path = new_template_hmm
        # set alphabet and kmer_length
        self.kmer_length = self.template_model.kmer_length
        self.alphabet = self.template_model.alphabet
        # load complement model if 2D
        if self.two_d:
            assert self.complement_hmm_model_path, "Missing complement model: {}".format(
                self.complement_hmm_model_path)
            self.complement_hmm_model_path = os.path.abspath(
                self.complement_hmm_model_path)
            self.complement_model = HmmModel(self.complement_hmm_model_path)
            new_complement_hmm = self.working_folder.add_file_path(
                "complement_trained.hmm")
            copyfile(self.complement_hmm_model_path, new_complement_hmm)
            assert os.path.exists(
                new_complement_hmm
            ), "Problem copying default model to {}".format(new_complement_hmm)
            self.complement_hmm_model_path = new_complement_hmm
            # make sure models match (values are reported in the order the message labels them)
            assert self.complement_model.kmer_length == self.template_model.kmer_length, \
                "Template model and complement model kmer lengths do not match." \
                " template: {} != complement: {}".format(self.template_model.kmer_length,
                                                         self.complement_model.kmer_length)
            assert self.complement_model.alphabet == self.template_model.alphabet, \
                "Template model and complement model alphabets do not match." \
                " template: {} != complement: {}".format(self.template_model.alphabet,
                                                         self.complement_model.alphabet)
        # get the input HDP models, if they can be found
        if self.template_hdp_model_path:
            self.state_machine_type = "threeStateHdp"
            assert os.path.exists(self.template_hdp_model_path), \
                "Template HDP path not found {}".format(self.template_hdp_model_path)
            self.template_hdp_model_path = os.path.abspath(
                self.template_hdp_model_path)
            new_template_hdp = self.working_folder.add_file_path("{}".format(
                os.path.basename(self.template_hdp_model_path)))
            copyfile(self.template_hdp_model_path, new_template_hdp)
            # track the working copy of the *template* HDP; the complement HDP path
            # must stay untouched here so it can be copied separately below
            self.template_hdp_model_path = new_template_hdp
        # same for complement hdp
        if self.complement_hdp_model_path and self.two_d:
            assert os.path.exists(self.complement_hdp_model_path), \
                "Complement HDP path not found {}".format(self.complement_hdp_model_path)
            self.complement_hdp_model_path = os.path.abspath(
                self.complement_hdp_model_path)
            new_complement_hdp = \
                self.working_folder.add_file_path("{}".format(os.path.basename(self.complement_hdp_model_path)))
            copyfile(self.complement_hdp_model_path, new_complement_hdp)
            self.complement_hdp_model_path = new_complement_hdp

    def _check_train_transitions_config(self):
        """Check the parameters needed for training the transitions.

        :raises AssertionError: if ``transitions_args.iterations`` or
            ``job_count`` is not an integer
        """
        assert isinstance(self.args.transitions_args.iterations, int), \
            "args.transitions_args.iterations must be an integer. {}".format(self.args.transitions_args.iterations)
        assert isinstance(self.args.job_count, int), \
            "args.job_count must be an integer. {}".format(self.args.job_count)

    def _check_train_hdp_config(self):
        """Check that the input parameters will work for training the HDP.

        Sets ``int_hdp_type`` and ``buildHdpUtil`` as a side effect.

        :raises AssertionError: if the HDP type does not fit 1D reads or the
            model alphabet, the alphabet is unsupported, the buildHdpUtil
            executable is missing, or ``built_alignments`` is combined with
            EM training or points to a missing file
        """
        # make sure hdp type works with alphabet and 1D
        hdp_type = self.args.hdp_args.hdp_type
        self.int_hdp_type = get_hdp_type(hdp_type)
        type_pair = (hdp_type, self.int_hdp_type)
        if not self.args.two_d:
            assert type_pair in self.HDP_TYPES_1D, \
                "HDP type is not compatible with 1D. {}: 1D types {}".format(hdp_type, self.HDP_TYPES_1D)
        types_by_alphabet = {
            "ACEGOT": self.HDP_TYPES_ACEGOT,
            "ACEGIT": self.HDP_TYPES_ACEGIT,
            "ACEGT": self.HDP_TYPES_ACEGT,
            "ACGT": self.HDP_TYPES_ACGT,
        }
        if self.alphabet not in types_by_alphabet:
            raise AssertionError("Cannot create a HDP with proved alphabet")
        allowed_types = types_by_alphabet[self.alphabet]
        assert type_pair in allowed_types, \
            "HDP type is not compatible with alphabet={alphabet}." \
            "Hdp_type: {hdp_type}, {alphabet} HDP types: {types}".format(alphabet=self.alphabet,
                                                                         hdp_type=hdp_type,
                                                                         types=allowed_types)
        # check buildHdpUtil executable
        self.buildHdpUtil = os.path.join(self.args.path_to_bin,
                                         "./buildHdpUtil")
        assert (os.path.exists(
            self.buildHdpUtil)), "ERROR: Didn't find buildHdpUtil. {}".format(
                self.buildHdpUtil)
        # check other parameter inconsistencies
        if self.args.hdp_args.built_alignments:
            assert self.args.training.expectation_maximization is not True, "Cannot use 'built_alignments' file for " \
                                                                            "EM training. Either set " \
                                                                            "training.expectation_maximization to " \
                                                                            "false or change " \
                                                                            "hdp_args.built_alignments to null"
            assert os.path.isfile(self.args.hdp_args.built_alignments), \
                "Build alignment file does not exist. {}".format(self.args.hdp_args.built_alignments)

    def _check_config(self):
        """Make sure the training configuration is correctly filled out.

        Loads the HMM models, verifies ``path_to_bin`` and the signalMachine
        executable, then runs the stage-specific checks for whichever training
        options are enabled.

        :return: the (validated) ``self.args``
        :raises AssertionError: on any failed check
        """
        # check model files and load HMM models into memory for training transitions
        self.load_hmm_models()
        # check path to bin
        assert os.path.isdir(self.path_to_bin), "path_to_bin does not exist. " \
                                                "path_to_bin: {}".format(self.path_to_bin)
        # check if signalMachine is found
        signal_machine = os.path.join(self.args.path_to_bin, "./signalMachine")
        assert os.path.exists(signal_machine), \
            "ERROR: Didn't find signalMachine executable. {}".format(signal_machine)
        if self.args.training.transitions or self.args.training.expectation_maximization:
            self._check_train_transitions_config()
        if self.args.training.hdp_emissions or self.args.training.expectation_maximization:
            self._check_train_hdp_config()
        return self.args

    def run_signal_align(self,
                         output_format="assignments",
                         get_expectations=False,
                         trim=False):
        """Run signal align on all samples with the current models.

        :param output_format: output format handed to the alignment arguments
        :param get_expectations: if True, ask for expectations instead of alignments
        :param trim: forwarded as ``trim`` to ``multithread_signal_alignment_samples``
        :return: the updated list of samples (also stored in ``self.samples``)
        """
        alignment_args = create_signalAlignment_args(
            destination=self.working_path,
            stateMachineType=self.state_machine_type,
            in_templateHmm=self.template_hmm_model_path,
            in_complementHmm=self.complement_hmm_model_path,
            in_templateHdp=self.template_hdp_model_path,
            in_complementHdp=self.complement_hdp_model_path,
            diagonal_expansion=self.args.diagonal_expansion,
            constraint_trim=self.args.constraint_trim,
            twoD_chemistry=self.two_d,
            get_expectations=get_expectations,
            path_to_bin=self.path_to_bin,
            check_for_temp_file_existance=True,
            threshold=self.args.signal_alignment_args.threshold,
            track_memory_usage=self.args.signal_alignment_args.track_memory_usage,
            embed=self.args.signal_alignment_args.embed,
            event_table=self.args.signal_alignment_args.event_table,
            output_format=output_format)
        self.samples = multithread_signal_alignment_samples(self.samples,
                                                            alignment_args,
                                                            self.job_count,
                                                            trim=trim)
        return self.samples
def main(args):
    """Command-line entry point: align a set of .fast5 files with signalAlign.

    The ``args`` parameter is ignored; the arguments are always re-read with
    ``parse_args()``. Input paths are resolved with ``resolvePath``, the
    reference is processed into forward/backward references inside a temporary
    folder under ``args.out``, at most ``args.nb_files`` reads are selected
    (randomly, when more are available) and the alignments are run with
    ``args.nb_jobs`` workers.

    Exits with status 1 when neither a fast5 directory nor a fofn is given, or
    when the reference file cannot be found.
    """
    # parse args
    args = parse_args()
    command_line = " ".join(sys.argv[:])
    print("Command Line: {cmdLine}\n".format(cmdLine=command_line),
          file=sys.stderr)
    # get absolute paths to inputs
    args.files_dir = resolvePath(args.files_dir)
    args.forward_reference = resolvePath(args.forward_ref)
    args.backward_reference = resolvePath(args.backward_ref)
    args.out = resolvePath(args.out)
    args.bwa_reference = resolvePath(args.bwa_reference)
    args.in_T_Hmm = resolvePath(args.in_T_Hmm)
    args.in_C_Hmm = resolvePath(args.in_C_Hmm)
    args.templateHDP = resolvePath(args.templateHDP)
    args.complementHDP = resolvePath(args.complementHDP)
    args.fofn = resolvePath(args.fofn)
    args.target_regions = resolvePath(args.target_regions)
    args.ambiguity_positions = resolvePath(args.ambiguity_positions)
    start_message = """
# Starting Signal Align
# Aligning files from: {fileDir}
# Aligning to reference: {reference}
# Aligning maximum of {nbFiles} files
# Using model: {model}
# Using banding: True
# Aligning to regions in: {regions}
# Non-default template HMM: {inThmm}
# Non-default complement HMM: {inChmm}
# Template HDP: {tHdp}
# Complement HDP: {cHdp}
    """.format(fileDir=args.files_dir,
               reference=args.bwa_reference,
               nbFiles=args.nb_files,
               inThmm=args.in_T_Hmm,
               inChmm=args.in_C_Hmm,
               model=args.stateMachineType,
               regions=args.target_regions,
               tHdp=args.templateHDP,
               cHdp=args.complementHDP)
    print(start_message, file=sys.stdout)
    if args.files_dir is None and args.fofn is None:
        print("Need to provide directory with .fast5 files of fofn",
              file=sys.stderr)
        sys.exit(1)
    # os.path.isfile(None) raises TypeError, so test for a missing reference first
    # (presumably resolvePath passes None through, as the check above relies on - confirm)
    if args.bwa_reference is None or not os.path.isfile(args.bwa_reference):
        print("Did not find valid reference file, looked for it {here}".format(
            here=args.bwa_reference),
              file=sys.stderr)
        sys.exit(1)
    # make directory to put temporary files
    temp_folder = FolderHandler()
    temp_dir_path = temp_folder.open_folder(args.out + "/tempFiles_alignment")
    # if not args.forward_reference or not args.backward_reference:
    # the forward/backward references are always regenerated from the bwa reference
    args.forward_reference, args.backward_reference = processReferenceFasta(
        fasta=args.bwa_reference,
        motifs=args.motifs,
        work_folder=temp_folder,
        positions_file=args.ambiguity_positions)
    # list of read files
    if args.fofn is not None:
        fast5s = [x for x in parseFofn(args.fofn) if x.endswith(".fast5")]
    else:
        fast5s = [
            "/".join([args.files_dir, x]) for x in os.listdir(args.files_dir)
            if x.endswith(".fast5")
        ]
    # keep a random subset when more reads are available than requested
    nb_files = args.nb_files
    if nb_files < len(fast5s):
        shuffle(fast5s)
        fast5s = fast5s[:nb_files]
    # change paths to the source directory
    os.chdir(signalAlignSourceDir())
    alignment_args = {
        "destination": temp_dir_path,
        "stateMachineType": args.stateMachineType,
        "bwa_reference": args.bwa_reference,
        "in_templateHmm": args.in_T_Hmm,
        "in_complementHmm": args.in_C_Hmm,
        "in_templateHdp": args.templateHDP,
        "in_complementHdp": args.complementHDP,
        "output_format": args.outFmt,
        "threshold": args.threshold,
        "diagonal_expansion": args.diag_expansion,
        "constraint_trim": args.constraint_trim,
        "degenerate": getDegenerateEnum(args.degenerate),
        "twoD_chemistry": args.twoD,
        "target_regions": args.target_regions,
        "embed": args.embed,
        "event_table": args.event_table,
        "backward_reference": args.backward_reference,
        "forward_reference": args.forward_reference,
        "alignment_file": None,
        "check_for_temp_file_existance": True,
        "track_memory_usage": False,
        "get_expectations": False,
    }
    print("[runSignalAlign]:NOTICE: Got {} files to align".format(len(fast5s)),
          file=sys.stdout)
    # setup workers for multiprocessing
    multithread_signal_alignment(alignment_args, fast5s, args.nb_jobs)
    print("\n#  signalAlign - finished alignments\n", file=sys.stderr)
    print("\n#  signalAlign - finished alignments\n", file=sys.stdout)
def trainModelTransitions(config):
    """Iteratively re-estimate HMM transitions from alignment expectations.

    For each of ``config["iterations"]`` rounds a set of training files is
    culled from the configured samples, expectations are collected for each
    file (in-process when ``config["DEBUG"]`` is set, otherwise by
    ``config["job_count"]`` worker processes), and the resulting
    ``.template.expectations`` / ``.complement.expectations`` files in the
    working folder are passed to ``add_and_norm_expectations`` to update the
    trained HMM files.

    :param config: mapping with the keys read below, among them "samples",
        "reference", "output_dir", "bwt", "in_T_Hmm", "in_C_Hmm",
        "stateMachineType", "twoD", "iterations", "training_bases",
        "job_count", "diagonal_expansion", "constraint_trim", "DEBUG", "TEST"
    :raises RuntimeError: when a sample has neither "fast5_dir" nor "fofn"
    :raises AssertionError: when an input model is missing or cannot be copied
    """

    def process_sample(sample):
        """Build a Fast5Directory or FileOfFilenames for one sample entry."""
        options = dict(**DEFAULT_TRAINMODELS_OPTIONS)
        options.update(sample)
        if options["fast5_dir"] is None and options["fofn"] is None:
            raise RuntimeError(
                "Need to provide path to .fast5 files or file with filenames (fofn)")
        reference_map = processReferenceFasta(
            fasta=config["reference"],
            work_folder=working_folder,
            motif_key=options["motif"],
            sub_char=options["label"],
            positions_file=options["positions_file"])
        if options["fast5_dir"] is not None:
            if options["fofn"] is not None:
                # report the directory that is actually used ("fast5_dir")
                print("WARNING Only using files is directory %s ignoring fofn %s"
                      % (options["fast5_dir"], options["fofn"]))
            sample = Fast5Directory(options["fast5_dir"], reference_map)
        else:
            sample = FileOfFilenames(options["fofn"], reference_map)
        return sample

    # make directory to put the files we're using
    working_folder = FolderHandler()
    working_folder_path = working_folder.open_folder(config["output_dir"] + "temp_trainModels")
    samples = [process_sample(s) for s in config["samples"]]

    if config["bwt"] is not None:
        print("[trainModels]Using provided BWT")
        bwa_ref_index = config["bwt"]
    else:
        print("signalAlign - indexing reference", file=sys.stderr)
        bwa_ref_index = getBwaIndex(config["reference"], working_folder_path)
        print("signalAlign - indexing reference, done", file=sys.stderr)

    template_model_path = config["in_T_Hmm"]
    complement_model_path = config["in_C_Hmm"]
    assert os.path.exists(template_model_path) and os.path.exists(complement_model_path), \
        "Missing input models %s and %s" % (template_model_path, complement_model_path)
    template_model = get_model(config["stateMachineType"], template_model_path)
    # the complement model is only loaded for 2D runs
    complement_model = get_model(
        config["stateMachineType"], complement_model_path) if config["twoD"] else None

    # get the input HDP, if we're using it
    if config["stateMachineType"] == "threeStateHdp":
        template_hdp = working_folder.add_file_path(
            "%s" % config["templateHdp"].split("/")[-1])
        copyfile(config["templateHdp"], template_hdp)
        if config["twoD"]:
            complement_hdp = working_folder.add_file_path(
                "%s" % config["complementHdp"].split("/")[-1])
            copyfile(config["complementHdp"], complement_hdp)
        else:
            complement_hdp = None
    else:
        template_hdp = None
        complement_hdp = None

    # make some paths to files to hold the HMMs
    template_hmm = working_folder.add_file_path("template_trained.hmm")
    complement_hmm = working_folder.add_file_path("complement_trained.hmm")
    trained_models = [template_hmm, complement_hmm]
    untrained_models = [template_model_path, complement_model_path]

    # training starts from copies of the input models
    for default_model, trained_model in zip(untrained_models, trained_models):
        assert os.path.exists(default_model), \
            "Didn't find default model {}".format(default_model)
        copyfile(default_model, trained_model)
        assert os.path.exists(trained_model), \
            "Problem copying default model to {}".format(trained_model)

    # start iterating
    i = 0
    while i < config["iterations"]:
        # first cull a set of files to get expectations on
        training_files = cull_training_files(
            samples=samples,
            training_amount=config["training_bases"],
            twoD=config["twoD"])

        # setup
        workers = config["job_count"]
        work_queue = Manager().Queue()
        done_queue = Manager().Queue()
        jobs = []

        # get expectations for all the files in the queue
        # file_ref_tuple should be (fast5, (plus_ref_seq, minus_ref_seq))
        for fast5, ref_map in training_files:
            alignment_args = {
                "reference_map": ref_map,
                "destination": working_folder_path,
                "stateMachineType": config["stateMachineType"],
                "bwa_index": bwa_ref_index,
                "in_templateHmm": template_hmm,
                "in_complementHmm": complement_hmm,
                "in_templateHdp": template_hdp,
                "in_complementHdp": complement_hdp,
                "in_fast5": fast5,
                "threshold": 0.01,
                "diagonal_expansion": config["diagonal_expansion"],
                "constraint_trim": config["constraint_trim"],
                "target_regions": None,
                "degenerate": None,
                "twoD_chemistry": config["twoD"],
            }
            if config["DEBUG"]:
                # run in-process so failures surface directly
                alignment = SignalAlignment(**alignment_args)
                alignment.run(get_expectations=True)
            else:
                work_queue.put(alignment_args)

        # range (not the Python 2 only xrange) so this also runs on Python 3
        for w in range(workers):
            p = Process(target=get_expectations, args=(work_queue, done_queue))
            p.start()
            jobs.append(p)
            # one 'STOP' sentinel is queued per started worker
            work_queue.put('STOP')

        for p in jobs:
            p.join()

        done_queue.put('STOP')

        # load then normalize the expectations
        template_expectations_files = [
            x for x in os.listdir(working_folder_path)
            if x.endswith(".template.expectations")
        ]
        complement_expectations_files = [
            x for x in os.listdir(working_folder_path)
            if x.endswith(".complement.expectations")
        ]

        if len(template_expectations_files) > 0:
            add_and_norm_expectations(path=working_folder_path,
                                      files=template_expectations_files,
                                      model=template_model,
                                      hmm_file=template_hmm,
                                      update_transitions=True)

        if config["twoD"] and len(complement_expectations_files) > 0:
            add_and_norm_expectations(path=working_folder_path,
                                      files=complement_expectations_files,
                                      model=complement_model,
                                      hmm_file=complement_hmm,
                                      update_transitions=True)

        # log the running likelihood (2D runs only: complement_model is None
        # otherwise); the length, not the twoD flag, is compared against 0
        if len(template_model.running_likelihoods) > 0 and \
                config["twoD"] and len(complement_model.running_likelihoods) > 0:
            print("{i}| {t_likelihood}\t{c_likelihood}".format(
                t_likelihood=template_model.running_likelihoods[-1],
                c_likelihood=complement_model.running_likelihoods[-1],
                i=i))
            if config["TEST"] and (len(template_model.running_likelihoods) >= 2) and \
                    (config["twoD"] and len(complement_model.running_likelihoods) >= 2):
                print("TESTING")
                # NOTE(review): this assertion fires when a likelihood did NOT
                # increase, although the message reads "went up" - confirm the
                # intended wording
                assert (template_model.running_likelihoods[-2] < template_model.running_likelihoods[-1]) and \
                       (complement_model.running_likelihoods[-2] < complement_model.running_likelihoods[-1]), \
                    "Testing: Likelihood error, went up"
        i += 1

    # if we're using HDP, trim the final Hmm (remove assignments)

    print("trainModels - finished training routine", file=sys.stdout)
    print("trainModels - finished training routine", file=sys.stderr)
def main(args):
    """Scan a reference for proposed edit sites (BonnyDoon error-correction).

    Prints the invocation and a run banner, culls the .fast5 files, checks
    that the reference exists (exiting with status 1 otherwise), indexes it
    for bwa in a working folder, then calls ``scan_for_proposals`` and groups
    the first element of every proposal with ``group_sites_in_window2``.
    The grouped result is not returned.

    :param args: ignored; the arguments are always re-read with ``parse_args()``
    """
    # the incoming value is discarded: options always come from parse_args()
    args = parse_args()

    invocation = " ".join(sys.argv)
    print("Command Line: {cmdLine}\n".format(cmdLine=invocation), file=sys.stderr)

    banner_template = """ # Starting BonnyDoon Error-Correction # Aligning files from: {fileDir} # Aligning to reference: {reference} # Aligning maximum of {nbFiles} files # Using model: {model} # Using banding: {banding} # Aligning to regions in: {regions} # Non-default template HMM: {inThmm} # Non-default complement HMM: {inChmm} # Template HDP: {tHdp} # Complement HDP: {cHdp} """
    banner_fields = dict(
        fileDir=args.files_dir,
        reference=args.ref,
        nbFiles=args.nb_files,
        banding=args.banded,
        inThmm=args.in_T_Hmm,
        inChmm=args.in_C_Hmm,
        model=args.stateMachineType,
        regions=args.target_regions,
        tHdp=args.templateHDP,
        cHdp=args.complementHDP,
    )
    print(banner_template.format(**banner_fields), file=sys.stdout)

    # select the MinION reads to work on
    read_files = cull_fast5_files(args.files_dir, args.nb_files)

    # the input reference must be an existing file
    reference_path = args.ref
    if not os.path.isfile(reference_path):
        print("Did not find valid reference file", file=sys.stderr)
        sys.exit(1)

    # sequence string of the reference
    reference_string = get_first_sequence(reference_path)

    # working folder inside the requested output location
    work_folder = FolderHandler()
    work_dir = work_folder.open_folder(args.out + "tempFiles_errorCorrection")

    # path to the bwa index built for the reference
    index_path = get_bwa_index(reference_path, work_dir)

    # HMM/HDP model parameters; these stay fixed for the whole run
    model_args = dict(
        path_to_EC_refs=None,
        destination=work_dir,
        stateMachineType=args.stateMachineType,
        bwa_index=index_path,
        in_templateHmm=args.in_T_Hmm,
        in_complementHmm=args.in_C_Hmm,
        in_templateHdp=args.templateHDP,
        in_complementHdp=args.complementHDP,
        banded=args.banded,
        sparse_output=True,
        threshold=args.threshold,
        diagonal_expansion=args.diag_expansion,
        constraint_trim=args.constraint_trim,
        target_regions=None,
        degenerate=degenerate_enum(args.degenerate),
    )

    # find the sites that have proposed edits, then group them in windows of 6
    candidates = scan_for_proposals(work_folder, STEP, reference_string,
                                    read_files, model_args, args.nb_jobs)
    leading_sites = [candidate[0] for candidate in candidates]
    group_sites_in_window2(leading_sites, 6)
def main(args):
    """Run ``args.cycles`` rounds of Jamison reference error-correction.

    Each cycle indexes the current reference for bwa, scans it for proposed
    edit sites, updates the reference string with
    ``update_reference_with_marginal_probs`` and writes the result to a
    ``cycle_snapshot.<cycle>.fa`` file that becomes the reference for the next
    cycle. After the last cycle the current reference is copied to
    ``temp_dir_path + args.corrected``.

    Exits with status 1 when ``args.ref`` is not an existing file.

    :param args: ignored; the arguments are always re-read with ``parse_args()``
    """
    # parse args
    args = parse_args()

    command_line = " ".join(sys.argv[:])
    print("Command Line: {cmdLine}\n".format(cmdLine=command_line), file=sys.stderr)

    start_message = """ # Starting Jamison Error-Correction # Aligning files from: {fileDir} # Aligning to reference: {reference} # Aligning maximum of {nbFiles} files # Using model: {model} # Using banding: {banding} # Aligning to regions in: {regions} # Non-default template HMM: {inThmm} # Non-default complement HMM: {inChmm} # Template HDP: {tHdp} # Complement HDP: {cHdp} # Performing {cycles} cycles """.format(
        fileDir=args.files_dir,
        reference=args.ref,
        nbFiles=args.nb_files,
        banding=args.banded,
        inThmm=args.in_T_Hmm,
        inChmm=args.in_C_Hmm,
        model=args.stateMachineType,
        regions=args.target_regions,
        tHdp=args.templateHDP,
        cHdp=args.complementHDP,
        cycles=args.cycles)

    print(start_message, file=sys.stdout)

    if not os.path.isfile(args.ref):
        print("Did not find valid reference file", file=sys.stderr)
        sys.exit(1)

    temp_folder = FolderHandler()
    temp_dir_path = temp_folder.open_folder(args.out + "tempFiles_errorCorrection")

    # initialize to input fasta
    reference_sequence_path = args.ref

    # list of alignment files
    fast5s = cull_fast5_files(args.files_dir, args.nb_files)

    for cycle in range(0, args.cycles):
        # index the reference for bwa this is a string with the path to the index
        bwa_ref_index = get_bwa_index(reference_sequence_path, temp_dir_path)

        # unpack the reference sequence
        reference_sequence_string = get_first_sequence(reference_sequence_path)

        alignment_args = {
            "path_to_EC_refs": None,
            "destination": temp_dir_path,
            "stateMachineType": args.stateMachineType,
            "bwa_index": bwa_ref_index,
            "in_templateHmm": args.in_T_Hmm,
            "in_complementHmm": args.in_C_Hmm,
            "in_templateHdp": args.templateHDP,
            "in_complementHdp": args.complementHDP,
            "banded": args.banded,
            "sparse_output": True,
            "threshold": args.threshold,
            "diagonal_expansion": args.diag_expansion,
            "constraint_trim": args.constraint_trim,
            "target_regions": None,
            "degenerate": degenerate_enum(args.degenerate),
        }

        proposals = scan_for_proposals(temp_folder, STEP, reference_sequence_string,
                                       fast5s, alignment_args, args.nb_jobs)
        proposals = group_sites_in_window(proposals, 6)

        print("Cycle {cycle} - Got {nb} sites to check: {sites}".format(
            nb=len(proposals), sites=proposals, cycle=cycle))

        updated_reference_string = update_reference_with_marginal_probs(
            temp_folder, proposals, reference_sequence_string, fast5s,
            alignment_args, args.nb_jobs)

        updated_reference_path = temp_folder.add_file_path(
            "cycle_snapshot.{cycle}.fa".format(cycle=cycle))

        # close the snapshot deterministically: it is read back at the top of
        # the next cycle and copied after the loop
        with open(updated_reference_path, 'w') as snapshot_handle:
            write_fasta("jamison{}".format(cycle), updated_reference_string,
                        snapshot_handle)

        reference_sequence_path = updated_reference_path

    # copy final file
    copyfile(reference_sequence_path, temp_dir_path + args.corrected)

    return
def main(args):
    """Entry point for signalAlign: config-driven or command-line driven.

    With ``args.command == "run"`` the JSON file at ``args.config`` is loaded,
    one ``SignalAlignSample`` is built per configured sample and the samples
    are aligned with ``multithread_signal_alignment_samples``. Otherwise the
    individual command-line options are resolved, the forward/backward
    references are generated when not both supplied, and the selected .fast5
    files are aligned with ``multithread_signal_alignment``.

    Exits with status 1 when the config file, the read source or the BWA
    reference is missing. The elapsed running time is printed at the end.

    :param args: ignored; the arguments are always re-read with ``parse_args()``
    """
    # parse args
    start = timer()

    args = parse_args()
    if args.command == "run":
        if not os.path.exists(args.config):
            print("{config} not found".format(config=args.config))
            # sys.exit rather than the interactive-only exit() helper
            sys.exit(1)
        # run training
        config_args = create_dot_dict(load_json(args.config))

        temp_folder = FolderHandler()
        temp_dir_path = temp_folder.open_folder(
            os.path.join(os.path.abspath(config_args.output_dir), "tempFiles_alignment"))
        temp_dir_path = resolvePath(temp_dir_path)
        print(config_args.output_dir)
        print(temp_dir_path)

        # every sample gets the shared quality threshold and worker count
        sa_args = [
            merge_dicts([
                s,
                {
                    "quality_threshold": config_args.filter_reads,
                    "workers": config_args.job_count
                }
            ]) for s in config_args.samples
        ]

        samples = [
            SignalAlignSample(working_folder=temp_folder, **s) for s in sa_args
        ]
        # keep a copy of the config next to the results
        copyfile(args.config, os.path.join(temp_dir_path, os.path.basename(args.config)))

        # NOTE(review): the HDP check reads `template_hdp_model_path` while the
        # path passed below is `template_hdp_model` - confirm which key the
        # config actually provides
        state_machine_type = "threeState"
        if config_args.template_hdp_model_path is not None:
            state_machine_type = "threeStateHdp"

        alignment_args = create_signalAlignment_args(
            destination=temp_dir_path,
            stateMachineType=state_machine_type,
            in_templateHmm=resolvePath(config_args.template_hmm_model),
            in_complementHmm=resolvePath(config_args.complement_hmm_model),
            in_templateHdp=resolvePath(config_args.template_hdp_model),
            in_complementHdp=resolvePath(config_args.complement_hdp_model),
            diagonal_expansion=config_args.diagonal_expansion,
            constraint_trim=config_args.constraint_trim,
            traceBackDiagonals=config_args.traceBackDiagonals,
            twoD_chemistry=config_args.two_d,
            get_expectations=False,
            path_to_bin=resolvePath(config_args.path_to_bin),
            check_for_temp_file_existance=True,
            threshold=config_args.signal_alignment_args.threshold,
            track_memory_usage=config_args.signal_alignment_args.track_memory_usage,
            embed=config_args.signal_alignment_args.embed,
            event_table=config_args.signal_alignment_args.event_table,
            output_format=config_args.signal_alignment_args.output_format,
            filter_reads=config_args.filter_reads,
            delete_tmp=config_args.signal_alignment_args.delete_tmp)

        multithread_signal_alignment_samples(samples, alignment_args, config_args.job_count,
                                             trim=None, debug=config_args.debug)

        print("\n# signalAlign - finished alignments\n", file=sys.stderr)
        print("\n# signalAlign - finished alignments\n", file=sys.stdout)
        stop = timer()
    else:
        command_line = " ".join(sys.argv[:])
        print(os.getcwd())

        print("Command Line: {cmdLine}\n".format(cmdLine=command_line), file=sys.stderr)
        # get absolute paths to inputs
        args.files_dir = resolvePath(args.files_dir)
        args.forward_reference = resolvePath(args.forward_ref)
        args.backward_reference = resolvePath(args.backward_ref)
        args.out = resolvePath(args.out)
        args.bwa_reference = resolvePath(args.bwa_reference)
        args.in_T_Hmm = resolvePath(args.in_T_Hmm)
        args.in_C_Hmm = resolvePath(args.in_C_Hmm)
        args.templateHDP = resolvePath(args.templateHDP)
        args.complementHDP = resolvePath(args.complementHDP)
        args.fofn = resolvePath(args.fofn)
        args.target_regions = resolvePath(args.target_regions)
        args.ambiguity_positions = resolvePath(args.ambiguity_positions)
        args.alignment_file = resolvePath(args.alignment_file)
        start_message = """ # Starting Signal Align # Aligning files from: {fileDir} # Aligning to reference: {reference} # Aligning maximum of {nbFiles} files # Using model: {model} # Using banding: True # Aligning to regions in: {regions} # Non-default template HMM: {inThmm} # Non-default complement HMM: {inChmm} # Template HDP: {tHdp} # Complement HDP: {cHdp} """.format(
            fileDir=args.files_dir,
            reference=args.bwa_reference,
            nbFiles=args.nb_files,
            inThmm=args.in_T_Hmm,
            inChmm=args.in_C_Hmm,
            model=args.stateMachineType,
            regions=args.target_regions,
            tHdp=args.templateHDP,
            cHdp=args.complementHDP)

        print(start_message, file=sys.stdout)

        if args.files_dir is None and args.fofn is None:
            print("Need to provide directory with .fast5 files of fofn", file=sys.stderr)
            sys.exit(1)

        # os.path.isfile(None) raises TypeError, so test for a missing
        # reference explicitly to reach the intended error message
        if args.bwa_reference is None or not os.path.isfile(args.bwa_reference):
            print("Did not find valid reference file, looked for it {here}".
                  format(here=args.bwa_reference), file=sys.stderr)
            sys.exit(1)

        # make directory to put temporary files
        if not os.path.isdir(args.out):
            print("Creating output directory: {}".format(args.out), file=sys.stdout)
            os.mkdir(args.out)
        temp_folder = FolderHandler()
        temp_dir_path = temp_folder.open_folder(
            os.path.join(os.path.abspath(args.out), "tempFiles_alignment"))
        temp_dir_path = resolvePath(temp_dir_path)
        print(args.out)
        print(temp_dir_path)

        # generate reference sequence if not specified
        if not args.forward_reference or not args.backward_reference:
            args.forward_reference, args.backward_reference = processReferenceFasta(
                fasta=args.bwa_reference,
                work_folder=temp_folder,
                positions_file=args.ambiguity_positions,
                name="")

        # list of read files
        if args.fofn is not None:
            fast5s = [x for x in parseFofn(args.fofn) if x.endswith(".fast5")]
        else:
            fast5s = [
                "/".join([args.files_dir, x])
                for x in os.listdir(args.files_dir)
                if x.endswith(".fast5")
            ]

        # keep only nb_files reads when more were found
        nb_files = args.nb_files
        if nb_files < len(fast5s):
            shuffle(fast5s)
            fast5s = fast5s[:nb_files]

        # return alignment_args
        alignment_args = {
            "destination": temp_dir_path,
            "stateMachineType": args.stateMachineType,
            "bwa_reference": args.bwa_reference,
            "in_templateHmm": args.in_T_Hmm,
            "in_complementHmm": args.in_C_Hmm,
            "in_templateHdp": args.templateHDP,
            "in_complementHdp": args.complementHDP,
            "output_format": args.outFmt,
            "threshold": args.threshold,
            "diagonal_expansion": args.diag_expansion,
            "constraint_trim": args.constraint_trim,
            "degenerate": getDegenerateEnum(args.degenerate),
            "twoD_chemistry": args.twoD,
            "target_regions": args.target_regions,
            "embed": args.embed,
            "event_table": args.event_table,
            "backward_reference": args.backward_reference,
            "forward_reference": args.forward_reference,
            "alignment_file": args.alignment_file,
            "check_for_temp_file_existance": True,
            "track_memory_usage": False,
            "get_expectations": False,
            "perform_kmer_event_alignment": args.perform_kmer_event_alignment,
            "enforce_supported_versions": args.enforce_supported_versions,
            "filter_reads": 7 if args.filter_reads else None,
            "path_to_bin": args.path_to_bin,
            "delete_tmp": args.delete_tmp
        }

        filter_read_generator = None
        # gate on truthiness, matching the "filter_reads" entry above, so a
        # False flag does not switch filtering on
        if args.filter_reads and args.alignment_file and args.readdb and args.files_dir:
            print("[runSignalAlign]:NOTICE: Filtering out low quality reads", file=sys.stdout)

            filter_read_generator = filter_reads_to_string_wrapper(
                filter_reads(args.alignment_file, args.readdb, [args.files_dir],
                             quality_threshold=7, recursive=args.recursive))

        print("[runSignalAlign]:NOTICE: Got {} files to align".format(
            len(fast5s)), file=sys.stdout)
        # setup workers for multiprocessing
        multithread_signal_alignment(
            alignment_args,
            fast5s,
            args.nb_jobs,
            debug=args.DEBUG,
            filter_reads_to_string_wrapper=filter_read_generator)
        stop = timer()
        print("\n# signalAlign - finished alignments\n", file=sys.stderr)
        print("\n# signalAlign - finished alignments\n", file=sys.stdout)

    print("[signalAlign] Complete")
    print("Running Time = {} seconds".format(stop - start))
class SignalAlignment(object):
    """Align one .fast5 read to a reference by running ``./signalMachine``.

    ``run`` writes the read to temporary files, obtains a guide alignment,
    builds the ``signalMachine`` command line from the configured models and
    parameters, executes it, and optionally embeds the resulting table back
    into the .fast5 file.
    """

    def __init__(self, in_fast5, destination, stateMachineType, bwa_index,
                 in_templateHmm, in_complementHmm, in_templateHdp, in_complementHdp,
                 threshold, diagonal_expansion, constraint_trim, degenerate,
                 twoD_chemistry, forward_reference, backward_reference=None,
                 target_regions=None, output_format="full", embed=False,
                 event_table=False):
        """Store the alignment settings.

        Model paths (HMMs and HDPs) that are None or do not name an existing
        file are stored as None.
        """
        self.in_fast5 = in_fast5  # fast5 file to align
        self.destination = destination  # place where the alignments go, should already exist
        self.stateMachineType = stateMachineType  # flag for signalMachine
        self.bwa_index = bwa_index  # index of reference sequence
        self.threshold = threshold  # min posterior probability to keep
        self.diagonal_expansion = diagonal_expansion  # alignment algorithm param
        self.constraint_trim = constraint_trim  # alignment algorithm param
        self.output_format = output_format  # smaller output files
        self.degenerate = degenerate  # set of nucleotides for degenerate characters
        self.twoD_chemistry = twoD_chemistry  # flag for 2D sequencing runs
        self.temp_folder = FolderHandler()  # object for holding temporary files (non-toil)
        self.read_name = self.in_fast5.split("/")[-1][:-6]  # get the name without the '.fast5'
        self.target_regions = target_regions
        self.output_formats = {"full": 0, "variantCaller": 1, "assignments": 2}
        self.embed = embed  # embed the output into the fast5 file
        self.event_table = event_table  # specify which event table to use to generate alignments
        self.backward_reference = backward_reference  # fasta path to backward reference if modified bases are used
        self.forward_reference = forward_reference  # fasta path to forward reference

        # models and HDPs are only kept when they point at an existing file
        self.in_templateHmm = self._file_or_none(in_templateHmm)
        self.in_complementHmm = self._file_or_none(in_complementHmm)
        self.in_templateHdp = self._file_or_none(in_templateHdp)
        self.in_complementHdp = self._file_or_none(in_complementHdp)

    @staticmethod
    def _file_or_none(path):
        """Return ``path`` when it names an existing file, otherwise None."""
        if (path is not None) and os.path.isfile(path):
            return path
        return None

    def run(self, get_expectations=False):
        """Align the read, or collect expectations for model training.

        :param get_expectations: when True, write ``.template.expectations`` /
            ``.complement.expectations`` files instead of a posteriors file
        :return: True when the command was run (and, with ``embed``, the output
            embedded); False when the read, guide alignment, complement model
            or output format check failed
        :raises AssertionError: when required HMM/HDP/reference files are missing
        """
        print("[SignalAlignment.run]INFO: Starting on {read}".format(
            read=self.in_fast5), file=sys.stderr)
        if get_expectations:
            assert self.in_templateHmm is not None and self.in_complementHmm is not None, \
                "Need HMM files for model training"
        # file checks
        if os.path.isfile(self.in_fast5) is False:
            print("[SignalAlignment.run]ERROR: Did not find .fast5 at{file}".
                  format(file=self.in_fast5))
            return False

        self.openTempFolder("tempFiles_%s" % self.read_name)
        npRead_ = self.addTempFilePath("temp_%s.npRead" % self.read_name)
        # TODO check whether this is correct for RNA, given its 3'-5' mapping
        npRead = NanoporeRead(fast_five_file=self.in_fast5,
                              twoD=self.twoD_chemistry,
                              event_table=self.event_table)
        # the handle is closed even if Write raises
        with open(npRead_, "w") as fH:
            ok = npRead.Write(parent_job=None, out_file=fH, initialize=True)
        if not ok:
            self.failStop(
                "[SignalAlignment.run]File: %s did not pass initial checks" % self.read_name,
                npRead)
            return False

        read_label = npRead.read_label  # use this to identify the read throughout
        read_fasta_ = self.addTempFilePath("temp_seq_%s.fa" % read_label)
        temp_samfile_ = self.addTempFilePath("temp_sam_file_%s.sam" % read_label)
        cigar_file_ = self.addTempFilePath("temp_cigar_%s.txt" % read_label)
        if self.twoD_chemistry:
            ok, version, pop1_complement = self.prepare_twod(
                nanopore_read=npRead, twod_read_path=read_fasta_)
        else:
            ok, version, _ = self.prepare_oned(nanopore_read=npRead,
                                               oned_read_path=read_fasta_)
            pop1_complement = None

        # without a read fasta there is nothing to build a guide alignment from
        if not ok:
            self.failStop(
                "[SignalAlignment.run]ERROR could not prepare read sequence for %s"
                % self.read_name, npRead)
            return False

        # add an indicator for the model being used
        if self.stateMachineType == "threeState":
            model_label = ".sm"
            stateMachineType_flag = ""
        elif self.stateMachineType == "threeStateHdp":
            model_label = ".sm3Hdp"
            stateMachineType_flag = "--sm3Hdp "
            if self.twoD_chemistry:
                assert (self.in_templateHdp is not None) and (self.in_complementHdp is not None), \
                    "Need to provide HDPs"
            else:
                assert self.in_templateHdp is not None, "Need to provide Template HDP"
        else:  # make invalid stateMachine control?
            model_label = ".sm"
            stateMachineType_flag = ""

        guide_alignment = generateGuideAlignment(
            bwa_index=self.bwa_index,
            query=read_fasta_,
            temp_sam_path=temp_samfile_,
            target_regions=self.target_regions)
        # ok = guide_alignment.validate(list(self.reference_map.keys()))
        ok = guide_alignment.validate()
        if not ok:
            self.failStop("[SignalAlignment.run]ERROR getting guide alignment", npRead)
            return False

        with open(cigar_file_, "w") as cig_handle:
            cig_handle.write(guide_alignment.cigar + "\n")

        # next section makes the output file name with the format:
        # /directory/for/files/file.model.orientation.tsv
        # (the path stays empty when the strand is neither "+" nor "-")
        posteriors_file_path = ''
        if guide_alignment.strand in ("+", "-"):
            if self.output_format == "full":
                if guide_alignment.strand == "+":
                    suffix = ".forward.tsv"
                else:
                    suffix = ".backward.tsv"
            elif self.output_format == "variantCaller":
                suffix = ".tsv"
            else:
                suffix = ".assignments"
            posteriors_file_path = self.destination + read_label + model_label + suffix

        # Alignment/Expectations routine
        path_to_signalAlign = "./signalMachine"

        # flags

        # input (match) models: fall back to version defaults when not given
        if self.in_templateHmm is None:
            self.in_templateHmm = defaultModelFromVersion(strand="template", version=version)
        if self.twoD_chemistry:
            if self.in_complementHmm is None:
                self.in_complementHmm = defaultModelFromVersion(
                    strand="complement", version=version,
                    pop1_complement=pop1_complement)

        assert self.in_templateHmm is not None
        if self.twoD_chemistry:
            if self.in_complementHmm is None:
                self.failStop(
                    "[SignalAlignment.run]ERROR Need to have complement HMM for 2D analysis",
                    npRead)
                return False

        template_model_flag = "-T {} ".format(self.in_templateHmm)
        if self.twoD_chemistry:
            complement_model_flag = "-C {} ".format(self.in_complementHmm)
        else:
            complement_model_flag = ""

        print("[SignalALignment.run]NOTICE: template model {t} complement model {c}".format(
            t=self.in_templateHmm, c=self.in_complementHmm), file=sys.stderr)

        # reference sequences
        assert os.path.isfile(self.forward_reference)
        forward_ref_flag = "-f {f_ref} ".format(f_ref=self.forward_reference)
        if self.backward_reference:
            assert os.path.isfile(self.backward_reference)
            backward_ref_flag = "-b {b_ref} ".format(b_ref=self.backward_reference)
        else:
            backward_ref_flag = ""

        # input HDPs: each flag is only emitted for an HDP that is present, so
        # a missing template HDP never ends up as the literal "-v None"
        hdp_flags = ""
        if self.in_templateHdp is not None:
            hdp_flags += "-v {tHdp_loc} ".format(tHdp_loc=self.in_templateHdp)
        if self.twoD_chemistry and self.in_complementHdp is not None:
            hdp_flags += "-w {cHdp_loc} ".format(cHdp_loc=self.in_complementHdp)

        # threshold
        if self.threshold is not None:
            threshold_flag = "-D {threshold} ".format(threshold=self.threshold)
        else:
            threshold_flag = ""

        # diagonal expansion
        if self.diagonal_expansion is not None:
            diag_expansion_flag = "-x {expansion} ".format(expansion=self.diagonal_expansion)
        else:
            diag_expansion_flag = ""

        # constraint trim
        if self.constraint_trim is not None:
            trim_flag = "-m {trim} ".format(trim=self.constraint_trim)
        else:
            trim_flag = ""

        # output format
        if self.output_format not in self.output_formats:
            self.failStop(
                "[SignalAlignment.run]ERROR illegal output format selected %s" % self.output_format)
            return False
        out_fmt = "-s {fmt} ".format(fmt=self.output_formats[self.output_format])

        # degenerate nucleotide information
        if self.degenerate is not None:
            degenerate_flag = "-o {} ".format(self.degenerate)
        else:
            degenerate_flag = ""

        if self.twoD_chemistry:
            twoD_flag = "--twoD"
        else:
            twoD_flag = ""

        # commands
        if get_expectations:
            template_expectations_file_path = self.destination + read_label + ".template.expectations"
            complement_expectations_file_path = self.destination + read_label + ".complement.expectations"

            command = (
                "{vA} {td} {degen}{sparse}{model} -q {npRead} "
                "{t_model}{c_model}{thresh}{expansion}{trim} {hdp}-L {readLabel} -p {cigarFile} "
                "-t {templateExpectations} -c {complementExpectations} -n {seq_name} {f_ref_fa} {b_ref_fa}"
            ).format(vA=path_to_signalAlign, model=stateMachineType_flag,
                     cigarFile=cigar_file_, npRead=npRead_, readLabel=read_label,
                     td=twoD_flag,
                     templateExpectations=template_expectations_file_path,
                     hdp=hdp_flags,
                     complementExpectations=complement_expectations_file_path,
                     t_model=template_model_flag, c_model=complement_model_flag,
                     thresh=threshold_flag, expansion=diag_expansion_flag,
                     trim=trim_flag, degen=degenerate_flag, sparse=out_fmt,
                     seq_name=guide_alignment.reference_name,
                     f_ref_fa=forward_ref_flag, b_ref_fa=backward_ref_flag)
        else:
            command = (
                "{vA} {td} {degen}{sparse}{model} -q {npRead} "
                "{t_model}{c_model}{thresh}{expansion}{trim} -p {cigarFile} "
                "-u {posteriors} {hdp}-L {readLabel} -n {seq_name} {f_ref_fa} {b_ref_fa}"
            ).format(vA=path_to_signalAlign, model=stateMachineType_flag,
                     sparse=out_fmt, cigarFile=cigar_file_, readLabel=read_label,
                     npRead=npRead_, td=twoD_flag,
                     t_model=template_model_flag, c_model=complement_model_flag,
                     posteriors=posteriors_file_path, thresh=threshold_flag,
                     expansion=diag_expansion_flag, trim=trim_flag, hdp=hdp_flags,
                     degen=degenerate_flag,
                     seq_name=guide_alignment.reference_name,
                     f_ref_fa=forward_ref_flag, b_ref_fa=backward_ref_flag)

        # run
        # NOTE(review): the command is a shell string and its exit status is
        # ignored; paths containing spaces or shell metacharacters would break it
        print("signalAlign - running command: ", command, end="\n", file=sys.stderr)
        os.system(command)

        if self.embed:
            print("signalAlign - embedding into Fast5 ", file=sys.stderr)

            data = self.read_in_signal_align_tsv(posteriors_file_path,
                                                 file_type=self.output_format)
            npRead = NanoporeRead(fast_five_file=self.in_fast5,
                                  twoD=self.twoD_chemistry,
                                  event_table=self.event_table)
            npRead.Initialize(None)
            signal_align_path = npRead.get_latest_basecall_edition(
                "/Analyses/SignalAlign_00{}", new=False)
            assert signal_align_path, "There is no path in Fast5 file: {}".format(
                "/Analyses/SignalAlign_00{}")
            output_path = npRead._join_path(signal_align_path, self.output_format)
            npRead.write_data(data, output_path)

            # Todo add attributes to signalalign output
            # NOTE(review): the SAM embedding is placed inside this "full"
            # branch as a reconstruction of the original nesting - confirm
            if self.output_format == "full":
                print("signalAlign - writing maximum expected alignment ", file=sys.stderr)
                alignment = mea_alignment_from_signal_align(None, events=data)
                mae_path = npRead._join_path(signal_align_path, "MEA_alignment_labels")
                events = npRead.get_template_events()
                # NOTE(review): `if events` is ambiguous for a multi-element
                # numpy array - confirm what get_template_events returns
                if events:
                    minus = guide_alignment.strand == "-"
                    labels = match_events_with_signalalign(
                        sa_events=alignment,
                        event_detections=np.asanyarray(npRead.template_events),
                        minus=minus,
                        rna=npRead.is_read_rna())
                    npRead.write_data(labels, mae_path)
                with open(temp_samfile_, 'r') as sam_handle:
                    sam_string = sam_handle.read()
                sam_path = npRead._join_path(signal_align_path, "sam")
                npRead.write_data(data=sam_string, location=sam_path, compression=None)

        # self.temp_folder.remove_folder()
        return True

    def prepare_oned(self, nanopore_read, oned_read_path):
        """Write the template read of ``nanopore_read`` as fasta.

        :return: ``(True, version, False)`` on success, ``(False, None, False)``
            when writing the fasta or reading the version raised
        """
        try:
            with open(oned_read_path, "w") as read_file:
                fastaWrite(fileHandleOrFile=read_file,
                           name=nanopore_read.read_label,
                           seq=nanopore_read.template_read)
            version = nanopore_read.version
            return True, version, False
        except Exception as e:
            # still best-effort, but say why the read was rejected
            print("[SignalAlignment.prepare_oned]ERROR: {}".format(e), file=sys.stderr)
            return False, None, False

    def prepare_twod(self, nanopore_read, twod_read_path):
        """Write the 2D alignment-table sequence of ``nanopore_read`` as fasta.

        The read is closed before returning in both outcomes.

        :return: ``(True, version, pop1_complement)`` on success,
            ``(False, None, False)`` when the read has no 2D alignment table
        """
        # check for table to make 'assembled' 2D alignment table fasta with
        if nanopore_read.has2D_alignment_table is False:
            nanopore_read.close()
            return False, None, False
        with open(twod_read_path, "w") as fasta_handle:
            fastaWrite(fileHandleOrFile=fasta_handle,
                       name=nanopore_read.read_label,
                       seq=nanopore_read.alignment_table_sequence)
        pop1_complement = (
            nanopore_read.complement_model_id == "complement_median68pA_pop1.model")
        version = nanopore_read.version
        nanopore_read.close()
        return True, version, pop1_complement

    def openTempFolder(self, temp_dir):
        """Open the temporary folder ``<destination><temp_dir>``."""
        self.temp_folder.open_folder("%s%s" % (self.destination, temp_dir))

    def addTempFilePath(self, path_to_add):
        """Return the path registered for ``path_to_add`` in the temp folder."""
        return self.temp_folder.add_file_path(path_to_add)

    def failStop(self, message, nanopore_read=None):
        """Remove the temp folder, close the read if given, print ``message`` to stderr."""
        self.temp_folder.remove_folder()
        if nanopore_read is not None:
            nanopore_read.close()
        print(message, file=sys.stderr)

    def read_in_signal_align_tsv(self, tsv_path, file_type):
        """Load a signalAlign output table and drop its ``read_file`` column.

        :param tsv_path: path of the whitespace-delimited table
        :param file_type: one of "full", "assignments", "variantCaller"; selects
            the column layout used for parsing
        :return: numpy structured array without the ``read_file`` field
        :raises AssertionError: for any other ``file_type``
        """
        assert file_type in ("full", "assignments", "variantCaller")
        with open(tsv_path, 'r') as tsvin:
            if file_type == "full":
                dtype = [('contig', 'S10'), ('reference_index', int),
                         ('reference_kmer', 'S5'), ('read_file', 'S57'),
                         ('strand', 'S1'), ('event_index', int),
                         ('event_mean', float), ('event_noise', float),
                         ('event_duration', float), ('aligned_kmer', 'S5'),
                         ('scaled_mean_current', float), ('scaled_noise', float),
                         ('posterior_probability', float),
                         ('descaled_event_mean', float),
                         ('ont_model_mean', float), ('path_kmer', 'S5')]
            elif file_type == "assignments":
                dtype = [('k-mer', 'S10'), ('read_file', 'S57'),
                         ('descaled_event_mean', float),
                         ('posterior_probability', float)]
            else:
                dtype = [('event_index', int), ('reference_position', int),
                         ('base', 'S6'), ('posterior_probability', float),
                         ('strand', 'S1'), ('forward_mapped', int),
                         ('read_file', 'S57')]

            event_table = np.loadtxt(tsvin, dtype=dtype)

            def remove_field_name(a, name):
                """Return ``a`` restricted to every field except ``name``."""
                names = list(a.dtype.names)
                if name in names:
                    names.remove(name)
                b = a[names]
                return b

            event_table = remove_field_name(event_table, "read_file")

        return event_table