def main(config=None):
    """Build Gaussian mixture models for motif kmers from labelled ONT nanopore reads.

    :param config: optional configuration mapping; when None, the JSON config file named on the
                   command line (via parse_args) is loaded instead
    """
    t_start = timer()
    if config is None:
        cli_args = parse_args()
        # the config path comes from the command line and has to exist before it is loaded
        assert os.path.exists(
            cli_args.config), "Config file does not exist: {}".format(cli_args.config)
        config = load_json(cli_args.config)
    args = create_dot_dict(config)

    # the input may be an assignment file or a full alignment file: try the former first
    try:
        assignments = parse_assignment_file(args.assignments)
    except ValueError:
        assignments = parse_alignment_file(args.assignments)

    model_h = HmmModel(args.model_path, rna=args.rna)
    # an optional second model is handed to the mixture-model routine as `target_model`
    if args.target_hmm_model is None:
        target_model = None
    else:
        target_model = HmmModel(args.target_hmm_model,
                                hdp_model_file=args.target_hdp_model,
                                rna=args.rna)

    # collect the de-duplicated kmer pairs of every configured motif
    all_kmer_pairs = {
        tuple(row)
        for motif in args.motifs
        for row in get_motif_kmer_pairs(motif_pair=motif, k=model_h.kmer_length)
    }

    # NOTE(review): the return value is currently unused; earlier revisions kept commented-out
    # code here that re-read "t_distances.tsv" from the output directory and passed one row to
    # plot_mixture_model_distribution -- confirm whether that is still wanted.
    generate_gaussian_mixture_model_for_motifs(model_h,
                                               assignments,
                                               all_kmer_pairs,
                                               args.strand,
                                               args.output_dir,
                                               plot=args.plot,
                                               name="ccwgg",
                                               target_model=target_model,
                                               show=args.show)

    elapsed = timer() - t_start
    print("Running Time = {} seconds".format(elapsed), file=sys.stderr)
def setUpClass(cls):
    """Set up the model paths, loaded HMM models and temp directory shared by the test class."""
    super(BandedAlignmentTests, cls).setUpClass()

    # HOME is this file's absolute path with its last four "/"-separated components removed
    path_parts = os.path.abspath(__file__).split("/")
    cls.HOME = '/'.join(path_parts[:-4])

    # model files are addressed relative to HOME
    for attribute, relative_path in (
            ("rna_model_file", "models/testModelR9p4_5mer_acgt_RNA.model"),
            ("dna_template_model_file", "models/testModelR9p4_5mer_acegt_template.model"),
    ):
        setattr(cls, attribute, os.path.join(cls.HOME, relative_path))

    cls.rna_model = HmmModel(ont_model_file=cls.rna_model_file)
    cls.dna_model = HmmModel(ont_model_file=cls.dna_template_model_file)
    cls.tmp_directory = tempfile.mkdtemp()

    # clear line for output on both streams
    for stream in (sys.stdout, sys.stderr):
        print("", file=stream)
def main(config=None):
    """Plot kmer distributions of several trained models next to each other.

    :param config: optional configuration mapping; when None, the JSON config file named on the
                   command line (via parse_args) is loaded instead
    """
    if config is None:
        cli_args = parse_args()
        assert os.path.exists(
            cli_args.config), "Config file does not exist: {}".format(cli_args.config)
        config = load_json(cli_args.config)
    args = create_dot_dict(config)

    # one entry per configured model, kept in lockstep across the four lists
    models, kmer_lists, assignment_data, strands = [], [], [], []
    max_plots = 0
    for model in args.models:
        hmm = HmmModel(ont_model_file=model.ont_model,
                       hdp_model_file=model.hdp_model,
                       nanopolish_model_file=model.nanopolish_model,
                       rna=model.rna,
                       name=model.name)
        models.append(hmm)

        kmers = model.kmers
        # running maximum of kmers requested for any single model
        max_plots = max(max_plots, len(kmers))
        kmer_lists.append(kmers)

        tsv_path = model.builtAlignment_tsv
        if tsv_path is None:
            parsed = None
        else:
            assert os.path.exists(tsv_path), \
                "builtAlignment_tsv does not exist: {}".format(tsv_path)
            # the file may be an assignment file or a full alignment file: try both parsers
            try:
                parsed = parse_assignment_file(tsv_path)
            except ValueError:
                parsed = parse_alignment_file(tsv_path)
        assignment_data.append(parsed)
        strands.append(model.strand)

    mmh = MultipleModelHandler(models,
                               strands=strands,
                               assignment_data=assignment_data,
                               savefig_dir=args.save_fig_dir)
    if args.summary_distance:
        mmh.plot_all_model_comparisons()

    # each plot receives the i-th kmer of every model
    # (presumably itertools.zip_longest, which pads shorter kmer lists -- confirm the import)
    for kmer_group in zip_longest(*kmer_lists):
        mmh.plot_kmer_distribution(kmer_group)

    # keep a copy of the configuration next to the saved figures
    if args.save_fig_dir:
        config_copy = os.path.join(args.save_fig_dir, "compare_trained_models_config.json")
        save_json(args, config_copy)
def split_assignment_file(assignment_file_path,
                          my_dirs,
                          alphabet,
                          kmer_length,
                          alphabet_size,
                          min_prob=0.0,
                          alignment=False,
                          remove=False):
    """Split kmers and write to new files

    Rows are grouped per kmer and strand. Slot ``i`` of the grouping holds the rows of kmer
    index ``i`` for every strand other than "c"; slot ``i + alphabet_size**kmer_length`` holds
    the rows of the same kmer on the "c" strand. Slots are paired with `my_dirs` by position
    (extra slots are dropped if `my_dirs` is shorter), and every non-empty slot is written to
    a file named like the input file inside its directory.

    :param assignment_file_path: path to assignment file
    :param my_dirs: list of directories to write, ordered like the slots described above
    :param alphabet: kmer alphabet
    :param kmer_length: kmer length
    :param alphabet_size: size of alphabet
    :param min_prob: minimum probability; rows below it are ignored
    :param alignment: if set will select columns from an alignment file with 14 columns
    :param remove: if set, delete `assignment_file_path` after the split files are written
    :return: True
    """
    basename = os.path.basename(assignment_file_path)
    n_kmers = alphabet_size**kmer_length
    data = [[] for _ in range(n_kmers * 2)]
    # column positions differ between full alignment files and 4-column assignment files
    if alignment:
        kmer_index, strand_index, mean_index, prob_index = 9, 4, 13, 12
    else:
        kmer_index, strand_index, mean_index, prob_index = 0, 1, 2, 3

    with open(assignment_file_path, "r") as fh:
        for line in fh:
            split_line = line.split()
            if not split_line:
                # blank / whitespace-only lines carry no data and used to raise IndexError
                continue
            if float(split_line[prob_index]) < min_prob:
                continue
            kmer = split_line[kmer_index]
            strand = split_line[strand_index]
            outline = [kmer, strand, split_line[mean_index], split_line[prob_index]]
            k_index = HmmModel._get_kmer_index(kmer, alphabet, kmer_length, alphabet_size)
            if strand == "c":
                # complement-strand rows live in the second half of the slots
                k_index += n_kmers
            data[k_index].append(outline)

    for directory, kmer_data in zip(my_dirs, data):
        if kmer_data:
            with open(os.path.join(directory, basename), "w") as fh2:
                fh2.writelines("\t".join(row) + "\n" for row in kmer_data)
    if remove:
        os.remove(assignment_file_path)
    return True
def main():
    """Align the first events of one hard-coded DNA read and embed the result in its fast5.

    Creates MinKNOW-style events from the fast5, aligns the first 30 events against the first
    25 bases of the read's Basecall_1D template sequence with the adaptive banded aligner, and
    writes the resulting event table back into the fast5 file.
    """
    # TODO RNA model files are 3' to 5' but in nanopolish they are 5' to 3'
    # NOTE: an earlier RNA experiment (simple_banded_event_align with
    # testModelR9p4_5mer_acgt_RNA.model, embedded as "SimpleBandedAlignment_00{}") used to be
    # kept here as commented-out code; only the DNA run below is live.

    # DNA -- paths are hard-coded to a developer machine
    model_file = "/Users/andrewbailey/CLionProjects/nanopore-RNN/submodules/signalAlign/models/testModelR9_5mer_acgt_template.model"
    fast5_path = "/Users/andrewbailey/CLionProjects/nanopore-RNN/submodules/signalAlign/tests/minion_test_reads/1D/LomanLabz_PC_20161025_FNFAB42699_MN17633_sequencing_run_20161025_E_coli_native_450bps_82361_ch92_read1108_strand.fast5"

    model = HmmModel(model_file)
    event_table, f5fh = create_minknow_events_from_fast5(fast5_path)

    # the nucleotide sequence is the second line of the fastq record
    fastq = f5fh.get_fastq(analysis="Basecall_1D", section="template")
    nucleotide_seq = fastq.split('\n')[1]

    # only a small prefix of the events and of the sequence is aligned
    events, sum_emission = adaptive_banded_simple_event_align(event_table[:30],
                                                              model,
                                                              nucleotide_seq[:25],
                                                              debug=False)

    # embed
    attributes = {
        "log(total_probability)": sum_emission,
        "time_stamp": TimeStamp().posix_date(),
    }
    f5fh.set_new_event_table("AdaptiveBandedAlignment_00{}", events, attributes, overwrite=True)
def main():
    """Plot the distribution of one kmer, or of every kmer, of a model.

    Reads its options from parse_args(). When a build alignment file is given, its format is
    picked from the number of whitespace-separated columns on its first line: 4 columns are
    parsed as an assignment file, 15 columns as a full alignment file. Any other column count
    leaves the plots without alignment data; a warning is emitted in that case instead of
    dropping the file silently.

    :raises AssertionError: if neither --kmer nor --all_kmers was selected
    """
    import warnings

    args = parse_args()
    # load model files
    model_handle = HmmModel(args.ont_model, args.hdp_model, rna=args.rna)

    alignment_data = None
    if args.build_alignment_file:
        with open(args.build_alignment_file, 'r') as fh:
            n_cols = len(fh.readline().split())
        if n_cols == 4:
            alignment_data = parse_assignment_file(args.build_alignment_file)
        elif n_cols == 15:
            alignment_data = parse_alignment_file(args.build_alignment_file)
        else:
            # previously the file was ignored without any feedback to the user
            warnings.warn(
                "Ignoring {}: expected 4 (assignment) or 15 (alignment) columns, found {}".format(
                    args.build_alignment_file, n_cols))

    # output a single plot or one for each kmer
    if args.kmer:
        model_handle.plot_kmer_distribution(args.kmer,
                                            alignment_file_data=alignment_data,
                                            savefig_dir=args.output_dir)
    else:
        assert args.all_kmers, "Must pick a single kmer to plot using --kmer or pass the flag --all_kmers"
        for kmer in model_handle.sorted_kmer_tuple:
            model_handle.plot_kmer_distribution(kmer,
                                                alignment_file_data=alignment_data,
                                                savefig_dir=args.output_dir)
def main():
    """Run convert_csv_to_sa_model over every csv file of a directory via a worker service.

    Options come from parse_args(): the input directory, the output directory, the base model
    whose transitions are handed to every conversion, the rna flag and the number of threads.
    """
    args = parse_args()
    print(args)

    # validate the command-line paths before doing any work
    assert os.path.isdir(args.dir), "{} is not a directory".format(args.dir)
    assert os.path.isdir(
        args.output_dir), "{} is not a directory".format(args.output_dir)
    assert os.path.exists(
        args.base_model), "{} does not exist".format(args.base_model)

    csv_files = list_dir(args.dir, ext="csv")
    base_model = HmmModel(args.base_model, rna=args.rna)

    # arguments shared by every conversion; the csv file itself is the per-job argument
    shared_kwargs = dict(output_dir=args.output_dir,
                         transition_probs=base_model.transitions,
                         state_number=3,
                         rna=args.rna)
    service = BasicService(convert_csv_to_sa_model,
                           service_name="multiprocess_convert_csv_to_sa_model")
    # the four results are unpacked (as before) but not used further
    total, failure, messages, output = run_service(service.run,
                                                   csv_files,
                                                   shared_kwargs,
                                                   ["csv_file"],
                                                   args.num_threads)
def load_hmm_models(self):
    """Load in the correct models depending on what is going to be trained.

    The template HMM is always loaded; the complement HMM is loaded as well when `self.two_d`
    is set and must agree with the template on kmer length and alphabet. Every model file
    (HMM and, when configured, HDP) is copied into the working folder and the corresponding
    `*_model_path` attribute is re-pointed at that working copy.

    :raises AssertionError: if a required model path is missing, a copy cannot be found
                            afterwards, an HDP path does not exist, or the template and
                            complement models disagree
    """
    # load template model
    assert self.template_hmm_model_path, "Missing template model %s" % (
        self.template_hmm_model_path)
    self.template_hmm_model_path = os.path.abspath(self.template_hmm_model_path)
    self.template_model = HmmModel(self.template_hmm_model_path)
    new_template_hmm = self.working_folder.add_file_path("template_trained.hmm")
    copyfile(self.template_hmm_model_path, new_template_hmm)
    assert os.path.exists(
        new_template_hmm), "Problem copying default model to {}".format(new_template_hmm)
    # from here on the working copy is the template model file
    self.template_hmm_model_path = new_template_hmm
    # set alphabet and kmer_length
    self.kmer_length = self.template_model.kmer_length
    self.alphabet = self.template_model.alphabet

    # load complement model if 2D
    if self.two_d:
        assert self.complement_hmm_model_path, "Missing complement model: {}".format(
            self.complement_hmm_model_path)
        self.complement_hmm_model_path = os.path.abspath(self.complement_hmm_model_path)
        self.complement_model = HmmModel(self.complement_hmm_model_path)
        new_complement_hmm = self.working_folder.add_file_path("complement_trained.hmm")
        copyfile(self.complement_hmm_model_path, new_complement_hmm)
        assert os.path.exists(
            new_complement_hmm), "Problem copying default model to {}".format(new_complement_hmm)
        self.complement_hmm_model_path = new_complement_hmm
        # make sure models match; values are passed in the order of the message labels
        # (template first, complement second)
        assert self.complement_model.kmer_length == self.template_model.kmer_length, \
            "Template model and complement model kmer lengths do not match." \
            " template: {} != complement: {}".format(self.template_model.kmer_length,
                                                     self.complement_model.kmer_length)
        assert self.complement_model.alphabet == self.template_model.alphabet, \
            "Template model and complement model alphabets do not match." \
            " template: {} != complement: {}".format(self.template_model.alphabet,
                                                     self.complement_model.alphabet)

    # get the input HDP models, if they can be found
    if self.template_hdp_model_path:
        self.state_machine_type = "threeStateHdp"
        assert os.path.exists(self.template_hdp_model_path), \
            "Template HDP path not found {}".format(self.template_hdp_model_path)
        self.template_hdp_model_path = os.path.abspath(self.template_hdp_model_path)
        new_template_hdp = self.working_folder.add_file_path("{}".format(
            os.path.basename(self.template_hdp_model_path)))
        copyfile(self.template_hdp_model_path, new_template_hdp)
        # re-point the *template* HDP path at its working copy; this used to overwrite
        # complement_hdp_model_path, which clobbered the configured complement HDP
        self.template_hdp_model_path = new_template_hdp

    # same for complement hdp
    if self.complement_hdp_model_path and self.two_d:
        assert os.path.exists(self.complement_hdp_model_path), \
            "Complement HDP path not found {}".format(self.complement_hdp_model_path)
        self.complement_hdp_model_path = os.path.abspath(self.complement_hdp_model_path)
        new_complement_hdp = self.working_folder.add_file_path("{}".format(
            os.path.basename(self.complement_hdp_model_path)))
        copyfile(self.complement_hdp_model_path, new_complement_hdp)
        self.complement_hdp_model_path = new_complement_hdp
def setUpClass(cls):
    """Set up the file paths, loaded model and default training arguments shared by the tests."""
    super(TrainSignalAlignTest, cls).setUpClass()

    # HOME is this file's absolute path with its last four "/"-separated components removed
    path_parts = os.path.abspath(__file__).split("/")
    cls.HOME = '/'.join(path_parts[:-4])
    home = cls.HOME

    def from_home(relative_path):
        # every fixture below is addressed relative to HOME
        return os.path.join(home, relative_path)

    cls.reference = from_home("tests/test_sequences/pUC19_SspI_Zymo.fa")
    cls.ecoli_reference = from_home("tests/test_sequences/E.coli_K12.fasta")
    cls.fast5_dir = from_home("tests/minion_test_reads/canonical_ecoli_R9")
    cls.files = [
        "miten_PC_20160820_FNFAD20259_MN17223_mux_scan_AMS_158_R9_WGA_Ecoli_08_20_16_83098_ch138_read23_strand.fast5",
        "miten_PC_20160820_FNFAD20259_MN17223_sequencing_run_AMS_158_R9_WGA_Ecoli_08_20_16_43623_ch101_read456_strand.fast5",
        "miten_PC_20160820_FNFAD20259_MN17223_sequencing_run_AMS_158_R9_WGA_Ecoli_08_20_16_43623_ch101_read544_strand1.fast5",
        "miten_PC_20160820_FNFAD20259_MN17223_sequencing_run_AMS_158_R9_WGA_Ecoli_08_20_16_43623_ch103_read333_strand1.fast5",
    ]

    # every regular file found directly inside the fast5 directory
    dir_entries = (os.path.join(cls.fast5_dir, entry) for entry in os.listdir(cls.fast5_dir))
    cls.fast5_paths = [path for path in dir_entries if os.path.isfile(path)]

    cls.model_file = from_home("models/testModelR9p4_5mer_acgt_RNA.model")
    cls.r9_complement_model_file = from_home("models/testModelR9_acegt_complement.model")
    cls.r9_template_model_file = from_home("models/testModelR9_acegt_template.model")
    cls.model = HmmModel(model_file=cls.model_file)

    cls.expectation_file = from_home(
        "tests/test_expectation_files/4f9a316c-8bb3-410a-8cfc-026061f7e8db.template.expectations.tsv")
    cls.assignment_file = from_home(
        "tests/test_assignment_files/d6160b0b-a35e-43b5-947f-adaa1abade28.sm.assignments.tsv")
    cls.path_to_bin = from_home("bin")

    # HDP type name -> integer code; the codes are the positions in this list
    hdp_type_names = [
        "singleLevelFixed",
        "singleLevelPrior",
        "multisetFixed",
        "multisetPrior",
        "compFixed",
        "compPrior",
        "middleNtsFixed",
        "middleNtsPrior",
        "groupMultisetFixed",
        "groupMultisetPrior",
        "singleLevelPrior2",
        "multisetPrior2",
        "multisetPriorEcoli",
        "singleLevelPriorEcoli",
        "singleLevelFixedCanonical",
    ]
    cls.hdp_types = {type_name: code for code, type_name in enumerate(hdp_type_names)}

    cls.test_hdp_training_data = from_home("tests/test_hdp/test_hdp_alignment.txt")
    cls.one_file_dir = from_home("tests/minion_test_reads/one_R9_canonical_ecoli")
    cls.config_file = from_home("tests/trainModels-config.json")

    # default arguments start from the json config and are then pointed at the test fixtures
    cls.default_args = create_dot_dict(load_json(cls.config_file))
    defaults = cls.default_args
    defaults.path_to_bin = cls.path_to_bin
    defaults.output_dir = cls.path_to_bin
    defaults.samples[0].fast5_dirs = [cls.one_file_dir]
    defaults.samples[0].bwa_reference = cls.ecoli_reference

    cls.r9_complement_model_file_acgt = from_home("models/testModelR9_5mer_acgt_complement.model")
    cls.r9_template_model_file_acgt = from_home("models/testModelR9_5mer_acgt_template.model")
    defaults.complement_hmm_model = cls.r9_complement_model_file_acgt
    defaults.template_hmm_model = cls.r9_template_model_file_acgt