Пример #1
0
def main(config=None):
    """Build Gaussian mixture models for the configured motif kmers and report the run time.

    :param config: configuration mapping; when None the config path is taken from the
                   command line (parse_args) and read with load_json
    """
    started_at = timer()
    if config is None:
        cli_args = parse_args()
        # refuse to continue when the config path from the command line is missing
        assert os.path.exists(
            cli_args.config), "Config file does not exist: {}".format(cli_args.config)
        config = load_json(cli_args.config)

    args = create_dot_dict(config)

    # the same path is tried as an assignment file first and, if that parser raises
    # ValueError, as an alignment file
    try:
        assignments = parse_assignment_file(args.assignments)
    except ValueError:
        assignments = parse_alignment_file(args.assignments)

    model_h = HmmModel(args.model_path, rna=args.rna)
    if args.target_hmm_model is None:
        target_model = None
    else:
        target_model = HmmModel(args.target_hmm_model,
                                hdp_model_file=args.target_hdp_model,
                                rna=args.rna)

    # union of the kmer pairs produced for every motif; rows become tuples so they can be hashed
    all_kmer_pairs = {
        tuple(row)
        for motif in args.motifs
        for row in get_motif_kmer_pairs(motif_pair=motif, k=model_h.kmer_length)
    }

    data = generate_gaussian_mixture_model_for_motifs(
        model_h,
        assignments,
        all_kmer_pairs,
        args.strand,
        args.output_dir,
        plot=args.plot,
        name="ccwgg",
        target_model=target_model,
        show=args.show)
    # Disabled alternative: re-plot the first row of a previously written distance table.
    # data = pd.read_csv(os.path.join(args.output_dir, "t_distances.tsv"), delimiter="\t")
    # data = data.ix[0]
    # plot_mixture_model_distribution(data["kmer"], data["canonical_model_mean"], data["canonical_model_sd"],
    #                                 data["canonical_mixture_mean"],
    #                                 data["canonical_mixture_sd"], data["modified_mixture_mean"],
    #                                 data["modified_mixture_sd"],
    #                                 data["strand"], kmer_assignments=assignments, save_fig_dir=None)
    elapsed = timer() - started_at
    print("Running Time = {} seconds".format(elapsed), file=sys.stderr)
Пример #2
0
    def setUpClass(cls):
        """Prepare the class-wide fixtures: model paths, loaded models and a temp directory."""
        super(BandedAlignmentTests, cls).setUpClass()
        # HOME is this file's absolute path with its last four components removed
        path_components = os.path.abspath(__file__).split("/")
        cls.HOME = '/'.join(path_components[:-4])

        def under_home(relative_path):
            # resolve a path relative to HOME
            return os.path.join(cls.HOME, relative_path)

        cls.rna_model_file = under_home(
            "models/testModelR9p4_5mer_acgt_RNA.model")
        cls.dna_template_model_file = under_home(
            "models/testModelR9p4_5mer_acegt_template.model")
        cls.rna_model = HmmModel(ont_model_file=cls.rna_model_file)
        cls.dna_model = HmmModel(ont_model_file=cls.dna_template_model_file)

        cls.tmp_directory = tempfile.mkdtemp()

        # emit an empty line on both streams so later output starts on a clean line
        print("")
        print("", file=sys.stderr)
Пример #3
0
def main(config=None):
    """Plot kmer distributions for every configured model through a MultipleModelHandler.

    :param config: configuration mapping; when None the config path is taken from the
                   command line (parse_args) and read with load_json
    """
    if config is None:
        cli_args = parse_args()
        # refuse to continue when the config path from the command line is missing
        assert os.path.exists(
            cli_args.config), "Config file does not exist: {}".format(cli_args.config)
        config = load_json(cli_args.config)

    args = create_dot_dict(config)

    # parallel lists: entry i of each list belongs to the i-th configured model
    models = []
    kmer_lists = []
    assignment_data = []
    strands = []
    max_plots = 0
    for model_cfg in args.models:
        models.append(
            HmmModel(ont_model_file=model_cfg.ont_model,
                     hdp_model_file=model_cfg.hdp_model,
                     nanopolish_model_file=model_cfg.nanopolish_model,
                     rna=model_cfg.rna,
                     name=model_cfg.name))
        kmers_for_model = model_cfg.kmers
        kmer_count = len(kmers_for_model)
        kmer_lists.append(kmers_for_model)
        max_plots = max(max_plots, kmer_count)

        if model_cfg.builtAlignment_tsv is None:
            # keep the lists aligned even when a model has no event data
            assignment_data.append(None)
        else:
            assert os.path.exists(model_cfg.builtAlignment_tsv), \
                "builtAlignment_tsv does not exist: {}".format(model_cfg.builtAlignment_tsv)
            # try the assignment format first, fall back to the alignment format on ValueError
            try:
                parsed = parse_assignment_file(model_cfg.builtAlignment_tsv)
            except ValueError:
                parsed = parse_alignment_file(model_cfg.builtAlignment_tsv)
            assignment_data.append(parsed)
        strands.append(model_cfg.strand)

    mmh = MultipleModelHandler(models,
                               strands=strands,
                               assignment_data=assignment_data,
                               savefig_dir=args.save_fig_dir)
    if args.summary_distance:
        mmh.plot_all_model_comparisons()

    # one plot per position across the kmer lists; zip_longest pads shorter lists with None
    for kmers_at_position in zip_longest(*kmer_lists):
        mmh.plot_kmer_distribution(kmers_at_position)

    if args.save_fig_dir:
        # store the configuration that was used next to the figures
        config_copy_path = os.path.join(args.save_fig_dir,
                                        "compare_trained_models_config.json")
        save_json(args, config_copy_path)
Пример #4
0
def split_assignment_file(assignment_file_path,
                          my_dirs,
                          alphabet,
                          kmer_length,
                          alphabet_size,
                          min_prob=0.0,
                          alignment=False,
                          remove=False):
    """Split the rows of an assignment file by kmer and strand and write each group to its own directory.

    Rows are grouped into ``2 * alphabet_size**kmer_length`` buckets: bucket ``i`` holds rows whose
    kmer index is ``i`` and whose strand is not "c", bucket ``i + alphabet_size**kmer_length`` holds
    the rows for the same kmer on strand "c". Bucket ``i`` is written to ``my_dirs[i]`` under the
    basename of the input file; empty buckets produce no file. Buckets and directories are paired
    with zip, so buckets beyond the end of ``my_dirs`` are not written.

    Blank (whitespace-only) lines are skipped.

    :param assignment_file_path: path to assignment file
    :param my_dirs: list of directories to write, ordered by bucket index
    :param alphabet: kmer alphabet
    :param kmer_length: kmer length
    :param alphabet_size: size of alphabet
    :param min_prob: minimum probability; rows with a lower probability are dropped
    :param alignment: if set, read kmer/strand/mean/probability from columns 9/4/13/12 (0-based)
                      of an alignment file instead of columns 0/1/2/3 of an assignment file
    :param remove: if set, delete the input file after the split files are written
    :return: True
    """
    basename = os.path.basename(assignment_file_path)
    # number of distinct kmers; also the offset of the "c" strand buckets
    n_kmers = alphabet_size**kmer_length
    data = [[] for _ in range(n_kmers * 2)]
    if alignment:
        kmer_index, strand_index, mean_index, prob_index = 9, 4, 13, 12
    else:
        kmer_index, strand_index, mean_index, prob_index = 0, 1, 2, 3

    with open(assignment_file_path, "r") as fh:
        for line in fh:
            split_line = line.split()
            if not split_line:
                # blank line (e.g. trailing newline at end of file): nothing to index into
                continue
            if not float(split_line[prob_index]) >= min_prob:
                continue
            kmer = split_line[kmer_index]
            strand = split_line[strand_index]
            outline = [kmer, strand, split_line[mean_index], split_line[prob_index]]
            k_index = HmmModel._get_kmer_index(kmer, alphabet, kmer_length,
                                               alphabet_size)
            if strand == "c":
                k_index += n_kmers
            data[k_index].append(outline)

    for directory, kmer_data in zip(my_dirs, data):
        if kmer_data:
            with open(os.path.join(directory, basename), "w") as fh2:
                fh2.writelines("\t".join(row) + "\n" for row in kmer_data)
    if remove:
        os.remove(assignment_file_path)
    return True
Пример #5
0
def main():
    """Align the first events of one hard-coded DNA read with the adaptive banded aligner.

    The first 30 events and the first 25 bases of the template basecall are aligned and the
    result is passed to the fast5 handle's set_new_event_table with overwrite=True.
    """
    # ------------------------------------------------------------------
    # Disabled RNA experiment (kept for reference)
    # ------------------------------------------------------------------
    # fast5_path = "/Users/andrewbailey/CLionProjects/nanopolish/ucsc_run5_20170922_directRNA/ucsc_run5_20170922_directRNA_fast5/DEAMERNANOPORE_20170922_FAH26525_MN16450_sequencing_run_MA_821_R94_NA12878_mRNA_09_22_17_67136_read_744_ch_111_strand.fast5"
    # nucleotide_seq = "CCUGAAAGCAAUACCUGAUGGAGGCAGCAACAAAGUGUUCCUGGCCAAGUAACCUCGAGAGCUACUUUGACCGUCUGUCUAUCAGGAUGAGAUCGCUGGUGCAUUGAAGGCCUACGAGAAAAUUUUACUGAGGCCACCCAGAACUUCAACACCAAAAGAUGACAGACUACGCCAAGAGGUGAGUGUCCUGGGCCCAACAACUACGGAUAGUUUUUGCCAGCCAGCAGAAGCCGGACACCAUUCCCACAGAACUGGCCAAACGGGUUCGAGUUAUGCCGGCAGCUGGAGAUGAAACCGAUCGUCUGAGCCCCGGGCACUGGUGGGCGGGCAGGGUCUACAAACAGUUCCGCAAGGUCCAAAGGUGGACGUCCAUCCUAAAGCCAAGC"
    # TODO RNA model files are 3' to 5' but in nanopolish they are 5' to 3'

    # fast5_path = "/Users/andrewbailey/CLionProjects/nanopore-RNN/test_files/minion-reads/rna_reads/one_rna/DEAMERNANOPORE_20170922_FAH26525_MN16450_sequencing_run_MA_821_R94_NA12878_mRNA_09_22_17_67136_read_36_ch_218_strand.fast5"
    # nucleotide_seq = "UGCACAUUUAGCAUUGGCUGCUGUUGGUUUAGGCCGUCCGAACCUGGGCAGGAAGAUGUGGCCGCAAAGAAGACGAAAAGUCUGGAGUCGAUCAACUCUAGGCCUUAUUGCUAUGAAAGUGGGAAGUACGUCCUGGGUACAAGCAGACUCUGAAGAUGAUGAUCAGACACCAAGGGCAAGCGAAAUUGGUCAUUCGCUAACAACUGCCCAGCUUUGAGGAAAAUCUGAAAUAGAGUACUAUGUUAUGUUGGCUAAAACUGGUGCACAUCACUACGGUGGCAAUAAUAUUGAACUGCUGGGCACAGCAUCGGAAAGCUACUACAGAGUGCGCCAUUGGCUAUCAUUGAUCCAGGGGUGACUUGACCAUUAGAAGCUGCCAGAAAGACUGGUGAAAGUAAACCACACAAAAUUUUCAGCAAACUUCUAAACCUGCAUAAAAAUUCUUUAAUAAAUUCUGCUUGUUAAAAUUCCUCCAUCCUCCAUUCAUCCAUAUUAUCAUAUCAUAUCCCUUACCUAUCCUACAAAAUCCAA"
    #
    # nucleotide_seq_3_to_5 = nucleotide_seq[::-1].replace("U", "T")
    # nucleotide_seq_3_to_5 = nucleotide_seq.replace("U", "T")

    # create Fast5 object
    # RNA_MINKNOW = dict(window_lengths=(7, 14), thresholds=(2.5, 9.0), peak_height=1.0)
    # event_table, f5fh = create_minknow_events_from_fast5(fast5_path, **RNA_MINKNOW)
    # fast5_path = "/Users/andrewbailey/CLionProjects/nanopore-RNN/submodules/signalAlign/tests/minion_test_reads/canonical_ecoli_R9/miten_PC_20160820_FNFAD20259_MN17223_mux_scan_AMS_158_R9_WGA_Ecoli_08_20_16_83098_ch138_read23_strand.fast5"
    #
    # model_file = "/Users/andrewbailey/CLionProjects/nanopore-RNN/submodules/signalAlign/models/testModelR9p4_5mer_acgt_RNA.model"

    # print(event_table)
    # fastq = f5fh.get_fastq(analysis="Basecall_1D", section="template")
    # sequence = fastq.split()[1]
    #
    # model_types = ["threeState", "threeStateHdp"]
    # model = HmmModel(model_types[0], model_file)
    # create events
    # events, sum_emission = simple_banded_event_align(event_table, model, nucleotide_seq_3_to_5)
    # events, sum_emission = simple_banded_event_align(event_table, model, nucleotide_seq_3_to_5)

    # embed
    # name = "SimpleBandedAlignment_00{}"
    # keys = ["log(total_probability)", "time_stamp"]
    # values = [sum_emission, TimeStamp().posix_date()]
    # attributes = dict(zip(keys, values))
    # # f5fh = Fast5(fast5_path, read='r+')
    # f5fh.set_new_event_table(name, events, attributes, overwrite=False)

    # ------------------------------------------------------------------
    # Active DNA run
    # ------------------------------------------------------------------
    dna_model_path = "/Users/andrewbailey/CLionProjects/nanopore-RNN/submodules/signalAlign/models/testModelR9_5mer_acgt_template.model"
    dna_model = HmmModel(dna_model_path)

    read_path = "/Users/andrewbailey/CLionProjects/nanopore-RNN/submodules/signalAlign/tests/minion_test_reads/1D/LomanLabz_PC_20161025_FNFAB42699_MN17633_sequencing_run_20161025_E_coli_native_450bps_82361_ch92_read1108_strand.fast5"
    minknow_events, fast5_handle = create_minknow_events_from_fast5(read_path)

    # the sequence is the second line of the template basecall fastq record
    template_fastq = fast5_handle.get_fastq(analysis="Basecall_1D", section="template")
    template_sequence = template_fastq.split('\n')[1]

    # only a short prefix of the read is aligned: 30 events against 25 bases
    aligned_events, emission_sum = adaptive_banded_simple_event_align(
        minknow_events[:30], dna_model, template_sequence[:25], debug=False)

    # embed the alignment together with its score and a time stamp
    table_attributes = {
        "log(total_probability)": emission_sum,
        "time_stamp": TimeStamp().posix_date(),
    }
    fast5_handle.set_new_event_table("AdaptiveBandedAlignment_00{}",
                                     aligned_events,
                                     table_attributes,
                                     overwrite=True)
Пример #6
0
def main():
    """Plot the distribution of a single kmer, or of every kmer in the model.

    Event data is optional: when a build alignment file is given, its first line decides
    which parser reads it (4 columns -> assignment file, 15 columns -> alignment file).
    """
    args = parse_args()
    model_handle = HmmModel(args.ont_model, args.hdp_model, rna=args.rna)

    alignment_data = None
    if args.build_alignment_file:
        # column count of the first line -> parser; any other count leaves the data as None
        parser_by_column_count = {
            4: parse_assignment_file,
            15: parse_alignment_file,
        }
        with open(args.build_alignment_file, 'r') as fh:
            first_line_columns = len(fh.readline().split())
            selected_parser = parser_by_column_count.get(first_line_columns)
            if selected_parser is not None:
                alignment_data = selected_parser(args.build_alignment_file)

    # either the one requested kmer or, with --all_kmers, every kmer the model knows
    if args.kmer:
        kmers_to_plot = (args.kmer,)
    else:
        assert args.all_kmers, "Must pick a single kmer to plot using --kmer or pass the flag --all_kmers"
        kmers_to_plot = model_handle.sorted_kmer_tuple

    for kmer in kmers_to_plot:
        model_handle.plot_kmer_distribution(kmer,
                                            alignment_file_data=alignment_data,
                                            savefig_dir=args.output_dir)
def main():
    """Run convert_csv_to_sa_model over every csv file of a directory via run_service.

    The transition probabilities handed to each conversion come from the base model.
    """
    args = parse_args()
    print(args)
    # both the input and the output location must already be directories
    for required_dir in (args.dir, args.output_dir):
        assert os.path.isdir(required_dir), "{} is not a directory".format(
            required_dir)
    assert os.path.exists(args.base_model), "{} does not exist".format(
        args.base_model)

    csv_files = list_dir(args.dir, ext="csv")
    base_model = HmmModel(args.base_model, rna=args.rna)

    # arguments shared by every conversion job
    shared_job_args = {
        "output_dir": args.output_dir,
        "transition_probs": base_model.transitions,
        "state_number": 3,
        "rna": args.rna
    }
    conversion_service = BasicService(
        convert_csv_to_sa_model,
        service_name="multiprocess_convert_csv_to_sa_model")
    # each csv file is passed to a job under the "csv_file" key; the results are not used here
    _total, _failure, _messages, _output = run_service(conversion_service.run,
                                                       csv_files,
                                                       shared_job_args,
                                                       ["csv_file"],
                                                       args.num_threads)
Пример #8
0
 def load_hmm_models(self):
     """Load the HMM models used for training and copy every model file into the working folder.

     The template HMM is always loaded; the complement HMM is loaded only when ``self.two_d``
     is set. Each model file (HMM and, when configured, HDP) is copied into
     ``self.working_folder`` and the corresponding ``*_model_path`` attribute is repointed to
     that copy. ``self.kmer_length`` and ``self.alphabet`` are taken from the template model.
     A configured template HDP model switches ``self.state_machine_type`` to "threeStateHdp".

     :raises AssertionError: if a required model path is missing or does not exist, or if the
                             template and complement models disagree on kmer length or alphabet
     """
     # load template model
     assert self.template_hmm_model_path, "Missing template model %s" % (
         self.template_hmm_model_path)
     self.template_hmm_model_path = os.path.abspath(
         self.template_hmm_model_path)
     self.template_model = HmmModel(self.template_hmm_model_path)
     new_template_hmm = self.working_folder.add_file_path(
         "template_trained.hmm")
     copyfile(self.template_hmm_model_path, new_template_hmm)
     assert os.path.exists(
         new_template_hmm), "Problem copying default model to {}".format(
             new_template_hmm)
     self.template_hmm_model_path = new_template_hmm
     # set alphabet and kmer_length
     self.kmer_length = self.template_model.kmer_length
     self.alphabet = self.template_model.alphabet
     # load complement model if 2D
     if self.two_d:
         assert self.complement_hmm_model_path, "Missing complement model: {}".format(
             self.complement_hmm_model_path)
         self.complement_hmm_model_path = os.path.abspath(
             self.complement_hmm_model_path)
         self.complement_model = HmmModel(self.complement_hmm_model_path)
         new_complement_hmm = self.working_folder.add_file_path(
             "complement_trained.hmm")
         copyfile(self.complement_hmm_model_path, new_complement_hmm)
         assert os.path.exists(
             new_complement_hmm
         ), "Problem copying default model to {}".format(new_complement_hmm)
         self.complement_hmm_model_path = new_complement_hmm
         # make sure models match; format arguments follow the order of the labels in the message
         assert self.complement_model.kmer_length == self.template_model.kmer_length, \
             "Template model and complement model kmer lengths do not match." \
             " template: {} != complement: {}".format(self.template_model.kmer_length,
                                                      self.complement_model.kmer_length)
         assert self.complement_model.alphabet == self.template_model.alphabet, \
             "Template model and complement model alphabets do not match." \
             " template: {} != complement: {}".format(self.template_model.alphabet,
                                                      self.complement_model.alphabet)
     # get the input HDP models, if they can be found
     if self.template_hdp_model_path:
         self.state_machine_type = "threeStateHdp"
         assert os.path.exists(self.template_hdp_model_path), \
             "Template HDP path not found {}".format(self.template_hdp_model_path)
         self.template_hdp_model_path = os.path.abspath(
             self.template_hdp_model_path)
         new_template_hdp = self.working_folder.add_file_path("{}".format(
             os.path.basename(self.template_hdp_model_path)))
         copyfile(self.template_hdp_model_path, new_template_hdp)
         # repoint the *template* path at its working copy; writing it to the complement
         # attribute would discard the configured complement HDP model
         self.template_hdp_model_path = new_template_hdp
     # same for complement hdp
     if self.complement_hdp_model_path and self.two_d:
         assert os.path.exists(self.complement_hdp_model_path), \
             "Complement HDP path not found {}".format(self.complement_hdp_model_path)
         self.complement_hdp_model_path = os.path.abspath(
             self.complement_hdp_model_path)
         new_complement_hdp = \
             self.working_folder.add_file_path("{}".format(os.path.basename(self.complement_hdp_model_path)))
         copyfile(self.complement_hdp_model_path, new_complement_hdp)
         self.complement_hdp_model_path = new_complement_hdp
Пример #9
0
    def setUpClass(cls):
        """Prepare the class-wide fixtures: test data paths, a loaded model and default training args."""
        super(TrainSignalAlignTest, cls).setUpClass()
        # HOME is this file's absolute path with its last four components removed
        cls.HOME = '/'.join(os.path.abspath(__file__).split("/")[:-4])

        def under_home(relative_path):
            # resolve a path relative to HOME
            return os.path.join(cls.HOME, relative_path)

        cls.reference = under_home("tests/test_sequences/pUC19_SspI_Zymo.fa")
        cls.ecoli_reference = under_home(
            "tests/test_sequences/E.coli_K12.fasta")

        cls.fast5_dir = under_home("tests/minion_test_reads/canonical_ecoli_R9")
        cls.files = [
            "miten_PC_20160820_FNFAD20259_MN17223_mux_scan_AMS_158_R9_WGA_Ecoli_08_20_16_83098_ch138_read23_strand.fast5",
            "miten_PC_20160820_FNFAD20259_MN17223_sequencing_run_AMS_158_R9_WGA_Ecoli_08_20_16_43623_ch101_read456_strand.fast5",
            "miten_PC_20160820_FNFAD20259_MN17223_sequencing_run_AMS_158_R9_WGA_Ecoli_08_20_16_43623_ch101_read544_strand1.fast5",
            "miten_PC_20160820_FNFAD20259_MN17223_sequencing_run_AMS_158_R9_WGA_Ecoli_08_20_16_43623_ch103_read333_strand1.fast5"
        ]
        # every regular file directly inside fast5_dir
        dir_entries = (os.path.join(cls.fast5_dir, entry)
                       for entry in os.listdir(cls.fast5_dir))
        cls.fast5_paths = [path for path in dir_entries if os.path.isfile(path)]

        cls.model_file = under_home("models/testModelR9p4_5mer_acgt_RNA.model")
        cls.r9_complement_model_file = under_home(
            "models/testModelR9_acegt_complement.model")
        cls.r9_template_model_file = under_home(
            "models/testModelR9_acegt_template.model")

        cls.model = HmmModel(model_file=cls.model_file)
        cls.expectation_file = under_home(
            "tests/test_expectation_files/4f9a316c-8bb3-410a-8cfc-026061f7e8db.template.expectations.tsv"
        )
        cls.assignment_file = under_home(
            "tests/test_assignment_files/d6160b0b-a35e-43b5-947f-adaa1abade28.sm.assignments.tsv"
        )

        cls.path_to_bin = under_home("bin")
        # HDP type name -> integer id; ids are the positions in this tuple
        hdp_type_names = (
            "singleLevelFixed",
            "singleLevelPrior",
            "multisetFixed",
            "multisetPrior",
            "compFixed",
            "compPrior",
            "middleNtsFixed",
            "middleNtsPrior",
            "groupMultisetFixed",
            "groupMultisetPrior",
            "singleLevelPrior2",
            "multisetPrior2",
            "multisetPriorEcoli",
            "singleLevelPriorEcoli",
            "singleLevelFixedCanonical",
        )
        cls.hdp_types = {
            type_name: type_id
            for type_id, type_name in enumerate(hdp_type_names)
        }
        cls.test_hdp_training_data = under_home(
            "tests/test_hdp/test_hdp_alignment.txt")
        cls.one_file_dir = under_home(
            "tests/minion_test_reads/one_R9_canonical_ecoli")
        cls.config_file = under_home("tests/trainModels-config.json")

        # default arguments start from the config file and are then pointed at the test data
        cls.default_args = create_dot_dict(load_json(cls.config_file))
        defaults = cls.default_args
        defaults.path_to_bin = cls.path_to_bin
        defaults.output_dir = cls.path_to_bin
        first_sample = defaults.samples[0]
        first_sample.fast5_dirs = [cls.one_file_dir]
        first_sample.bwa_reference = cls.ecoli_reference

        cls.r9_complement_model_file_acgt = under_home(
            "models/testModelR9_5mer_acgt_complement.model")
        cls.r9_template_model_file_acgt = under_home(
            "models/testModelR9_5mer_acgt_template.model")

        defaults.complement_hmm_model = cls.r9_complement_model_file_acgt
        defaults.template_hmm_model = cls.r9_template_model_file_acgt