Example #1
def aligner(work_queue, done_queue):
    try:
        for f in iter(work_queue.get, 'STOP'):
            alignment = SignalAlignment(**f)
            alignment.run()
    except Exception as e:
        done_queue.put("%s failed with %s" % (current_process().name, e))
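
The `aligner` worker above drains alignment-argument dicts from a shared queue until it reads the 'STOP' sentinel. Below is a minimal, self-contained sketch of that sentinel-driven worker-pool pattern using only the standard library; `do_work` is a purely illustrative stand-in for `SignalAlignment(**f).run()`.

from multiprocessing import Manager, Process, current_process

def do_work(task):
    # illustrative stand-in for SignalAlignment(**task).run()
    return sum(task["values"])

def worker(work_queue, done_queue):
    try:
        # iter(queue.get, 'STOP') yields items until the sentinel is read
        for task in iter(work_queue.get, 'STOP'):
            done_queue.put(do_work(task))
    except Exception as e:
        done_queue.put("%s failed with %s" % (current_process().name, e))

if __name__ == "__main__":
    work_queue = Manager().Queue()
    done_queue = Manager().Queue()
    for i in range(10):
        work_queue.put({"values": list(range(i))})

    jobs = []
    for _ in range(4):
        p = Process(target=worker, args=(work_queue, done_queue))
        p.start()
        jobs.append(p)
        work_queue.put('STOP')   # one sentinel per worker
    for p in jobs:
        p.join()

    done_queue.put('STOP')
    print([r for r in iter(done_queue.get, 'STOP')])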
Example #2
def get_expectations(work_queue, done_queue):
    try:
        for f in iter(work_queue.get, 'STOP'):
            alignment = SignalAlignment(**f)
            alignment.run(get_expectations=True)
    except Exception as e:
        done_queue.put("%s failed with %s" %
                       (current_process().name, e))
Example #3
    def _aggregate_all_variantcalls(self):
        """Aggregate all the variant calls"""
        for v_tsv in self.variant_tsvs:
            if os.stat(v_tsv).st_size == 0:
                continue
            read_name = os.path.basename(v_tsv)
            variant_data = SignalAlignment.read_in_signal_align_tsv(v_tsv, "variantCaller")
            mv_h = MarginalizeVariants(variant_data, variants=self.variants, read_name=read_name)
            mv_h.get_data()
            if self.verbose:
                print(v_tsv)
            self.per_position_data = self.per_position_data.append(mv_h.position_probs, ignore_index=True)
            self.per_read_data = self.per_read_data.append(mv_h.per_read_calls, ignore_index=True)

        return True
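
The aggregation above grows two pandas DataFrames with `DataFrame.append`, which newer pandas releases deprecate (and 2.x removes). A hedged sketch of the same per-read aggregation using `pd.concat`; the toy frames stand in for `mv_h.position_probs` and are not real signalAlign output.

import pandas as pd

def aggregate_frames(per_read_frames):
    # collect the per-read DataFrames first, then concatenate once;
    # this replaces repeated DataFrame.append calls
    frames = [df for df in per_read_frames if not df.empty]
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

# toy per-read frames, purely illustrative
read_a = pd.DataFrame({"position": [10, 11], "p_variant": [0.9, 0.2]})
read_b = pd.DataFrame({"position": [10, 12], "p_variant": [0.1, 0.7]})
print(aggregate_frames([read_a, read_b]))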
Example #4
def main(args):
    # parse args
    args = parse_args()

    command_line = " ".join(sys.argv[:])
    print("Command Line: {cmdLine}\n".format(cmdLine=command_line), file=sys.stderr)

    # get absolute paths to inputs
    args.files_dir           = resolvePath(args.files_dir)
    args.ref                 = resolvePath(args.ref)
    args.out                 = resolvePath(args.out)
    args.bwt                 = resolvePath(args.bwt)
    args.in_T_Hmm            = resolvePath(args.in_T_Hmm)
    args.in_C_Hmm            = resolvePath(args.in_C_Hmm)
    args.templateHDP         = resolvePath(args.templateHDP)
    args.complementHDP       = resolvePath(args.complementHDP)
    args.fofn                = resolvePath(args.fofn)
    args.target_regions      = resolvePath(args.target_regions)
    args.ambiguity_positions = resolvePath(args.ambiguity_positions)
    start_message = """
#   Starting Signal Align
#   Aligning files from: {fileDir}
#   Aligning to reference: {reference}
#   Aligning maximum of {nbFiles} files
#   Using model: {model}
#   Using banding: True
#   Aligning to regions in: {regions}
#   Non-default template HMM: {inThmm}
#   Non-default complement HMM: {inChmm}
#   Template HDP: {tHdp}
#   Complement HDP: {cHdp}
    """.format(fileDir=args.files_dir, reference=args.ref, nbFiles=args.nb_files,
               inThmm=args.in_T_Hmm, inChmm=args.in_C_Hmm, model=args.stateMachineType, regions=args.target_regions,
               tHdp=args.templateHDP, cHdp=args.complementHDP)

    print(start_message, file=sys.stdout)

    if args.files_dir is None and args.fofn is None:
        print("Need to provide directory with .fast5 files of fofn", file=sys.stderr)
        sys.exit(1)

    if not os.path.isfile(args.ref):
        print("Did not find valid reference file, looked for it {here}".format(here=args.ref), file=sys.stderr)
        sys.exit(1)

    # make directory to put temporary files
    temp_folder = FolderHandler()
    temp_dir_path = temp_folder.open_folder(args.out + "/tempFiles_alignment")
    # process the reference fasta into forward and backward reference sequences (optionally substituting ambiguity positions)
    forward_reference, backward_reference = processReferenceFasta(
        fasta=args.ref,
        motif_key=args.motif_key,
        work_folder=temp_folder,
        sub_char=args.ambig_char,
        positions_file=args.ambiguity_positions)

    # index the reference for bwa
    if args.bwt is not None:
        print("[RunSignalAlign]NOTICE - using provided BWT %s" % args.bwt)
        bwa_ref_index = args.bwt
    else:
        print("signalAlign - indexing reference", file=sys.stderr)
        bwa_ref_index = getBwaIndex(args.ref, temp_dir_path)
        print("signalAlign - indexing reference, done", file=sys.stderr)

    # setup workers for multiprocessing
    workers = args.nb_jobs
    work_queue = Manager().Queue()
    done_queue = Manager().Queue()
    jobs = []

    # list of read files
    if args.fofn is not None:
        fast5s = [x for x in parseFofn(args.fofn) if x.endswith(".fast5")]
    else:
        fast5s = ["/".join([args.files_dir, x]) for x in os.listdir(args.files_dir) if x.endswith(".fast5")]

    nb_files = args.nb_files
    if nb_files < len(fast5s):
        shuffle(fast5s)
        fast5s = fast5s[:nb_files]

    # change paths to the source directory
    os.chdir(signalAlignSourceDir())

    print("[runSignalAlign]:NOTICE: Got {} files to align".format(len(fast5s)), file=sys.stdout)
    for fast5 in fast5s:
        print(fast5)
        alignment_args = {
            "destination": temp_dir_path,
            "stateMachineType": args.stateMachineType,
            "bwa_index": bwa_ref_index,
            "in_templateHmm": args.in_T_Hmm,
            "in_complementHmm": args.in_C_Hmm,
            "in_templateHdp": args.templateHDP,
            "in_complementHdp": args.complementHDP,
            "output_format": args.outFmt,
            "in_fast5": fast5,
            "threshold": args.threshold,
            "diagonal_expansion": args.diag_expansion,
            "constraint_trim": args.constraint_trim,
            "degenerate": getDegenerateEnum(args.degenerate),
            "twoD_chemistry": args.twoD,
            "target_regions": args.target_regions,
            "embed": args.embed,
            "event_table": args.event_table,
            "backward_reference": backward_reference,
            "forward_reference": forward_reference
        }
        if args.DEBUG:
            alignment = SignalAlignment(**alignment_args)
            alignment.run()
        else:
            work_queue.put(alignment_args)

    for w in range(workers):
        p = Process(target=aligner, args=(work_queue, done_queue))
        p.start()
        jobs.append(p)
        work_queue.put('STOP')

    for p in jobs:
        p.join()

    done_queue.put('STOP')
    print("\n#  signalAlign - finished alignments\n", file=sys.stderr)
    print("\n#  signalAlign - finished alignments\n", file=sys.stdout)
Example #5
    def setUpClass(cls):
        super(CreateLabelsTest, cls).setUpClass()
        cls.HOME = '/'.join(os.path.abspath(__file__).split("/")[:-4])
        cls.fasta = os.path.join(cls.HOME,
                                 "tests/test_sequences/E.coli_K12.fasta")
        dna_file = os.path.join(cls.HOME,
                                "tests/minion_test_reads/1D/LomanLabz_PC_20161025_FNFAB42699_MN17633_sequencing_run_20161025_E_coli_native_450bps_82361_ch112_read108_strand.fast5")
        rev_dna_file = os.path.join(cls.HOME,
                                    "tests/minion_test_reads/1D/LomanLabz_PC_20161025_FNFAB42699_MN17633_sequencing_run_20161025_E_coli_native_450bps_82361_ch6_read347_strand.fast5")
        rev_rna_file = os.path.join(cls.HOME,
                                "tests/minion_test_reads/RNA_no_events/DEAMERNANOPORE_20170922_FAH26525_MN16450_sequencing_run_MA_821_R94_NA12878_mRNA_09_22_17_67136_read_61_ch_151_strand.fast5")
        forward_rna_file = os.path.join(cls.HOME,
                                "tests/minion_test_reads/RNA_no_events/DEAMERNANOPORE_20170922_FAH26525_MN16450_sequencing_run_MA_821_R94_NA12878_mRNA_09_22_17_67136_read_36_ch_218_strand.fast5")

        rna_reference = os.path.join(cls.HOME, "tests/test_sequences/fake_rna_ref.fa")
        ecoli_dna_reference = os.path.join(cls.HOME, "tests/test_sequences/E.coli_K12.fasta")
        cls.dna_reference_handle = pysam.FastaFile(ecoli_dna_reference)
        cls.rna_reference_handle = pysam.FastaFile(rna_reference)
        cls.tmp_directory = tempfile.mkdtemp()

        # get file locations
        cls.tmp_dna_file = os.path.join(str(cls.tmp_directory), 'test_dna.fast5')
        cls.tmp_dna_file2 = os.path.join(str(cls.tmp_directory), 'test_dna2.fast5')

        cls.tmp_rna_file1 = os.path.join(str(cls.tmp_directory), 'test_rna.fast5')
        cls.tmp_rna_file2 = os.path.join(str(cls.tmp_directory), 'test_rna2.fast5')

        # run signalAlign on one file
        cls.rna_model_file = os.path.join(cls.HOME, "models/testModelR9p4_5mer_acgt_RNA.model")
        cls.dna_model_file_94 = os.path.join(cls.HOME, "models/testModelR9p4_5mer_acegt_template.model")
        cls.rna_sam = os.path.join(cls.HOME, "tests/minion_test_reads/RNA_edge_cases/rna_reads.bam")
        cls.dna_sam = os.path.join(cls.HOME, "tests/minion_test_reads/oneD.bam")
        cls.bin_path = os.path.join(cls.HOME, "bin")
        # kmer index
        cls.kmer_index = 2

        # copy file to tmp directory
        shutil.copy(dna_file, cls.tmp_dna_file)
        shutil.copy(rev_dna_file, cls.tmp_dna_file2)

        shutil.copy(forward_rna_file, cls.tmp_rna_file1)
        shutil.copy(rev_rna_file, cls.tmp_rna_file2)

        args = create_signalAlignment_args(destination=cls.tmp_directory,
                                           in_templateHmm=cls.rna_model_file,
                                           alignment_file=cls.rna_sam,
                                           forward_reference=rna_reference,
                                           embed=True,
                                           path_to_bin=cls.bin_path,
                                           diagonal_expansion=5,
                                           delete_tmp=False)
        sa_h = SignalAlignment(**merge_dicts([args, {'in_fast5': cls.tmp_rna_file1}]))
        sa_h.run()

        sa_h = SignalAlignment(**merge_dicts([args, {'in_fast5': cls.tmp_rna_file2}]))
        sa_h.run()

        args = create_signalAlignment_args(destination=cls.tmp_directory,
                                           in_templateHmm=cls.dna_model_file_94,
                                           alignment_file=cls.dna_sam,
                                           forward_reference=ecoli_dna_reference,
                                           embed=True,
                                           path_to_bin=cls.bin_path,
                                           diagonal_expansion=10,
                                           traceBackDiagonals=100,
                                           constraint_trim=3)
        sa_h = SignalAlignment(**merge_dicts([args, {'in_fast5': cls.tmp_dna_file}]))
        sa_h.run()

        sa_h = SignalAlignment(**merge_dicts([args, {'in_fast5': cls.tmp_dna_file2}]))
        sa_h.run()

        cls.dna_handle = CreateLabels(cls.tmp_dna_file, kmer_index=cls.kmer_index)
        cls.dna_handle2 = CreateLabels(cls.tmp_dna_file2, kmer_index=cls.kmer_index)

        cls.rna1_handle = CreateLabels(cls.tmp_rna_file1, kmer_index=cls.kmer_index)
        cls.rna2_handle = CreateLabels(cls.tmp_rna_file2, kmer_index=cls.kmer_index)
        cls.rev_comp = ReverseComplement()

        cls.tmp_dna_file3 = os.path.join(cls.HOME,
                                         "tests/minion_test_reads/embedded_files/miten_PC_20160820_FNFAD20259_MN17223_sequencing_run_AMS_158_R9_WGA_Ecoli_08_20_16_43623_ch100_read2324_strand.fast5")
        cls.dna3_handle = CreateLabels(cls.tmp_dna_file3, kmer_index=cls.kmer_index)
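
setUpClass above creates a scratch directory with tempfile.mkdtemp and copies test reads into it, but the snippet shows no matching cleanup. A minimal, standalone sketch of the pattern with a tearDownClass that removes the directory; TempDirTest is illustrative and not part of the signalAlign test suite.

import os
import shutil
import tempfile
import unittest

class TempDirTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        # scratch space for copied reads, as in the example above
        cls.tmp_directory = tempfile.mkdtemp()

    @classmethod
    def tearDownClass(cls):
        # remove the scratch directory so test runs don't leak temp files
        shutil.rmtree(cls.tmp_directory, ignore_errors=True)

    def test_tmp_dir_exists(self):
        self.assertTrue(os.path.isdir(self.tmp_directory))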
Example #6
def trainModelTransitions(config):
    def process_sample(sample):
        options = dict(**DEFAULT_TRAINMODELS_OPTIONS)
        options.update(sample)
        if options["fast5_dir"] is None and options["fofn"] is None:
            raise RuntimeError(
                "Need to provide path to .fast5 files or file with filenames (fofn)"
            )
        reference_map = processReferenceFasta(
            fasta=config["reference"],
            work_folder=working_folder,
            motif_key=options["motif"],
            sub_char=options["label"],
            positions_file=options["positions_file"])
        if options["fast5_dir"] is not None:
            if options["fofn"] is not None:
                print(
                    "WARNING Only using files is directory %s ignoring fofn %s"
                    % (options["files_dir"], options["fofn"]))
            sample = Fast5Directory(options["fast5_dir"], reference_map)
        else:
            sample = FileOfFilenames(options["fofn"], reference_map)
        return sample

    # make directory to put the files we're using
    working_folder = FolderHandler()
    working_folder_path = working_folder.open_folder(config["output_dir"] +
                                                     "/temp_trainModels")
    samples = [process_sample(s) for s in config["samples"]]

    if config["bwt"] is not None:
        print("[trainModels]Using provided BWT")
        bwa_ref_index = config["bwt"]
    else:
        print("signalAlign - indexing reference", file=sys.stderr)
        bwa_ref_index = getBwaIndex(config["reference"], working_folder_path)
        print("signalAlign - indexing reference, done", file=sys.stderr)

    template_model_path = config["in_T_Hmm"]
    complement_model_path = config["in_C_Hmm"]
    assert os.path.exists(template_model_path) and os.path.exists(complement_model_path), \
        "Missing input models %s and %s" % (template_model_path, complement_model_path)
    template_model = get_model(config["stateMachineType"], template_model_path)
    complement_model = get_model(
        config["stateMachineType"],
        complement_model_path) if config["twoD"] else None

    # get the input HDP, if we're using it
    if config["stateMachineType"] == "threeStateHdp":
        template_hdp = working_folder.add_file_path(
            "%s" % config["templateHdp"].split("/")[-1])
        copyfile(config["templateHdp"], template_hdp)
        if config["twoD"]:
            complement_hdp = working_folder.add_file_path(
                "%s" % config["complementHdp"].split("/")[-1])
            copyfile(config["complementHdp"], complement_hdp)
        else:
            complement_hdp = None
    else:
        template_hdp = None
        complement_hdp = None

    # make some paths to files to hold the HMMs
    template_hmm = working_folder.add_file_path("template_trained.hmm")
    complement_hmm = working_folder.add_file_path("complement_trained.hmm")
    trained_models = [template_hmm, complement_hmm]
    untrained_models = [template_model_path, complement_model_path]

    for default_model, trained_model in zip(untrained_models, trained_models):
        assert os.path.exists(
            default_model), "Didn't find default model {}".format(
                default_model)
        copyfile(default_model, trained_model)
        assert os.path.exists(
            trained_model), "Problem copying default model to {}".format(
                trained_model)

    # start iterating
    i = 0
    while i < config["iterations"]:
        # first cull a set of files to get expectations on
        training_files = cull_training_files(
            samples=samples,
            training_amount=config["training_bases"],
            twoD=config["twoD"])
        # setup
        workers = config["job_count"]
        work_queue = Manager().Queue()
        done_queue = Manager().Queue()
        jobs = []

        # get expectations for all the files in the queue
        # file_ref_tuple should be (fast5, (plus_ref_seq, minus_ref_seq))
        for fast5, ref_map in training_files:
            alignment_args = {
                "reference_map": ref_map,
                "destination": working_folder_path,
                "stateMachineType": config["stateMachineType"],
                "bwa_index": bwa_ref_index,
                "in_templateHmm": template_hmm,
                "in_complementHmm": complement_hmm,
                "in_templateHdp": template_hdp,
                "in_complementHdp": complement_hdp,
                "in_fast5": fast5,
                "threshold": 0.01,
                "diagonal_expansion": config["diagonal_expansion"],
                "constraint_trim": config["constraint_trim"],
                "target_regions": None,
                "degenerate": None,
                "twoD_chemistry": config["twoD"],
            }
            if config["DEBUG"]:
                alignment = SignalAlignment(**alignment_args)
                alignment.run(get_expectations=True)
            else:
                work_queue.put(alignment_args)

        for w in range(workers):
            p = Process(target=get_expectations, args=(work_queue, done_queue))
            p.start()
            jobs.append(p)
            work_queue.put('STOP')

        for p in jobs:
            p.join()

        done_queue.put('STOP')

        # load then normalize the expectations
        template_expectations_files = [
            x for x in os.listdir(working_folder_path)
            if x.endswith(".template.expectations")
        ]

        complement_expectations_files = [
            x for x in os.listdir(working_folder_path)
            if x.endswith(".complement.expectations")
        ]

        if len(template_expectations_files) > 0:
            add_and_norm_expectations(path=working_folder_path,
                                      files=template_expectations_files,
                                      model=template_model,
                                      hmm_file=template_hmm,
                                      update_transitions=True)

        if config["twoD"] and len(complement_expectations_files) > 0:
            add_and_norm_expectations(path=working_folder_path,
                                      files=complement_expectations_files,
                                      model=complement_model,
                                      hmm_file=complement_hmm,
                                      update_transitions=True)

        # log the running likelihood
        if len(template_model.running_likelihoods) > 0 and \
                (config["twoD"] and len(complement_model.running_likelihoods) > 0):
            print("{i}| {t_likelihood}\t{c_likelihood}".format(
                t_likelihood=template_model.running_likelihoods[-1],
                c_likelihood=complement_model.running_likelihoods[-1],
                i=i))
            if config["TEST"] and (len(template_model.running_likelihoods) >= 2) and \
                    (config["twoD"] and len(complement_model.running_likelihoods) >= 2):
                print("TESTING")
                assert (template_model.running_likelihoods[-2] < template_model.running_likelihoods[-1]) and \
                       (complement_model.running_likelihoods[-2] < complement_model.running_likelihoods[-1]), \
                    "Testing: Likelihood error, went up"
        i += 1

    # if we're using HDP, trim the final Hmm (remove assignments)

    print("trainModels - finished training routine", file=sys.stdout)
    print("trainModels - finished training routine", file=sys.stderr)