def aligner(work_queue, done_queue):
    """Worker: run SignalAlignment on argument dicts pulled from work_queue until 'STOP'."""
    try:
        for f in iter(work_queue.get, 'STOP'):
            alignment = SignalAlignment(**f)
            alignment.run()
    except Exception as e:
        done_queue.put("%s failed with %s" % (current_process().name, e))
def get_expectations(work_queue, done_queue):
    """Worker: run SignalAlignment in expectation (training) mode on dicts from work_queue until 'STOP'."""
    try:
        for f in iter(work_queue.get, 'STOP'):
            alignment = SignalAlignment(**f)
            alignment.run(get_expectations=True)
    except Exception as e:
        done_queue.put("%s failed with %s" % (current_process().name, e))
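# Hedged usage sketch (not part of the original module): the drivers below enqueue one
# 'STOP' sentinel per worker on work_queue and a final 'STOP' on done_queue after joining,
# so failure messages reported by aligner/get_expectations can be drained as shown here.
# The helper name is illustrative only.
def _report_worker_failures(done_queue):
    """Print every '<process> failed with <error>' message left on done_queue."""
    for message in iter(done_queue.get, 'STOP'):
        print(message, file=sys.stderr)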
def _aggregate_all_variantcalls(self):
    """Aggregate all the variant calls"""
    for v_tsv in self.variant_tsvs:
        if os.stat(v_tsv).st_size == 0:
            continue
        read_name = os.path.basename(v_tsv)
        variant_data = SignalAlignment.read_in_signal_align_tsv(v_tsv, "variantCaller")
        mv_h = MarginalizeVariants(variant_data, variants=self.variants, read_name=read_name)
        mv_h.get_data()
        if self.verbose:
            print(v_tsv)
        self.per_position_data = self.per_position_data.append(mv_h.position_probs,
                                                               ignore_index=True)
        self.per_read_data = self.per_read_data.append(mv_h.per_read_calls,
                                                       ignore_index=True)
    return True
def main(args):
    # parse args
    args = parse_args()

    command_line = " ".join(sys.argv[:])
    print("Command Line: {cmdLine}\n".format(cmdLine=command_line), file=sys.stderr)

    # get absolute paths to inputs
    args.files_dir = resolvePath(args.files_dir)
    args.ref = resolvePath(args.ref)
    args.out = resolvePath(args.out)
    args.bwt = resolvePath(args.bwt)
    args.in_T_Hmm = resolvePath(args.in_T_Hmm)
    args.in_C_Hmm = resolvePath(args.in_C_Hmm)
    args.templateHDP = resolvePath(args.templateHDP)
    args.complementHDP = resolvePath(args.complementHDP)
    args.fofn = resolvePath(args.fofn)
    args.target_regions = resolvePath(args.target_regions)
    args.ambiguity_positions = resolvePath(args.ambiguity_positions)

    start_message = """
#   Starting Signal Align
#   Aligning files from: {fileDir}
#   Aligning to reference: {reference}
#   Aligning maximum of {nbFiles} files
#   Using model: {model}
#   Using banding: True
#   Aligning to regions in: {regions}
#   Non-default template HMM: {inThmm}
#   Non-default complement HMM: {inChmm}
#   Template HDP: {tHdp}
#   Complement HDP: {cHdp}
    """.format(fileDir=args.files_dir, reference=args.ref, nbFiles=args.nb_files,
               inThmm=args.in_T_Hmm, inChmm=args.in_C_Hmm, model=args.stateMachineType,
               regions=args.target_regions, tHdp=args.templateHDP, cHdp=args.complementHDP)

    print(start_message, file=sys.stdout)

    if args.files_dir is None and args.fofn is None:
        print("Need to provide a directory with .fast5 files or a fofn", file=sys.stderr)
        sys.exit(1)

    if not os.path.isfile(args.ref):
        print("Did not find valid reference file, looked for it {here}".format(here=args.ref),
              file=sys.stderr)
        sys.exit(1)

    # make directory to put temporary files
    temp_folder = FolderHandler()
    temp_dir_path = temp_folder.open_folder(args.out + "/tempFiles_alignment")

    # process the reference; the forward and backward strand files are needed for alignment_args
    forward_reference, backward_reference = processReferenceFasta(fasta=args.ref,
                                                                  motif_key=args.motif_key,
                                                                  work_folder=temp_folder,
                                                                  sub_char=args.ambig_char,
                                                                  positions_file=args.ambiguity_positions)

    # index the reference for bwa
    if args.bwt is not None:
        print("[RunSignalAlign]NOTICE - using provided BWT %s" % args.bwt)
        bwa_ref_index = args.bwt
    else:
        print("signalAlign - indexing reference", file=sys.stderr)
        bwa_ref_index = getBwaIndex(args.ref, temp_dir_path)
        print("signalAlign - indexing reference, done", file=sys.stderr)

    # setup workers for multiprocessing
    workers = args.nb_jobs
    work_queue = Manager().Queue()
    done_queue = Manager().Queue()
    jobs = []

    # list of read files
    if args.fofn is not None:
        fast5s = [x for x in parseFofn(args.fofn) if x.endswith(".fast5")]
    else:
        fast5s = ["/".join([args.files_dir, x]) for x in os.listdir(args.files_dir)
                  if x.endswith(".fast5")]

    nb_files = args.nb_files
    if nb_files < len(fast5s):
        shuffle(fast5s)
        fast5s = fast5s[:nb_files]

    # change paths to the source directory
    os.chdir(signalAlignSourceDir())

    print("[runSignalAlign]:NOTICE: Got {} files to align".format(len(fast5s)), file=sys.stdout)
    for fast5 in fast5s:
        print(fast5)
        alignment_args = {
            "destination": temp_dir_path,
            "stateMachineType": args.stateMachineType,
            "bwa_index": bwa_ref_index,
            "in_templateHmm": args.in_T_Hmm,
            "in_complementHmm": args.in_C_Hmm,
            "in_templateHdp": args.templateHDP,
            "in_complementHdp": args.complementHDP,
            "output_format": args.outFmt,
            "in_fast5": fast5,
            "threshold": args.threshold,
            "diagonal_expansion": args.diag_expansion,
            "constraint_trim": args.constraint_trim,
            "degenerate": getDegenerateEnum(args.degenerate),
            "twoD_chemistry": args.twoD,
            "target_regions": args.target_regions,
            "embed": args.embed,
            "event_table": args.event_table,
            "backward_reference": backward_reference,
            "forward_reference": forward_reference,
        }
        if args.DEBUG:
            alignment = SignalAlignment(**alignment_args)
            alignment.run()
        else:
            work_queue.put(alignment_args)

    for w in range(workers):
        p = Process(target=aligner, args=(work_queue, done_queue))
        p.start()
        jobs.append(p)
        work_queue.put('STOP')  # one sentinel per worker so each loop terminates

    for p in jobs:
        p.join()

    done_queue.put('STOP')

    print("\n#  signalAlign - finished alignments\n", file=sys.stderr)
    print("\n#  signalAlign - finished alignments\n", file=sys.stdout)
def setUpClass(cls):
    super(CreateLabelsTest, cls).setUpClass()
    cls.HOME = '/'.join(os.path.abspath(__file__).split("/")[:-4])
    cls.fasta = os.path.join(cls.HOME, "tests/test_sequences/E.coli_K12.fasta")
    dna_file = os.path.join(cls.HOME,
                            "tests/minion_test_reads/1D/LomanLabz_PC_20161025_FNFAB42699_MN17633_sequencing_run_20161025_E_coli_native_450bps_82361_ch112_read108_strand.fast5")
    rev_dna_file = os.path.join(cls.HOME,
                                "tests/minion_test_reads/1D/LomanLabz_PC_20161025_FNFAB42699_MN17633_sequencing_run_20161025_E_coli_native_450bps_82361_ch6_read347_strand.fast5")
    rev_rna_file = os.path.join(cls.HOME,
                                "tests/minion_test_reads/RNA_no_events/DEAMERNANOPORE_20170922_FAH26525_MN16450_sequencing_run_MA_821_R94_NA12878_mRNA_09_22_17_67136_read_61_ch_151_strand.fast5")
    forward_rna_file = os.path.join(cls.HOME,
                                    "tests/minion_test_reads/RNA_no_events/DEAMERNANOPORE_20170922_FAH26525_MN16450_sequencing_run_MA_821_R94_NA12878_mRNA_09_22_17_67136_read_36_ch_218_strand.fast5")
    rna_reference = os.path.join(cls.HOME, "tests/test_sequences/fake_rna_ref.fa")
    ecoli_dna_reference = os.path.join(cls.HOME, "tests/test_sequences/E.coli_K12.fasta")
    cls.dna_reference_handle = pysam.FastaFile(ecoli_dna_reference)
    cls.rna_reference_handle = pysam.FastaFile(rna_reference)
    cls.tmp_directory = tempfile.mkdtemp()

    # get file locations
    cls.tmp_dna_file = os.path.join(str(cls.tmp_directory), 'test_dna.fast5')
    cls.tmp_dna_file2 = os.path.join(str(cls.tmp_directory), 'test_dna2.fast5')
    cls.tmp_rna_file1 = os.path.join(str(cls.tmp_directory), 'test_rna.fast5')
    cls.tmp_rna_file2 = os.path.join(str(cls.tmp_directory), 'test_rna2.fast5')

    # run signalAlign on one file
    cls.rna_model_file = os.path.join(cls.HOME, "models/testModelR9p4_5mer_acgt_RNA.model")
    cls.dna_model_file_94 = os.path.join(cls.HOME, "models/testModelR9p4_5mer_acegt_template.model")
    cls.rna_sam = os.path.join(cls.HOME, "tests/minion_test_reads/RNA_edge_cases/rna_reads.bam")
    cls.dna_sam = os.path.join(cls.HOME, "tests/minion_test_reads/oneD.bam")
    cls.bin_path = os.path.join(cls.HOME, "bin")

    # kmer index
    cls.kmer_index = 2

    # copy file to tmp directory
    shutil.copy(dna_file, cls.tmp_dna_file)
    shutil.copy(rev_dna_file, cls.tmp_dna_file2)
    shutil.copy(forward_rna_file, cls.tmp_rna_file1)
    shutil.copy(rev_rna_file, cls.tmp_rna_file2)

    args = create_signalAlignment_args(destination=cls.tmp_directory,
                                       in_templateHmm=cls.rna_model_file,
                                       alignment_file=cls.rna_sam,
                                       forward_reference=rna_reference,
                                       embed=True,
                                       path_to_bin=cls.bin_path,
                                       diagonal_expansion=5,
                                       delete_tmp=False)
    sa_h = SignalAlignment(**merge_dicts([args, {'in_fast5': cls.tmp_rna_file1}]))
    sa_h.run()
    sa_h = SignalAlignment(**merge_dicts([args, {'in_fast5': cls.tmp_rna_file2}]))
    sa_h.run()

    args = create_signalAlignment_args(destination=cls.tmp_directory,
                                       in_templateHmm=cls.dna_model_file_94,
                                       alignment_file=cls.dna_sam,
                                       forward_reference=ecoli_dna_reference,
                                       embed=True,
                                       path_to_bin=cls.bin_path,
                                       diagonal_expansion=10,
                                       traceBackDiagonals=100,
                                       constraint_trim=3)
    sa_h = SignalAlignment(**merge_dicts([args, {'in_fast5': cls.tmp_dna_file}]))
    sa_h.run()
    sa_h = SignalAlignment(**merge_dicts([args, {'in_fast5': cls.tmp_dna_file2}]))
    sa_h.run()

    cls.dna_handle = CreateLabels(cls.tmp_dna_file, kmer_index=cls.kmer_index)
    cls.dna_handle2 = CreateLabels(cls.tmp_dna_file2, kmer_index=cls.kmer_index)
    cls.rna1_handle = CreateLabels(cls.tmp_rna_file1, kmer_index=cls.kmer_index)
    cls.rna2_handle = CreateLabels(cls.tmp_rna_file2, kmer_index=cls.kmer_index)
    cls.rev_comp = ReverseComplement()

    cls.tmp_dna_file3 = os.path.join(cls.HOME,
                                     "tests/minion_test_reads/embedded_files/miten_PC_20160820_FNFAD20259_MN17223_sequencing_run_AMS_158_R9_WGA_Ecoli_08_20_16_43623_ch100_read2324_strand.fast5")
    cls.dna3_handle = CreateLabels(cls.tmp_dna_file3, kmer_index=cls.kmer_index)
def trainModelTransitions(config):
    def process_sample(sample):
        options = dict(**DEFAULT_TRAINMODELS_OPTIONS)
        options.update(sample)
        if options["fast5_dir"] is None and options["fofn"] is None:
            raise RuntimeError("Need to provide path to .fast5 files or file with filenames (fofn)")
        reference_map = processReferenceFasta(fasta=config["reference"],
                                              work_folder=working_folder,
                                              motif_key=options["motif"],
                                              sub_char=options["label"],
                                              positions_file=options["positions_file"])
        if options["fast5_dir"] is not None:
            if options["fofn"] is not None:
                print("WARNING Only using files in directory %s, ignoring fofn %s"
                      % (options["fast5_dir"], options["fofn"]))
            sample = Fast5Directory(options["fast5_dir"], reference_map)
        else:
            sample = FileOfFilenames(options["fofn"], reference_map)
        return sample

    # make directory to put the files we're using
    working_folder = FolderHandler()
    working_folder_path = working_folder.open_folder(config["output_dir"] + "temp_trainModels")
    samples = [process_sample(s) for s in config["samples"]]

    if config["bwt"] is not None:
        print("[trainModels]Using provided BWT")
        bwa_ref_index = config["bwt"]
    else:
        print("signalAlign - indexing reference", file=sys.stderr)
        bwa_ref_index = getBwaIndex(config["reference"], working_folder_path)
        print("signalAlign - indexing reference, done", file=sys.stderr)

    template_model_path = config["in_T_Hmm"]
    complement_model_path = config["in_C_Hmm"]
    assert os.path.exists(template_model_path) and os.path.exists(complement_model_path), \
        "Missing input models %s and %s" % (template_model_path, complement_model_path)
    template_model = get_model(config["stateMachineType"], template_model_path)
    complement_model = get_model(config["stateMachineType"], complement_model_path) if config["twoD"] else None

    # get the input HDP, if we're using it
    if config["stateMachineType"] == "threeStateHdp":
        template_hdp = working_folder.add_file_path("%s" % config["templateHdp"].split("/")[-1])
        copyfile(config["templateHdp"], template_hdp)
        if config["twoD"]:
            complement_hdp = working_folder.add_file_path("%s" % config["complementHdp"].split("/")[-1])
            copyfile(config["complementHdp"], complement_hdp)
        else:
            complement_hdp = None
    else:
        template_hdp = None
        complement_hdp = None

    # make some paths to files to hold the HMMs
    template_hmm = working_folder.add_file_path("template_trained.hmm")
    complement_hmm = working_folder.add_file_path("complement_trained.hmm")
    trained_models = [template_hmm, complement_hmm]
    untrained_models = [template_model_path, complement_model_path]

    for default_model, trained_model in zip(untrained_models, trained_models):
        assert os.path.exists(default_model), "Didn't find default model {}".format(default_model)
        copyfile(default_model, trained_model)
        assert os.path.exists(trained_model), "Problem copying default model to {}".format(trained_model)

    # start iterating
    i = 0
    while i < config["iterations"]:
        # first cull a set of files to get expectations on
        training_files = cull_training_files(samples=samples,
                                             training_amount=config["training_bases"],
                                             twoD=config["twoD"])
        # setup
        workers = config["job_count"]
        work_queue = Manager().Queue()
        done_queue = Manager().Queue()
        jobs = []

        # get expectations for all the files in the queue
        # file_ref_tuple should be (fast5, (plus_ref_seq, minus_ref_seq))
        for fast5, ref_map in training_files:
            alignment_args = {
                "reference_map": ref_map,
                "destination": working_folder_path,
                "stateMachineType": config["stateMachineType"],
                "bwa_index": bwa_ref_index,
                "in_templateHmm": template_hmm,
                "in_complementHmm": complement_hmm,
                "in_templateHdp": template_hdp,
                "in_complementHdp": complement_hdp,
                "in_fast5": fast5,
                "threshold": 0.01,
                "diagonal_expansion": config["diagonal_expansion"],
                "constraint_trim": config["constraint_trim"],
                "target_regions": None,
                "degenerate": None,
                "twoD_chemistry": config["twoD"],
            }
            if config["DEBUG"]:
                alignment = SignalAlignment(**alignment_args)
                alignment.run(get_expectations=True)
            else:
                work_queue.put(alignment_args)

        for w in range(workers):
            p = Process(target=get_expectations, args=(work_queue, done_queue))
            p.start()
            jobs.append(p)
            work_queue.put('STOP')

        for p in jobs:
            p.join()

        done_queue.put('STOP')

        # load then normalize the expectations
        template_expectations_files = [x for x in os.listdir(working_folder_path)
                                       if x.endswith(".template.expectations")]
        complement_expectations_files = [x for x in os.listdir(working_folder_path)
                                         if x.endswith(".complement.expectations")]

        if len(template_expectations_files) > 0:
            add_and_norm_expectations(path=working_folder_path,
                                      files=template_expectations_files,
                                      model=template_model,
                                      hmm_file=template_hmm,
                                      update_transitions=True)

        if config["twoD"] and len(complement_expectations_files) > 0:
            add_and_norm_expectations(path=working_folder_path,
                                      files=complement_expectations_files,
                                      model=complement_model,
                                      hmm_file=complement_hmm,
                                      update_transitions=True)

        # log the running likelihood
        if len(template_model.running_likelihoods) > 0 and \
                (config["twoD"] and len(complement_model.running_likelihoods)) > 0:
            print("{i}| {t_likelihood}\t{c_likelihood}".format(
                t_likelihood=template_model.running_likelihoods[-1],
                c_likelihood=complement_model.running_likelihoods[-1],
                i=i))
            if config["TEST"] and (len(template_model.running_likelihoods) >= 2) and \
                    (config["twoD"] and len(complement_model.running_likelihoods) >= 2):
                print("TESTING")
                assert (template_model.running_likelihoods[-2] < template_model.running_likelihoods[-1]) and \
                       (complement_model.running_likelihoods[-2] < complement_model.running_likelihoods[-1]), \
                    "Testing: Likelihood error, went up"

        i += 1

    # if we're using HDP, trim the final Hmm (remove assignments)

    print("trainModels - finished training routine", file=sys.stdout)
    print("trainModels - finished training routine", file=sys.stderr)