def main(args):
    """Run the iterative "Empire" error-correction pipeline.

    For each of ``args.cycles`` cycles this:

    1. writes the degenerate reference set for the current reference and
       indexes the reference for BWA,
    2. aligns up to ``args.nb_files`` ``.fast5`` reads from
       ``args.files_dir`` using ``args.nb_jobs`` worker processes,
    3. for every register in ``range(STEP)`` runs the variant caller over
       that register's alignment files and folds the calls into the working
       sequence via ``update_reference``,
    4. writes the updated sequence to ``iteration.<cycle>.fa`` in the temp
       folder; that file becomes the reference for the next cycle.

    Args:
        args: Unused placeholder; it is immediately overwritten with the
            result of ``parse_args()``.

    Returns:
        None.  Calls ``sys.exit(1)`` when ``args.ref`` is not an existing file.
    """
    # parse args
    args = parse_args()

    command_line = " ".join(sys.argv[:])
    print("Command Line: {cmdLine}\n".format(cmdLine=command_line), file=sys.stderr)

    start_message = """
# Starting Empire Error-Correction
# Aligning files from: {fileDir}
# Aligning to reference: {reference}
# Aligning maximum of {nbFiles} files
# Using model: {model}
# Using banding: {banding}
# Aligning to regions in: {regions}
# Non-default template HMM: {inThmm}
# Non-default complement HMM: {inChmm}
# Template HDP: {tHdp}
# Complement HDP: {cHdp}
""".format(fileDir=args.files_dir, reference=args.ref, nbFiles=args.nb_files,
           banding=args.banded, inThmm=args.in_T_Hmm, inChmm=args.in_C_Hmm,
           model=args.stateMachineType, regions=args.target_regions,
           tHdp=args.templateHDP, cHdp=args.complementHDP)
    print(start_message, file=sys.stdout)

    if not os.path.isfile(args.ref):
        print("Did not find valid reference file", file=sys.stderr)
        sys.exit(1)

    temp_folder = FolderHandler()
    temp_dir_path = temp_folder.open_folder(args.out + "tempFiles_errorCorrection")

    # path of the reference used by the current cycle; starts as the input fasta
    reference_sequence = args.ref

    for cycle in range(0, args.cycles):
        check, reference_sequence_length = write_degenerate_reference_set(
            input_fasta=reference_sequence, out_path=temp_dir_path, step=STEP)
        assert check, "Problem making degenerate reference sequence set"

        # index the reference for bwa
        print("signalAlign - indexing reference", file=sys.stderr)
        bwa_ref_index = get_bwa_index(reference_sequence, temp_dir_path)
        print("signalAlign - indexing reference, done", file=sys.stderr)

        # setup workers for multiprocessing
        workers = args.nb_jobs
        work_queue = Manager().Queue()
        done_queue = Manager().Queue()

        # list of alignment files
        fast5s = [x for x in os.listdir(args.files_dir) if x.endswith(".fast5")]

        # take only some
        if args.nb_files < len(fast5s):
            shuffle(fast5s)
            fast5s = fast5s[:args.nb_files]

        for fast5 in fast5s:
            alignment_args = {
                "forward_reference": None,
                "backward_reference": None,
                "path_to_EC_refs": temp_dir_path,
                "destination": temp_dir_path,
                "stateMachineType": args.stateMachineType,
                "bwa_index": bwa_ref_index,
                "in_templateHmm": args.in_T_Hmm,
                "in_complementHmm": args.in_C_Hmm,
                "in_templateHdp": args.templateHDP,
                "in_complementHdp": args.complementHDP,
                "banded": args.banded,
                "sparse_output": True,
                # os.path.join works whether or not files_dir ends with a slash
                "in_fast5": os.path.join(args.files_dir, fast5),
                "threshold": args.threshold,
                "diagonal_expansion": args.diag_expansion,
                "constraint_trim": args.constraint_trim,
                "target_regions": None,
                "degenerate": degenerate_enum(args.degenerate),
            }
            work_queue.put(alignment_args)

        # one 'STOP' sentinel is queued per worker so that each one terminates
        aligner_jobs = []
        for _ in range(workers):
            p = Process(target=aligner, args=(work_queue, done_queue))
            p.start()
            aligner_jobs.append(p)
            work_queue.put('STOP')

        for p in aligner_jobs:
            p.join()

        done_queue.put('STOP')

        print("\n# signalAlign - finished alignments\n", file=sys.stderr)
        print("\n# signalAlign - finished alignments\n", file=sys.stdout)

        # working sequence is a string, that has the reference we're going to update this cycle
        working_sequence = get_first_sequence(reference_sequence)

        # register is the relative position that is being N-ed:
        # ACGTAGACAATA --> NCGTAGNCAATA = register 0
        # ACGTAGACAATA --> ANGTAGANAATA = register 1 ...
        for register in range(0, STEP):
            print("# Starting Variant Calling, register: {}...".format(register),
                  file=sys.stdout, end='\n')
            print("# Starting Variant Calling, register: {}...".format(register),
                  file=sys.stderr, end='')

            # cull the alignment files for this register
            alns, forward_mask = get_alignments_labels_and_mask(
                path_to_alignments=temp_dir_path + "*.tsv.{}".format(register),
                max=args.nb_files,
                suffix=".{}".format(register)
            )

            # this is the list of positions that we're going to look at, based on this register
            degenerate_positions = {
                'forward': range(register, reference_sequence_length, STEP),
                'backward': range(register, reference_sequence_length, STEP),
            }

            # place to put the marginal probs
            variant_call_file = temp_folder.add_file_path(
                "variants.{cycle}.{reg}.calls".format(cycle=cycle, reg=register))

            # arguments for multiprocessing
            for aln, forward_bool in zip(alns, forward_mask):
                call_methyl_args = {
                    "sequence": None,
                    "alignment_file": aln,
                    "forward": forward_bool,
                    "out_file": variant_call_file,
                    "positions": degenerate_positions,
                    "degenerate_type": degenerate_enum(args.degenerate),
                }
                work_queue.put(call_methyl_args)

            # fresh list per stage so finished aligner processes are not re-joined
            caller_jobs = []
            for _ in range(workers):
                p = Process(target=run_methyl_caller, args=(work_queue, done_queue))
                p.start()
                caller_jobs.append(p)
                work_queue.put('STOP')

            for p in caller_jobs:
                p.join()

            done_queue.put('STOP')

            # this is where the per-register update happens
            working_sequence = update_reference(variant_call_file, working_sequence, register,
                                                min_depth=0, get_sites=False)

            # remove alignments for this register
            for f in glob.glob(temp_dir_path + "*.tsv.{}".format(register)):
                os.remove(f)

            print("done", file=sys.stdout, end="\n")
            print("done", file=sys.stderr, end="\n")

        # add a file for this cycle
        ref_path = temp_folder.add_file_path("iteration.{cycle}.fa".format(cycle=cycle))

        # write it to a file; the handle is closed before the next cycle reads it back
        with open(ref_path, 'w') as fasta_out:
            write_fasta("iteration.{cycle}.fa".format(cycle=cycle), working_sequence, fasta_out)

        # update the path to the reference for the next cycle
        reference_sequence = ref_path

    return
def main(args):
    """Align a set of ``.fast5`` reads to a reference with signalAlign.

    Reads come either from the file-of-filenames ``args.fofn`` or from the
    directory ``args.files_dir``; at most ``args.nb_files`` of them are
    aligned (a random subset is taken when more are available).  Forward and
    backward reference sequence files are prepared in a temp folder, the
    reference is indexed for BWA, and one alignment job per read is pushed to
    ``args.nb_jobs`` worker processes.  When ``args.outFmt`` is
    ``"variantCaller"`` the per-read outputs are concatenated at the end.

    Args:
        args: Unused placeholder; it is immediately overwritten with the
            result of ``parse_args()``.

    Returns:
        None.  Calls ``sys.exit(1)`` when neither a files directory nor a
        fofn is given, when ``args.ref`` is not a file, or when
        ``args.error_correct`` is set (not implemented).
    """
    # parse args
    args = parse_args()

    command_line = " ".join(sys.argv[:])
    print("Command Line: {cmdLine}\n".format(cmdLine=command_line), file=sys.stderr)

    start_message = """
# Starting Signal Align
# Aligning files from: {fileDir}
# Aligning to reference: {reference}
# Aligning maximum of {nbFiles} files
# Using model: {model}
# Using banding: True
# Aligning to regions in: {regions}
# Non-default template HMM: {inThmm}
# Non-default complement HMM: {inChmm}
# Template HDP: {tHdp}
# Complement HDP: {cHdp}
""".format(
        fileDir=args.files_dir,
        reference=args.ref,
        nbFiles=args.nb_files,
        inThmm=args.in_T_Hmm,
        inChmm=args.in_C_Hmm,
        model=args.stateMachineType,
        regions=args.target_regions,
        tHdp=args.templateHDP,
        cHdp=args.complementHDP)
    print(start_message, file=sys.stdout)

    if args.files_dir is None and args.fofn is None:
        print("Need to provide directory with .fast5 files of fofn", file=sys.stderr)
        sys.exit(1)

    if not os.path.isfile(args.ref):
        print("Did not find valid reference file, looked for it {here}".format(
            here=args.ref), file=sys.stderr)
        sys.exit(1)

    # make directory to put temporary files
    temp_folder = FolderHandler()
    temp_dir_path = temp_folder.open_folder(args.out + "tempFiles_alignment")

    # guard clause: error correction is rejected up front, everything below
    # is the plain-alignment path
    if args.error_correct is True:
        print("[runSignalAlign]:ERROR: Error correction not implemented, yet\n",
              file=sys.stderr)
        sys.exit(1)

    # parse the substitution file, if given
    plus_strand_sequence = temp_folder.add_file_path("forward_reference.txt")
    minus_strand_sequence = temp_folder.add_file_path("backward_reference.txt")
    if args.substitution_file is not None:
        add_ambiguity_chars_to_reference(
            input_fasta=args.ref,
            substitution_file=args.substitution_file,
            sequence_outfile=plus_strand_sequence,
            rc_sequence_outfile=minus_strand_sequence,
            degenerate_type=args.degenerate,
            sub_char=args.ambig_char)
    else:
        make_temp_sequence(fasta=args.ref,
                           sequence_outfile=plus_strand_sequence,
                           rc_sequence_outfile=minus_strand_sequence)

    # index the reference for bwa
    print("signalAlign - indexing reference", file=sys.stderr)
    bwa_ref_index = get_bwa_index(args.ref, temp_dir_path)
    print("signalAlign - indexing reference, done", file=sys.stderr)

    # parse the target regions, if provided
    # TODO make this the same as the 'labels' file
    if args.target_regions is not None:
        target_regions = TargetRegions(args.target_regions)
    else:
        target_regions = None

    # setup workers for multiprocessing
    workers = args.nb_jobs
    work_queue = Manager().Queue()
    done_queue = Manager().Queue()
    jobs = []

    # list of read files
    if args.fofn is not None:
        fast5s = [x for x in parse_fofn(args.fofn) if x.endswith(".fast5")]
    else:
        # os.path.join works whether or not files_dir ends with a slash
        fast5s = [
            os.path.join(args.files_dir, x)
            for x in os.listdir(args.files_dir)
            if x.endswith(".fast5")
        ]

    nb_files = args.nb_files
    if nb_files < len(fast5s):
        shuffle(fast5s)
        fast5s = fast5s[:nb_files]

    print("[runSignalAlign]:NOTICE: Got {} files to align".format(len(fast5s)),
          file=sys.stdout)

    for fast5 in fast5s:
        alignment_args = {
            "forward_reference": plus_strand_sequence,
            "backward_reference": minus_strand_sequence,
            "path_to_EC_refs": (temp_dir_path if args.error_correct else None),
            "destination": temp_dir_path,
            "stateMachineType": args.stateMachineType,
            "bwa_index": bwa_ref_index,
            "in_templateHmm": args.in_T_Hmm,
            "in_complementHmm": args.in_C_Hmm,
            "in_templateHdp": args.templateHDP,
            "in_complementHdp": args.complementHDP,
            "banded": True,  # banding is always on here; args.banded is not consulted
            "output_format": args.outFmt,
            "in_fast5": fast5,
            "threshold": args.threshold,
            "diagonal_expansion": args.diag_expansion,
            "constraint_trim": args.constraint_trim,
            "target_regions": target_regions,
            "degenerate": degenerate_enum(args.degenerate),
        }
        work_queue.put(alignment_args)

    # one 'STOP' sentinel is queued per worker so that each one terminates
    for _ in range(workers):
        p = Process(target=aligner, args=(work_queue, done_queue))
        p.start()
        jobs.append(p)
        work_queue.put('STOP')

    for p in jobs:
        p.join()

    done_queue.put('STOP')

    print("\n# signalAlign - finished alignments\n", file=sys.stderr)
    print("\n# signalAlign - finished alignments\n", file=sys.stdout)

    if args.outFmt == "variantCaller":
        concat_variant_call_files(temp_dir_path)
def main(args):
    """Run the iterative "Jamison" error-correction pipeline.

    Starting from ``args.ref``, each of ``args.cycles`` cycles indexes the
    current reference for BWA, scans the reads for proposal sites
    (``scan_for_proposals``), groups them with ``group_sites_in_window``,
    updates the reference string with ``update_reference_with_marginal_probs``
    and writes the result to ``cycle_snapshot.<cycle>.fa`` in the temp
    folder.  That snapshot is the reference for the next cycle.  After the
    last cycle the current reference file is copied to
    ``temp_dir_path + args.corrected``.

    Args:
        args: Unused placeholder; it is immediately overwritten with the
            result of ``parse_args()``.

    Returns:
        None.  Calls ``sys.exit(1)`` when ``args.ref`` is not an existing file.
    """
    # parse args
    args = parse_args()

    command_line = " ".join(sys.argv[:])
    print("Command Line: {cmdLine}\n".format(cmdLine=command_line), file=sys.stderr)

    start_message = """
# Starting Jamison Error-Correction
# Aligning files from: {fileDir}
# Aligning to reference: {reference}
# Aligning maximum of {nbFiles} files
# Using model: {model}
# Using banding: {banding}
# Aligning to regions in: {regions}
# Non-default template HMM: {inThmm}
# Non-default complement HMM: {inChmm}
# Template HDP: {tHdp}
# Complement HDP: {cHdp}
# Performing {cycles} cycles
""".format(fileDir=args.files_dir, reference=args.ref, nbFiles=args.nb_files,
           banding=args.banded, inThmm=args.in_T_Hmm, inChmm=args.in_C_Hmm,
           model=args.stateMachineType, regions=args.target_regions,
           tHdp=args.templateHDP, cHdp=args.complementHDP, cycles=args.cycles)
    print(start_message, file=sys.stdout)

    if not os.path.isfile(args.ref):
        print("Did not find valid reference file", file=sys.stderr)
        sys.exit(1)

    temp_folder = FolderHandler()
    temp_dir_path = temp_folder.open_folder(args.out + "tempFiles_errorCorrection")

    # initialize to input fasta
    reference_sequence_path = args.ref

    # list of alignment files
    fast5s = cull_fast5_files(args.files_dir, args.nb_files)

    for cycle in range(0, args.cycles):
        # index the reference for bwa this is a string with the path to the index
        bwa_ref_index = get_bwa_index(reference_sequence_path, temp_dir_path)

        # unpack the reference sequence
        reference_sequence_string = get_first_sequence(reference_sequence_path)

        alignment_args = {
            "path_to_EC_refs": None,
            "destination": temp_dir_path,
            "stateMachineType": args.stateMachineType,
            "bwa_index": bwa_ref_index,
            "in_templateHmm": args.in_T_Hmm,
            "in_complementHmm": args.in_C_Hmm,
            "in_templateHdp": args.templateHDP,
            "in_complementHdp": args.complementHDP,
            "banded": args.banded,
            "sparse_output": True,
            "threshold": args.threshold,
            "diagonal_expansion": args.diag_expansion,
            "constraint_trim": args.constraint_trim,
            "target_regions": None,
            "degenerate": degenerate_enum(args.degenerate),
        }

        proposals = scan_for_proposals(temp_folder, STEP, reference_sequence_string,
                                       fast5s, alignment_args, args.nb_jobs)
        proposals = group_sites_in_window(proposals, 6)

        print("Cycle {cycle} - Got {nb} sites to check: {sites}".format(
            nb=len(proposals), sites=proposals, cycle=cycle))

        updated_reference_string = update_reference_with_marginal_probs(
            temp_folder, proposals, reference_sequence_string, fast5s,
            alignment_args, args.nb_jobs)

        updated_reference_path = temp_folder.add_file_path(
            "cycle_snapshot.{cycle}.fa".format(cycle=cycle))

        # close the snapshot before it is re-read (next cycle's index / final copy)
        with open(updated_reference_path, 'w') as fasta_out:
            write_fasta("jamison{}".format(cycle), updated_reference_string, fasta_out)

        reference_sequence_path = updated_reference_path

    # copy final file
    copyfile(reference_sequence_path, temp_dir_path + args.corrected)
    return
def main(args):
    """Run the iterative "Empire" error-correction pipeline.

    For each of ``args.cycles`` cycles this:

    1. writes the degenerate reference set for the current reference and
       indexes the reference for BWA,
    2. aligns up to ``args.nb_files`` ``.fast5`` reads from
       ``args.files_dir`` using ``args.nb_jobs`` worker processes,
    3. for every register in ``range(STEP)`` runs the variant caller over
       that register's alignment files and folds the calls into the working
       sequence via ``update_reference``,
    4. writes the updated sequence to ``iteration.<cycle>.fa`` in the temp
       folder; that file becomes the reference for the next cycle.

    Args:
        args: Unused placeholder; it is immediately overwritten with the
            result of ``parse_args()``.

    Returns:
        None.  Calls ``sys.exit(1)`` when ``args.ref`` is not an existing file.
    """
    # parse args
    args = parse_args()

    command_line = " ".join(sys.argv[:])
    print("Command Line: {cmdLine}\n".format(cmdLine=command_line), file=sys.stderr)

    start_message = """
# Starting Empire Error-Correction
# Aligning files from: {fileDir}
# Aligning to reference: {reference}
# Aligning maximum of {nbFiles} files
# Using model: {model}
# Using banding: {banding}
# Aligning to regions in: {regions}
# Non-default template HMM: {inThmm}
# Non-default complement HMM: {inChmm}
# Template HDP: {tHdp}
# Complement HDP: {cHdp}
""".format(fileDir=args.files_dir, reference=args.ref, nbFiles=args.nb_files,
           banding=args.banded, inThmm=args.in_T_Hmm, inChmm=args.in_C_Hmm,
           model=args.stateMachineType, regions=args.target_regions,
           tHdp=args.templateHDP, cHdp=args.complementHDP)
    print(start_message, file=sys.stdout)

    if not os.path.isfile(args.ref):
        print("Did not find valid reference file", file=sys.stderr)
        sys.exit(1)

    temp_folder = FolderHandler()
    temp_dir_path = temp_folder.open_folder(args.out + "tempFiles_errorCorrection")

    # path of the reference used by the current cycle; starts as the input fasta
    reference_sequence = args.ref

    for cycle in range(0, args.cycles):
        check, reference_sequence_length = write_degenerate_reference_set(
            input_fasta=reference_sequence, out_path=temp_dir_path, step=STEP)
        assert check, "Problem making degenerate reference sequence set"

        # index the reference for bwa
        print("signalAlign - indexing reference", file=sys.stderr)
        bwa_ref_index = get_bwa_index(reference_sequence, temp_dir_path)
        print("signalAlign - indexing reference, done", file=sys.stderr)

        # setup workers for multiprocessing
        workers = args.nb_jobs
        work_queue = Manager().Queue()
        done_queue = Manager().Queue()

        # list of alignment files
        fast5s = [x for x in os.listdir(args.files_dir) if x.endswith(".fast5")]

        # take only some
        if args.nb_files < len(fast5s):
            shuffle(fast5s)
            fast5s = fast5s[:args.nb_files]

        for fast5 in fast5s:
            alignment_args = {
                "forward_reference": None,
                "backward_reference": None,
                "path_to_EC_refs": temp_dir_path,
                "destination": temp_dir_path,
                "stateMachineType": args.stateMachineType,
                "bwa_index": bwa_ref_index,
                "in_templateHmm": args.in_T_Hmm,
                "in_complementHmm": args.in_C_Hmm,
                "in_templateHdp": args.templateHDP,
                "in_complementHdp": args.complementHDP,
                "banded": args.banded,
                "sparse_output": True,
                # os.path.join works whether or not files_dir ends with a slash
                "in_fast5": os.path.join(args.files_dir, fast5),
                "threshold": args.threshold,
                "diagonal_expansion": args.diag_expansion,
                "constraint_trim": args.constraint_trim,
                "target_regions": None,
                "degenerate": degenerate_enum(args.degenerate),
            }
            work_queue.put(alignment_args)

        # one 'STOP' sentinel is queued per worker so that each one terminates
        aligner_jobs = []
        for _ in range(workers):
            p = Process(target=aligner, args=(work_queue, done_queue))
            p.start()
            aligner_jobs.append(p)
            work_queue.put('STOP')

        for p in aligner_jobs:
            p.join()

        done_queue.put('STOP')

        print("\n# signalAlign - finished alignments\n", file=sys.stderr)
        print("\n# signalAlign - finished alignments\n", file=sys.stdout)

        # working sequence is a string, that has the reference we're going to update this cycle
        working_sequence = get_first_sequence(reference_sequence)

        # register is the relative position that is being N-ed:
        # ACGTAGACAATA --> NCGTAGNCAATA = register 0
        # ACGTAGACAATA --> ANGTAGANAATA = register 1 ...
        for register in range(0, STEP):
            print("# Starting Variant Calling, register: {}...".format(register),
                  file=sys.stdout, end='\n')
            print("# Starting Variant Calling, register: {}...".format(register),
                  file=sys.stderr, end='')

            # cull the alignment files for this register
            alns, forward_mask = get_alignments_labels_and_mask(
                path_to_alignments=temp_dir_path + "*.tsv.{}".format(register),
                max=args.nb_files,
                suffix=".{}".format(register)
            )

            # this is the list of positions that we're going to look at, based on this register
            degenerate_positions = {
                'forward': range(register, reference_sequence_length, STEP),
                'backward': range(register, reference_sequence_length, STEP),
            }

            # place to put the marginal probs
            variant_call_file = temp_folder.add_file_path(
                "variants.{cycle}.{reg}.calls".format(cycle=cycle, reg=register))

            # arguments for multiprocessing
            for aln, forward_bool in zip(alns, forward_mask):
                call_methyl_args = {
                    "sequence": None,
                    "alignment_file": aln,
                    "forward": forward_bool,
                    "out_file": variant_call_file,
                    "positions": degenerate_positions,
                    "degenerate_type": degenerate_enum(args.degenerate),
                }
                work_queue.put(call_methyl_args)

            # fresh list per stage so finished aligner processes are not re-joined
            caller_jobs = []
            for _ in range(workers):
                p = Process(target=run_methyl_caller, args=(work_queue, done_queue))
                p.start()
                caller_jobs.append(p)
                work_queue.put('STOP')

            for p in caller_jobs:
                p.join()

            done_queue.put('STOP')

            # this is where the per-register update happens
            working_sequence = update_reference(variant_call_file, working_sequence, register,
                                                min_depth=0, get_sites=False)

            # remove alignments for this register
            for f in glob.glob(temp_dir_path + "*.tsv.{}".format(register)):
                os.remove(f)

            print("done", file=sys.stdout, end="\n")
            print("done", file=sys.stderr, end="\n")

        # add a file for this cycle
        ref_path = temp_folder.add_file_path("iteration.{cycle}.fa".format(cycle=cycle))

        # write it to a file; the handle is closed before the next cycle reads it back
        with open(ref_path, 'w') as fasta_out:
            write_fasta("iteration.{cycle}.fa".format(cycle=cycle), working_sequence, fasta_out)

        # update the path to the reference for the next cycle
        reference_sequence = ref_path

    return
def main(args):
    """Align ``.fast5`` reads from a directory to a reference with signalAlign.

    Prepares a temp reference sequence file, indexes the reference for BWA,
    optionally loads target regions, then queues one alignment job per read
    (at most ``args.nb_files``, chosen at random when more are present) for
    ``args.nb_jobs`` worker processes and waits for them to finish.

    Args:
        args: Unused placeholder; it is immediately overwritten with the
            result of ``parse_args()``.

    Returns:
        None.  Calls ``sys.exit(1)`` when ``args.ref`` is not an existing file.
    """
    # parse args
    args = parse_args()

    start_message = """
# Starting Signal Align
# Aligning files from: {fileDir}
# Aligning to reference: {reference}
# Aligning {nbFiles}
# Using model: {model}
# Using banding: {banding} DEPRECIATE this ASAP
# Aligning to regions in: {regions}
# Input template HMM: {inThmm}
# Input complement HMM: {inChmm}
""".format(fileDir=args.files_dir, reference=args.ref, nbFiles=args.nb_files,
           banding=args.banded, inThmm=args.in_T_Hmm, inChmm=args.in_C_Hmm,
           model=args.stateMachineType, regions=args.target_regions)
    print(start_message, file=sys.stdout)

    if not os.path.isfile(args.ref):
        print("Did not find valid reference file", file=sys.stderr)
        sys.exit(1)

    # make directory to put temporary files
    temp_folder = FolderHandler()
    temp_dir_path = temp_folder.open_folder(args.out + "tempFiles_alignment")
    reference_seq = temp_folder.add_file_path("reference_seq.txt")
    make_temp_sequence(args.ref, True, reference_seq)

    # index the reference for bwa
    print("signalAlign - indexing reference", file=sys.stderr)
    bwa_ref_index = get_bwa_index(args.ref, temp_dir_path)
    print("signalAlign - indexing reference, done", file=sys.stderr)

    # parse the target regions, if provided
    if args.target_regions is not None:
        target_regions = TargetRegions(args.target_regions)
    else:
        target_regions = None

    workers = args.nb_jobs
    work_queue = Manager().Queue()
    done_queue = Manager().Queue()
    jobs = []

    fast5s = [x for x in os.listdir(args.files_dir) if x.endswith(".fast5")]

    nb_files = args.nb_files
    if nb_files < len(fast5s):
        shuffle(fast5s)
        fast5s = fast5s[:nb_files]

    for fast5 in fast5s:
        alignment_args = {
            "reference": reference_seq,
            "destination": temp_dir_path,
            "stateMachineType": args.stateMachineType,
            "bwa_index": bwa_ref_index,
            "in_templateHmm": args.in_T_Hmm,
            "in_complementHmm": args.in_C_Hmm,
            "banded": args.banded,
            # os.path.join works whether or not files_dir ends with a slash
            "in_fast5": os.path.join(args.files_dir, fast5),
            "threshold": args.threshold,
            "diagonal_expansion": args.diag_expansion,
            "constraint_trim": args.constraint_trim,
            "target_regions": target_regions,
        }
        work_queue.put(alignment_args)

    # one 'STOP' sentinel is queued per worker so that each one terminates
    for _ in range(workers):
        p = Process(target=aligner, args=(work_queue, done_queue))
        p.start()
        jobs.append(p)
        work_queue.put('STOP')

    for p in jobs:
        p.join()

    done_queue.put('STOP')

    print("\n# signalAlign - finished alignments\n", file=sys.stderr)
    print("\n# signalAlign - finished alignments\n", file=sys.stdout)
def main(args):
    """Run the iterative "Zayante" error-correction pipeline.

    Runs 8 cycles.  Within a cycle, for every offset ``it`` in
    ``range(STEP)`` it builds forward/backward degenerate references for
    that offset, aligns up to ``args.nb_files`` reads with ``args.nb_jobs``
    workers, calls variants at positions ``it, it + STEP, ...``, writes the
    updated reference to ``iteration.<cycle>.<iter>.fa`` and uses that file
    as the reference for the next iteration.  ``STEP`` starts at 10 and is
    decremented after each cycle.

    Args:
        args: Unused placeholder; it is immediately overwritten with the
            result of ``parse_args()``.

    Returns:
        None.  Calls ``sys.exit(1)`` when ``args.ref`` is not an existing file.
    """
    # parse args
    args = parse_args()

    command_line = " ".join(sys.argv[:])
    print("Command Line: {cmdLine}\n".format(cmdLine=command_line), file=sys.stderr)

    start_message = """
# Starting Zayante Error-Correction
# Aligning files from: {fileDir}
# Aligning to reference: {reference}
# Aligning maximum of {nbFiles} files
# Using model: {model}
# Using banding: {banding}
# Aligning to regions in: {regions}
# Non-default template HMM: {inThmm}
# Non-default complement HMM: {inChmm}
# Template HDP: {tHdp}
# Complement HDP: {cHdp}
""".format(fileDir=args.files_dir, reference=args.ref, nbFiles=args.nb_files,
           banding=args.banded, inThmm=args.in_T_Hmm, inChmm=args.in_C_Hmm,
           model=args.stateMachineType, regions=args.target_regions,
           tHdp=args.templateHDP, cHdp=args.complementHDP)
    print(start_message, file=sys.stdout)

    if not os.path.isfile(args.ref):
        print("Did not find valid reference file", file=sys.stderr)
        sys.exit(1)

    temp_folder = FolderHandler()
    temp_dir_path = temp_folder.open_folder(args.out + "tempFiles_errorCorrection")

    # path of the reference being corrected; replaced after every iteration
    reference_sequence = args.ref

    STEP = 10
    for cycle in range(0, 8):
        for it in range(0, STEP):
            # make paths for reference files
            forward_reference = temp_folder.add_file_path(
                "forward_reference.{cycle}.{iter}.txt".format(cycle=cycle, iter=it))
            backward_reference = temp_folder.add_file_path(
                "backward_reference.{cycle}.{iter}.txt".format(cycle=cycle, iter=it))

            # make N-ed reference sequence for this iteration
            deg, reference_sequence_length = make_degenerate_reference(
                reference_sequence, it, forward_reference, backward_reference, step=STEP)
            assert deg, "Problem making degenerate reference for cycle {cycle} iteration {iter}" \
                        "".format(cycle=cycle, iter=it)

            # index the reference for bwa
            # NOTE(review): this indexes args.ref, not the updated
            # reference_sequence -- confirm that is intended.
            print("signalAlign - indexing reference", file=sys.stderr)
            bwa_ref_index = get_bwa_index(args.ref, temp_dir_path)
            print("signalAlign - indexing reference, done", file=sys.stderr)

            # setup workers for multiprocessing
            workers = args.nb_jobs
            work_queue = Manager().Queue()
            done_queue = Manager().Queue()

            # list of alignment files
            fast5s = [x for x in os.listdir(args.files_dir) if x.endswith(".fast5")]

            # take only some
            if args.nb_files < len(fast5s):
                shuffle(fast5s)
                fast5s = fast5s[:args.nb_files]

            for fast5 in fast5s:
                alignment_args = {
                    "forward_reference": forward_reference,
                    "backward_reference": backward_reference,
                    "path_to_EC_refs": None,
                    "destination": temp_dir_path,
                    "stateMachineType": args.stateMachineType,
                    "bwa_index": bwa_ref_index,
                    "in_templateHmm": args.in_T_Hmm,
                    "in_complementHmm": args.in_C_Hmm,
                    "in_templateHdp": args.templateHDP,
                    "in_complementHdp": args.complementHDP,
                    "banded": args.banded,
                    "sparse_output": True,
                    # os.path.join works whether or not files_dir ends with a slash
                    "in_fast5": os.path.join(args.files_dir, fast5),
                    "threshold": args.threshold,
                    "diagonal_expansion": args.diag_expansion,
                    "constraint_trim": args.constraint_trim,
                    "target_regions": None,
                    "degenerate": degenerate_enum(args.degenerate),
                }
                work_queue.put(alignment_args)

            # one 'STOP' sentinel is queued per worker so that each one terminates
            aligner_jobs = []
            for _ in range(workers):
                p = Process(target=aligner, args=(work_queue, done_queue))
                p.start()
                aligner_jobs.append(p)
                work_queue.put('STOP')

            for p in aligner_jobs:
                p.join()

            done_queue.put('STOP')

            print("\n# signalAlign - finished alignments\n", file=sys.stderr)
            print("\n# signalAlign - finished alignments\n", file=sys.stdout)

            print("\n# Starting Variant Calling\n", file=sys.stdout)
            print("\n# Starting Variant Calling\n", file=sys.stderr)

            # cull the alignment files
            alns, forward_mask = get_alignments_labels_and_mask(temp_dir_path + "*.tsv",
                                                                args.nb_files)

            degenerate_positions = {
                'forward': range(it, reference_sequence_length, STEP),
                'backward': range(it, reference_sequence_length, STEP),
            }

            variant_call_file = temp_folder.add_file_path(
                "variants.{cycle}.{iter}.calls".format(cycle=cycle, iter=it))

            for aln, forward_bool in zip(alns, forward_mask):
                call_methyl_args = {
                    "sequence": None,
                    "alignment_file": aln,
                    "forward": forward_bool,
                    "out_file": variant_call_file,
                    "positions": degenerate_positions,
                    "degenerate_type": degenerate_enum(args.degenerate),
                }
                work_queue.put(call_methyl_args)

            # fresh list per stage so finished aligner processes are not re-joined
            caller_jobs = []
            for _ in range(workers):
                p = Process(target=run_methyl_caller, args=(work_queue, done_queue))
                p.start()
                caller_jobs.append(p)
                work_queue.put('STOP')

            for p in caller_jobs:
                p.join()

            done_queue.put('STOP')

            print("\n# Finished Variant Calling\n", file=sys.stdout)
            print("\n# Finished Variant Calling\n", file=sys.stderr)

            new_ref = update_reference(variant_call_file, reference_sequence, 0)

            ref_path = temp_folder.add_file_path(
                "iteration.{cycle}.{iter}.fa".format(cycle=cycle, iter=it))

            # close the fasta before the next iteration reads it back
            with open(ref_path, 'w') as fasta_out:
                write_fasta("iteration.{cycle}.{iter}.fa".format(cycle=cycle, iter=it),
                            new_ref, fasta_out)

            reference_sequence = ref_path

            # remove old alignments
            for f in glob.glob(temp_dir_path + "*.tsv"):
                os.remove(f)

        # shrink the stride for the next cycle
        STEP -= 1

    return
def main(args):
    """Run the iterative "Jamison" error-correction pipeline.

    Starting from ``args.ref``, each of ``args.cycles`` cycles indexes the
    current reference for BWA, scans the reads for proposal sites
    (``scan_for_proposals``), groups them with ``group_sites_in_window``,
    updates the reference string with ``update_reference_with_marginal_probs``
    and writes the result to ``cycle_snapshot.<cycle>.fa`` in the temp
    folder.  That snapshot is the reference for the next cycle.  After the
    last cycle the current reference file is copied to
    ``temp_dir_path + args.corrected``.

    Args:
        args: Unused placeholder; it is immediately overwritten with the
            result of ``parse_args()``.

    Returns:
        None.  Calls ``sys.exit(1)`` when ``args.ref`` is not an existing file.
    """
    # parse args
    args = parse_args()

    command_line = " ".join(sys.argv[:])
    print("Command Line: {cmdLine}\n".format(cmdLine=command_line), file=sys.stderr)

    start_message = """
# Starting Jamison Error-Correction
# Aligning files from: {fileDir}
# Aligning to reference: {reference}
# Aligning maximum of {nbFiles} files
# Using model: {model}
# Using banding: {banding}
# Aligning to regions in: {regions}
# Non-default template HMM: {inThmm}
# Non-default complement HMM: {inChmm}
# Template HDP: {tHdp}
# Complement HDP: {cHdp}
# Performing {cycles} cycles
""".format(fileDir=args.files_dir, reference=args.ref, nbFiles=args.nb_files,
           banding=args.banded, inThmm=args.in_T_Hmm, inChmm=args.in_C_Hmm,
           model=args.stateMachineType, regions=args.target_regions,
           tHdp=args.templateHDP, cHdp=args.complementHDP, cycles=args.cycles)
    print(start_message, file=sys.stdout)

    if not os.path.isfile(args.ref):
        print("Did not find valid reference file", file=sys.stderr)
        sys.exit(1)

    temp_folder = FolderHandler()
    temp_dir_path = temp_folder.open_folder(args.out + "tempFiles_errorCorrection")

    # initialize to input fasta
    reference_sequence_path = args.ref

    # list of alignment files
    fast5s = cull_fast5_files(args.files_dir, args.nb_files)

    for cycle in range(0, args.cycles):
        # index the reference for bwa this is a string with the path to the index
        bwa_ref_index = get_bwa_index(reference_sequence_path, temp_dir_path)

        # unpack the reference sequence
        reference_sequence_string = get_first_sequence(reference_sequence_path)

        alignment_args = {
            "path_to_EC_refs": None,
            "destination": temp_dir_path,
            "stateMachineType": args.stateMachineType,
            "bwa_index": bwa_ref_index,
            "in_templateHmm": args.in_T_Hmm,
            "in_complementHmm": args.in_C_Hmm,
            "in_templateHdp": args.templateHDP,
            "in_complementHdp": args.complementHDP,
            "banded": args.banded,
            "sparse_output": True,
            "threshold": args.threshold,
            "diagonal_expansion": args.diag_expansion,
            "constraint_trim": args.constraint_trim,
            "target_regions": None,
            "degenerate": degenerate_enum(args.degenerate),
        }

        proposals = scan_for_proposals(temp_folder, STEP, reference_sequence_string,
                                       fast5s, alignment_args, args.nb_jobs)
        proposals = group_sites_in_window(proposals, 6)

        print("Cycle {cycle} - Got {nb} sites to check: {sites}".format(
            nb=len(proposals), sites=proposals, cycle=cycle))

        updated_reference_string = update_reference_with_marginal_probs(
            temp_folder, proposals, reference_sequence_string, fast5s,
            alignment_args, args.nb_jobs)

        updated_reference_path = temp_folder.add_file_path(
            "cycle_snapshot.{cycle}.fa".format(cycle=cycle))

        # close the snapshot before it is re-read (next cycle's index / final copy)
        with open(updated_reference_path, 'w') as fasta_out:
            write_fasta("jamison{}".format(cycle), updated_reference_string, fasta_out)

        reference_sequence_path = updated_reference_path

    # copy final file
    copyfile(reference_sequence_path, temp_dir_path + args.corrected)
    return
def run(self, get_expectations=False):
    """Align one read to the reference by shelling out to ``./vanillaAlign``.

    Extracts the npRead, 2D sequence and models from ``self.in_fast5`` into
    a per-read temp folder, obtains a guide alignment (cigar and strand)
    from BWA, assembles the vanillaAlign command line from the instance's
    settings and runs it with ``os.system``.  The temp folder is removed
    after the command has run or when the read does not map.

    Args:
        get_expectations: When true, ask vanillaAlign for template and
            complement expectations files (``-t`` / ``-c``) instead of a
            posteriors file (``-u``).

    Returns:
        ``False`` if the fast5 file does not exist, if model/read extraction
        reports failure, or if the read maps to neither strand; ``True``
        once the command has been launched (its exit status is not checked).
    """
    # file checks
    if not os.path.isfile(self.in_fast5):
        print("signalAlign - problem with file path {file}".format(file=self.in_fast5))
        return False

    # Preamble set up before doing the alignment
    read_label = self.in_fast5.split("/")[-1]  # used in the posteriors file as identifier
    read_name = read_label[:-6]  # get the name without the '.fast5'

    # object for handling temporary files
    temp_folder = FolderHandler()
    temp_dir_path = temp_folder.open_folder(
        self.destination + "tempFiles_{readLabel}".format(readLabel=read_label))

    # read-specific files, could be removed later but are kept right now to make it
    # easier to rerun commands
    temp_np_read = temp_folder.add_file_path("temp_{read}.npRead".format(read=read_label))
    temp_2d_read = temp_folder.add_file_path("temp_2Dseq_{read}.fa".format(read=read_label))
    temp_t_model = temp_folder.add_file_path("template_model.model")
    temp_c_model = temp_folder.add_file_path("complement_model.model")

    # make the npRead and fasta; the model paths are rebound to whatever the
    # helper returns (they are checked against None below)
    success, temp_t_model, temp_c_model = get_npRead_2dseq_and_models(
        fast5=self.in_fast5,
        npRead_path=temp_np_read,
        twod_read_path=temp_2d_read,
        template_model_path=temp_t_model,
        complement_model_path=temp_c_model,
    )
    if success is False:
        return False

    # add an indicator for the model being used
    if self.stateMachineType == "threeState":
        model_label = ".sm"
        stateMachineType_flag = "--s "
    elif self.stateMachineType == "fourState":
        model_label = ".4s"
        stateMachineType_flag = "--f "
    elif self.stateMachineType == "echelon":
        model_label = ".e"
        stateMachineType_flag = "--e "
    else:
        model_label = ".vl"
        stateMachineType_flag = ""

    # get orientation and cigar from BWA this serves as the guide alignment
    cigar_string, strand = exonerated_bwa(
        bwa_index=self.bwa_index,
        query=temp_2d_read,
        target_regions=self.target_regions,
    )

    # this gives the format: /directory/for/files/file.model.orientation.tsv
    if strand == "+":
        posteriors_file_path = self.destination + read_name + model_label + ".forward.tsv"
    elif strand == "-":
        posteriors_file_path = self.destination + read_name + model_label + ".backward.tsv"
    else:
        # didn't map
        print("signalAlign - {} didn't map".format(read_label), file=sys.stderr)
        temp_folder.remove_folder()
        return False

    # Alignment routine
    path_to_vanillaAlign = "./vanillaAlign"  # todo could require this in path

    def optional_flag(switch, value):
        # Render "<switch> <value> " for the command line, or "" when unset.
        if value is None:
            return ""
        return "{switch} {value} ".format(switch=switch, value=value)

    # input (match) models: a caller-supplied model takes precedence over the
    # one extracted from the read.  Previously the supplied model was always
    # overwritten by the read's model, or dropped when the read had none.
    template_model_flag = optional_flag(
        "-T", self.in_templateModel if self.in_templateModel is not None else temp_t_model)
    complement_model_flag = optional_flag(
        "-C", self.in_complementModel if self.in_complementModel is not None else temp_c_model)

    # input HMMs
    template_hmm_flag = optional_flag("-y", self.in_templateHmm)
    complement_hmm_flag = optional_flag("-z", self.in_complementHmm)

    # threshold, diagonal expansion, constraint trim
    threshold_flag = optional_flag("-d", self.threshold)
    diag_expansion_flag = optional_flag("-x", self.diagonal_expansion)
    trim_flag = optional_flag("-m", self.constraint_trim)

    # banded alignment
    banded_flag = "--b " if self.banded is True else ""

    # commands: both variants share everything up to the trim flag
    command = (
        "echo {cigar} | {vA} {banded}{model}-r {ref} -q {npRead} "
        "{t_model}{c_model}{t_hmm}{c_hmm}{thresh}{expansion}{trim}".format(
            cigar=cigar_string,
            vA=path_to_vanillaAlign,
            banded=banded_flag,
            model=stateMachineType_flag,
            ref=self.reference,
            npRead=temp_np_read,
            t_model=template_model_flag,
            c_model=complement_model_flag,
            t_hmm=template_hmm_flag,
            c_hmm=complement_hmm_flag,
            thresh=threshold_flag,
            expansion=diag_expansion_flag,
            trim=trim_flag,
        )
    )
    if get_expectations:
        template_expectations_file_path = self.destination + read_name + ".template.expectations"
        complement_expectations_file_path = (
            self.destination + read_name + ".complement.expectations")
        command += " -L {readLabel} -t {templateExpectations} -c {complementExpectations}".format(
            readLabel=read_label,
            templateExpectations=template_expectations_file_path,
            complementExpectations=complement_expectations_file_path,
        )
    else:
        command += " -u {posteriors} -L {readLabel}".format(
            posteriors=posteriors_file_path,
            readLabel=read_label,
        )

    # run
    # NOTE(review): the command is passed to a shell as one string; paths with
    # spaces or shell metacharacters would need quoting -- confirm inputs are trusted.
    print("signalAlign - running command: ", command, end="\n", file=sys.stderr)
    os.system(command)
    temp_folder.remove_folder()
    return True