def batch_for_batch():
    """Write GTR and nested-model batch files for every mouse alignment.

    Runs inside the hard-coded ``FullLengthMouse`` working directory.  For each
    ``.aln`` file in the ``mouse_8_species`` folder an ``.input`` file is
    produced with ``DH.aln2input`` (only when it does not exist yet), then one
    batch file per model is written through ``write_batch_file``.
    """
    os.chdir("/home/zerodel/Workspace/mouse/FullLengthMouse")
    aln_path = "/home/zerodel/Workspace/mouse/mouse_8_species"

    # Writers are created in the same order as before: nested model, then GTR.
    bf_maker_nest = BfH.HYPHYBatchFile(
        species_name="mouse",
        model_file="rna_structure_full_mouse.mdl",
        bf_template_file="template_global",
    )
    bf_maker_gtr = BfH.HYPHYBatchFile(
        species_name="mouse",
        model_file="rebuild_model.mdl",
        bf_template_file="template_global",
    )
    # Batch files are written GTR first, nested second, for every job.
    writers = ((bf_maker_gtr, "gtr"), (bf_maker_nest, "nest"))

    jobids = []
    for entry in os.listdir(aln_path):
        if entry.endswith(".aln"):
            # The job id is everything before the first dot of the file name.
            jobids.append(entry.partition(".")[0])

    for job_id in jobids:
        source_aln = os.path.join(aln_path, job_id + ".aln")
        dot_input_name = job_id + ".input"
        local_input = os.path.join(os.curdir, dot_input_name)
        if not os.path.exists(local_input):
            DH.aln2input(source_aln, local_input)

        for writer, tag in writers:
            writer.write_batch_file(
                dot_input=dot_input_name,
                dot_aln=source_aln,
                hyphy_result_file=job_id + tag + ".result",
                hyphy_batch_file=job_id + tag + ".bf",
            )
def test_aln_to_no_gap_input(self):
    """Unequal-length sequences must be rejected; then convert ./tmp.aln."""
    rejected = False
    try:
        dh.check_sequence_matrix(["aaca", "aaa"])
    except dh.sequenceNotSameLength:
        rejected = True

    if not rejected:
        self.fail("here should raise a exception!")

    # Conversion runs outside the try block so its own errors are not caught.
    dh.aln2inputNogap("./tmp.aln", "./tmp2.input")
def test_aln_to_no_gap_input(self):
    """Check the length validation, then run the no-gap conversion."""
    ragged_matrix = ["aaca", "aaa"]
    source_aln, target_input = "./tmp.aln", "./tmp2.input"

    try:
        dh.check_sequence_matrix(ragged_matrix)
    except dh.sequenceNotSameLength:
        # Expected: the two sequences differ in length.
        pass
    else:
        self.fail("here should raise a exception!")

    dh.aln2inputNogap(source_aln, target_input)
def make_bf_p2():
    """Write part-2 (position 51 to end) batch files for the E. coli alignments.

    Works inside the hard-coded ``P2`` folder.  Every ``.aln`` file of the
    ``ecoli_10_species`` folder is converted to an ``.input`` file; alignments
    whose first reported length is below 52 are reported and skipped, the
    others get one GTR and one nested-model batch file restricted to the
    partition ``(51, length)``.
    """
    # Raw strings: "\W", "\E", "\P" and "\e" are not valid escape sequences.
    # The values are unchanged, but the non-raw spelling is deprecated.
    workpath = r"d:\Workspace\Ecoli\P2"
    os.chdir(workpath)

    model_gtr = "rebuild_model.mdl"
    model_nest = "rna_full_length_structure.mdl"
    bf_maker_gtr = BfH.HYPHYBatchFile(species_name="ecoli",
                                      model_file=model_gtr,
                                      bf_template_file="templateNoGap")
    bf_maker_nest = BfH.HYPHYBatchFile(species_name="ecoli",
                                       model_file=model_nest,
                                       bf_template_file="templateNoGap")
    # The tree comes from SS.ecoli(), which is why dot_aln is passed empty below.
    bf_maker_gtr.set_tree_from_outside(SS.ecoli())
    bf_maker_nest.set_tree_from_outside(SS.ecoli())

    aln_file_folder = r"d:\Workspace\Ecoli\ecoli_10_species"
    aln_files = [file_single for file_single in os.listdir(aln_file_folder)
                 if ".aln" == os.path.splitext(file_single)[-1]]

    for single_aln in aln_files:
        aln_full_path = os.path.join(aln_file_folder, single_aln)
        _, lengths = DH.aln_info(aln_full_path)
        gene_full_length = lengths[0]
        jobid = single_aln.split(".")[0]

        # The input file is written even for alignments that are skipped below.
        input_file_name = "%s.input" % jobid
        DH.aln2input(aln_full_path, input_file_name)

        if gene_full_length < 52:
            # Single-argument call form prints the same text on Python 2 and 3.
            print("%s too short ---" % single_aln)
            continue

        bf_maker_gtr.set_partition(51, gene_full_length)
        bf_maker_nest.set_partition(51, gene_full_length)
        bf_maker_gtr.write_batch_file(dot_input=input_file_name,
                                      dot_aln="",
                                      hyphy_batch_file="%sp2gtr.bf" % jobid,
                                      hyphy_result_file="%sp2gtr.result" % jobid)
        bf_maker_nest.write_batch_file(dot_input=input_file_name,
                                       dot_aln="",
                                       hyphy_batch_file="%sp2nest.bf" % jobid,
                                       hyphy_result_file="%sp2nest.result" % jobid)
def write_batch_file(self, dot_input, dot_aln, hyphy_batch_file="", hyphy_result_file=""):
    """Fill in the batch-file template and write it to *hyphy_batch_file*.

    Each placeholder pattern stored on the instance (``f_input``,
    ``f_partition``, ``f_mdl``, ``f_matrix_name``, ``f_tree``, ``f_output``)
    is replaced in ``self.batch_content``; the hit count of every replacement
    is handed to ``self._error_no_hit``.

    :param dot_input: name of the ``.input`` file; also the stem for the
        default batch/result file names
    :param dot_aln: alignment file, read only when no external tree was given
    :param hyphy_batch_file: output path, defaults to ``<stem>.bf``
    :param hyphy_result_file: result path written into the batch file,
        defaults to ``<stem>.result``
    :raises BFError: when ``self.batch_content`` is empty
    """
    path_main = os.path.splitext(dot_input)[0]
    if "" == hyphy_batch_file:
        hyphy_batch_file = path_main + ".bf"
    if "" == hyphy_result_file:
        hyphy_result_file = path_main + ".result"

    if "" == self.batch_content:
        raise BFError

    def substitute(placeholder, replacement, text):
        # A callable replacement is inserted verbatim.  A plain string would
        # be escape-processed by re, so a path such as "C:\work\x.result"
        # would raise or be silently altered.
        text, num_hits = re.subn(placeholder, lambda _match: replacement, text)
        self._error_no_hit(num_hits)
        return text

    batch_content = substitute(self.f_input, dot_input, self.batch_content)

    # The partition is optional: (0, 0) means "no partition", in which case
    # the placeholder is blanked out if the template has one.
    if (0, 0) == self.partition:
        if self.f_partition in batch_content:
            batch_content = substitute(self.f_partition, "", batch_content)
    else:
        batch_content = substitute(self.f_partition,
                                   "%d-%d" % self.partition,
                                   batch_content)

    batch_content = substitute(self.f_mdl, self.mdl_file, batch_content)

    # only support 1 matrix now :2014-5-26
    batch_content = substitute(self.f_matrix_name, self.matrix_name[0], batch_content)

    if self.use_given_tree:
        tree_newick_string = self.tree_definition_external
    else:
        genes_share_aln = pHdata.aln_reader(dot_aln)
        tree_newick_string = self.build_tree(genes_share_aln)
    batch_content = substitute(self.f_tree, tree_newick_string, batch_content)

    batch_content = substitute(self.f_output, hyphy_result_file, batch_content)

    self.check_whether_incomplete(batch_content)

    # Positional arguments: the ``name=`` keyword exists only on Python 2.
    with open(hyphy_batch_file, "w") as bf_writer:
        bf_writer.write(batch_content)
def check_gene_order_in_alignments(path_to_alignments):
    """Check that every ``.aln`` file in a folder lists the same species names.

    The first ``.aln`` file returned by ``os.listdir`` is the reference; the
    first element of ``dh.aln_info`` for every file is compared against it.
    Mismatching files are printed, followed by a summary line.

    :param path_to_alignments: folder holding the ``.aln`` files
    :return: the reference list when every file matches it; ``None`` when the
        pyHYPHY module cannot be imported
    :raises AlignmentNotSame: when at least one file differs from the reference
    :raises IndexError: when the folder holds no ``.aln`` file
    """
    import os
    import os.path

    # Files are addressed by full path instead of changing into the folder.
    # This keeps the working directory untouched on every exit path, and a
    # relative path_to_alignments is no longer resolved a second time from
    # inside itself.
    aln_files = [single_file for single_file in os.listdir(path_to_alignments)
                 if ".aln" == os.path.splitext(single_file)[-1]]

    try:
        import pyHYPHY.DataHYPHY as dh
    except ImportError:
        print("error in importing pyHYPHY module")
        return

    aln_genes_calibration = dh.aln_info(
        os.path.join(path_to_alignments, aln_files[0]))[0]

    gene_un_match = 0
    for aln_entry in aln_files:
        # Parse each file once and reuse the result for the report line.
        genes_in_entry = dh.aln_info(
            os.path.join(path_to_alignments, aln_entry))[0]
        if not aln_genes_calibration == genes_in_entry:
            # Same text the multi-item print statement produced.
            print("%s :----  %s \n" % (aln_entry, str(genes_in_entry)))
            gene_un_match += 1

    print("whole number of unmatch file is %d \n and pattern is :\n %s \n"
          % (gene_un_match, aln_genes_calibration))

    if 0 == gene_un_match:
        return aln_genes_calibration
    else:
        raise AlignmentNotSame
def batch_for_batch():
    """Generate HyPhy batch files (GTR and nested model) for the mouse data.

    Changes into the hard-coded ``FullLengthMouse`` directory, then walks the
    ``.aln`` files of the ``mouse_8_species`` folder.  A missing ``.input``
    file is created with ``DH.aln2input`` before the two batch files of the
    job are written.
    """
    work_dir = "/home/zerodel/Workspace/mouse/FullLengthMouse"
    aln_path = "/home/zerodel/Workspace/mouse/mouse_8_species"
    os.chdir(work_dir)

    def new_writer(model_file):
        # Both writers share species and template and differ only in the model.
        return BfH.HYPHYBatchFile(species_name="mouse",
                                  model_file=model_file,
                                  bf_template_file="template_global")

    bf_maker_nest = new_writer("rna_structure_full_mouse.mdl")
    bf_maker_gtr = new_writer("rebuild_model.mdl")

    # Job id = text before the first dot of every name ending in ".aln".
    jobids = [name.split(".")[0]
              for name in os.listdir(aln_path)
              if name[-4:] == ".aln"]

    for job_id in jobids:
        aln_file_path_full = os.path.join(aln_path, "%s.aln" % job_id)
        input_name = "%s.input" % job_id

        input_file = os.path.join(os.curdir, input_name)
        if not os.path.exists(input_file):
            DH.aln2input(aln_file_path_full, input_file)

        bf_maker_gtr.write_batch_file(dot_input=input_name,
                                      dot_aln=aln_file_path_full,
                                      hyphy_result_file="%sgtr.result" % job_id,
                                      hyphy_batch_file="%sgtr.bf" % job_id)
        bf_maker_nest.write_batch_file(dot_input=input_name,
                                       dot_aln=aln_file_path_full,
                                       hyphy_result_file="%snest.result" % job_id,
                                       hyphy_batch_file="%snest.bf" % job_id)
def check_aln_full_species(folder_alns):
    """Return the ``.aln`` file names in *folder_alns* that hold the most genes.

    The gene count of a file is the length of the first element returned by
    ``dh.aln_info``.  Every file whose count equals the largest count found is
    returned, in ``os.listdir`` order.

    :param folder_alns: folder to scan for files ending in ``.aln``
    :return: list of file names (not full paths); empty when the folder holds
        no ``.aln`` file
    """
    all_alns = [single_file for single_file in os.listdir(folder_alns)
                if single_file[-4:] == ".aln"]
    if not all_alns:
        # max() on an empty list raises ValueError; there is nothing to rank.
        return []

    aln_gene_num = [len(dh.aln_info(os.path.join(folder_alns, single_file))[0])
                    for single_file in all_alns]
    max_num = max(aln_gene_num)

    return [single_file
            for single_file, gene_num in zip(all_alns, aln_gene_num)
            if gene_num == max_num]
def check_aln_full_species(folder_alns):
    """List the alignments of *folder_alns* that contain the maximum gene count.

    Each file ending in ``.aln`` is inspected with ``dh.aln_info``; the length
    of its first return value is taken as the gene count of that file.

    :param folder_alns: folder containing the ``.aln`` files
    :return: names of the files whose gene count equals the maximum over the
        folder; ``[]`` when there is no ``.aln`` file at all
    """
    all_alns = [
        single_file
        for single_file in os.listdir(folder_alns)
        if single_file[-4:] == ".aln"
    ]

    gene_count_by_file = []
    for single_file in all_alns:
        full_aln_path = os.path.join(folder_alns, single_file)
        genes = dh.aln_info(full_aln_path)[0]
        gene_count_by_file.append((single_file, len(genes)))

    if not gene_count_by_file:
        # Without this guard max() would raise ValueError on an empty folder.
        return []

    max_num = max(count for _, count in gene_count_by_file)
    return [single_file
            for single_file, count in gene_count_by_file
            if count == max_num]
def check_aln_files():
    """Report how many ``.aln`` files hold the largest number of genes.

    Scans the hard-coded ``ecoli_aln`` folder, takes the length of the first
    element returned by ``DataHyPHY.aln_info`` as the gene count of a file,
    and prints how many files reach the maximum count and what that count is.
    An empty folder is reported as 0 files with 0 genes.

    :return: None
    """
    folder_alns = "/media/zerodel/Home/Work/custom/ecoli_aln"
    all_alns = [single_file for single_file in os.listdir(folder_alns)
                if single_file[-4:] == ".aln"]

    aln_gene_num = []
    for single_file in all_alns:
        full_aln_path = os.path.join(folder_alns, single_file)
        genes = DataHyPHY.aln_info(full_aln_path)[0]
        aln_gene_num.append(len(genes))

    # Computed once; the maximum used to be re-evaluated for every file.
    max_num = max(aln_gene_num) if aln_gene_num else 0

    aln_full_gene = [single_file
                     for single_file, gene_num in zip(all_alns, aln_gene_num)
                     if gene_num == max_num]

    # One pre-formatted string prints identically on Python 2 and 3; the
    # double space after "with" reproduces the original statement's output.
    print("fulllength has %d with  %s genes" % (len(aln_full_gene), str(max_num)))
def check_aln_files():
    """Print the number of alignments that contain the maximum gene count.

    Every file ending in ``.aln`` in the hard-coded ``ecoli_aln`` folder is
    read with ``DataHyPHY.aln_info``; the length of its first return value is
    the gene count of that file.  When the folder has no ``.aln`` file the
    report shows 0 files and 0 genes.

    :return: None
    """
    folder_alns = "/media/zerodel/Home/Work/custom/ecoli_aln"
    all_alns = [
        single_file
        for single_file in os.listdir(folder_alns)
        if single_file[-4:] == ".aln"
    ]

    aln_gene_num = [
        len(DataHyPHY.aln_info(os.path.join(folder_alns, single_file))[0])
        for single_file in all_alns
    ]

    # Hoisted out of the counting step: max() was previously called once per
    # file, and it raises ValueError on an empty list.
    max_num = max(aln_gene_num) if aln_gene_num else 0
    full_count = sum(1 for gene_num in aln_gene_num if gene_num == max_num)

    # Same text as the original multi-item print statement, on Python 2 and 3.
    print("fulllength has %d with  %s genes" % (full_count, str(max_num)))
def test_something(self):
    """dh.remove_gaps must turn "aca----gt" into "aca"."""
    self.assertEqual("aca", dh.remove_gaps("aca----gt"))
def test_remove_gap_matrix(self):
    """dh.remove_gaps_matrix must reduce the sample matrix to its first 3 columns."""
    expected_rows = ["aaa", "bbb"]
    actual_rows = dh.remove_gaps_matrix(["aaa--a", "bbbcac"])
    self.assertEqual(expected_rows, actual_rows)
current_dir = os.path.abspath(os.curdir) os.chdir(path_aln_file) species_list = check_gene_order_in_alignments(path_aln_file) inputfile_header = [">%s" % species_name for species_name in species_list] matrix_sequence = ["" for species_name in species_list] aln_files = [single_file for single_file in os.listdir(path_aln_file) if ".aln" == os.path.splitext(single_file)[-1]] for single_aln_file in aln_files: # single file operation # rejection : 1. length not enough 2 two many gaps try: <<<<<<< HEAD matrix_sequence = paste_matrix(matrix_sequence,DH.remove_gaps_matrix(extract_TIR_single_file(single_aln_file, length_of_TIR, start_point))) ======= if remove_gap: gene_seq_matrix_addition = DH.remove_gaps_matrix(extract_TIR_single_file(single_aln_file, length_of_TIR, start_point)) else: gene_seq_matrix_addition = extract_TIR_single_file(single_aln_file, length_of_TIR, start_point) matrix_sequence = paste_matrix(matrix_sequence, gene_seq_matrix_addition) >>>>>>> f0acd743d1106c96b88083b2df2cb3526b388aec except SequenceTooShort: print "Too short in %s" % single_aln_file continue except DimNotSame: print "Error of Dimisions %s" % single_aln_file
def _write_model_batch_files(model_table, stage, job):
    """Write one batch file per model for a single alignment and stage.

    ``stage`` selects the (batch-file pattern, result-file pattern) pair of
    every row in ``model_table``: 0 = full length, 1 = part 1, 2 = part 2.
    """
    aln_name, gene_id, input_filename = job[:3]
    for maker, patterns in model_table:
        bf_pattern, result_pattern = patterns[stage]
        maker.write_batch_file(dot_aln=aln_name,
                               dot_input=input_filename,
                               hyphy_batch_file=bf_pattern % gene_id,
                               hyphy_result_file=result_pattern % gene_id)


def aln_folder_traversal(folder_name):
    """Write input and batch files for every ``.aln`` file in *folder_name*.

    For each alignment an ``.input`` file is produced and, for each of the
    four models (GY94, nested GY, nucleotide-nested GY, GU), three batch
    files are written: full length, part 1 (partition 0-60) and part 2
    (partition 60 to the longest sequence length of the alignment).

    The work is done in three passes over the alignments.  A partition set
    with ``set_partition`` stays on the writer object, so writing part 2 of
    one gene and then the "full length" file of the next gene with the same
    writer would restrict that file to the previous gene's partition.  All
    full-length files are therefore written before any partition is set.

    :param folder_name: folder with the ``.aln`` files; it also becomes the
        working directory and receives all output files
    """
    built_in_gy94mdl = "/usr/lib/hyphy/TemplateBatchFiles/TemplateModels/GY94.mdl"
    gy94bf = bfHYPHY.HYPHYBatchFile(species_name="ecoli",
                                    model_file=built_in_gy94mdl,
                                    bf_template_file="partition.bf")
    own_model = "nest_gy.mdl"
    nested_model = bfHYPHY.HYPHYBatchFile(species_name="ecoli",
                                          model_file=own_model,
                                          bf_template_file="partition.bf")
    nt_nest_model = "nt_nest_gy.mdl"
    bf_nt_nest = bfHYPHY.HYPHYBatchFile(species_name="ecoli",
                                        model_file=nt_nest_model,
                                        bf_template_file="partition.bf")
    gu_model = "myCodonMatrix.def"
    gu_bf = bfHYPHY.HYPHYBatchFile(species_name="ecoli",
                                   model_file=gu_model,
                                   bf_template_file="synAlphaWPsiModelP.bf")

    # Per model: (batch file, result file) name patterns for the stages
    # full length / part 1 / part 2.  The produced names are unchanged.
    model_table = (
        (gy94bf, (("%sgy94.bf", "gy94%s_.result"),
                  ("%sgy94p1.bf", "gy94p1%s_.result"),
                  ("%sgy94p2.bf", "gy94p2%s_.result"))),
        (nested_model, (("%snest_gy.bf", "nest_gy%s.result"),
                        ("%snest_gyp1.bf", "nest_gy%s_p1.result"),
                        ("%snest_gyp2.bf", "nest_gy%s_p2.result"))),
        (bf_nt_nest, (("%snt_nest.bf", "%snt_nest.result"),
                      ("%snt_nest_p1.bf", "%snt_nest_p1.result"),
                      ("%snt_nest_p2.bf", "%snt_nest_p2.result"))),
        (gu_bf, (("%sgu.bf", "%sgu.result"),
                 ("%sgup1.bf", "%sgup1.result"),
                 ("%sgup2.bf", "%sgup2.result"))),
    )

    pwd = os.path.abspath(folder_name)
    os.chdir(folder_name)
    aln_files = [file1 for file1 in os.listdir(pwd)
                 if "aln" == file1.split(".")[-1]]

    # Collect per-alignment data and write the input files.
    jobs = []
    for aln_name in aln_files:
        gene_id = aln_name.split(".")[0]
        _, gene_len = dataHYPHY.aln_info(aln_name)
        input_filename = "%s.input" % gene_id
        dataHYPHY.aln2input(dot_aln_file=aln_name,
                            hyphy_input_file=input_filename)
        jobs.append((aln_name, gene_id, input_filename, max(gene_len)))

    # Pass 1: full length, written while no partition has been set yet.
    # NOTE(review): assumes a freshly built HYPHYBatchFile carries no
    # partition, as the first loop iteration always did; confirm.
    for job in jobs:
        _write_model_batch_files(model_table, 0, job)

    # Pass 2: part 1 uses the same fixed partition for every alignment.
    for maker, _ in model_table:
        maker.set_partition(0, 60)
    for job in jobs:
        _write_model_batch_files(model_table, 1, job)

    # Pass 3: part 2 ends at the longest sequence of each alignment.
    for job in jobs:
        len_gene = job[3]
        for maker, _ in model_table:
            maker.set_partition(60, len_gene)
        _write_model_batch_files(model_table, 2, job)
def main():
    """Build no-gap input files and batch files for the E. coli alignments.

    Only alignments shared by all 10 species are processed.  Each of them is
    copied into the target folder and converted with ``dh.aln2inputNogap``;
    then GTR and nested-model batch files are written for the full length,
    for part 1 (partition 0-51) and for part 2 (partition 51 to the first
    sequence length reported by ``dh.aln_info``).
    """
    aln_files_folder = "d:/Workspace/Ecoli/ecoli_10_species"
    target_path = "d:/Workspace/Ecoli/NoGapP1"
    aln_files = [single_file for single_file in os.listdir(aln_files_folder)
                 if ".aln" == os.path.splitext(single_file)[-1]]

    model_gtr = "rebuild_model.mdl"
    model_nest = "rna_full_length_structure.mdl"

    job_ids = []
    seq_length = []
    for aln in aln_files:
        aln_path_full = os.path.join(aln_files_folder, aln)
        gene_in_aln, length_in_aln = dh.aln_info(aln_path_full)
        if 10 != len(gene_in_aln):
            # only those gene shared in 10 species
            continue

        job_id = aln.split(".")[0]
        job_ids.append(job_id)
        # The length is recorded together with the job id.  It used to be
        # appended for every alignment, including filtered ones, so
        # seq_length[i] no longer belonged to job_ids[i] in the part-2 loop.
        seq_length.append(length_in_aln[0])

        input_file_path_full = os.path.join(target_path, job_id + ".input")
        shutil.copyfile(aln_path_full, os.path.join(target_path, aln))
        # write input file
        dh.aln2inputNogap(aln_path_full, input_file_path_full)

    # make bf file for full length no gap gtr and empirical model
    os.chdir(target_path)
    bf_maker_nest = BfH.HYPHYBatchFile(species_name="ecoli",
                                       model_file=model_nest,
                                       bf_template_file="templateNoGap")
    bf_maker_gtr = BfH.HYPHYBatchFile(species_name="ecoli",
                                      model_file=model_gtr,
                                      bf_template_file="templateNoGap")

    def write_pair(job_id, suffix):
        # GTR first, nested model second; suffix is "", "p1" or "p2".
        for maker, tag in ((bf_maker_gtr, "gtr"), (bf_maker_nest, "nest")):
            stem = job_id + tag + suffix
            maker.write_batch_file(dot_input=job_id + ".input",
                                   dot_aln=job_id + ".aln",
                                   hyphy_result_file=stem + ".result",
                                   hyphy_batch_file=stem + ".bf")

    # Full length: written for all jobs before any partition is set.
    for job_id in job_ids:
        write_pair(job_id, "")

    # Part 1: one fixed partition for every job.
    bf_maker_gtr.set_partition(0, 51)
    bf_maker_nest.set_partition(0, 51)
    for job_id in job_ids:
        write_pair(job_id, "p1")

    # Part 2: from position 51 to the length of each alignment.
    for indexI, job_id in enumerate(job_ids):
        bf_maker_gtr.set_partition(51, seq_length[indexI])
        bf_maker_nest.set_partition(51, seq_length[indexI])
        write_pair(job_id, "p2")
def main():
    """Prepare no-gap inputs and HyPhy batch files for the 10-species E. coli set.

    Alignments that do not contain exactly 10 genes are ignored.  The others
    are copied to the target folder and converted with ``dh.aln2inputNogap``.
    For every kept alignment six batch files are written: GTR and nested
    model, each for full length, part 1 (0-51) and part 2 (51 to the first
    sequence length reported by ``dh.aln_info``).
    """
    aln_files_folder = "d:/Workspace/Ecoli/ecoli_10_species"
    target_path = "d:/Workspace/Ecoli/NoGapP1"
    aln_files = [
        single_file
        for single_file in os.listdir(aln_files_folder)
        if ".aln" == os.path.splitext(single_file)[-1]
    ]

    model_gtr = "rebuild_model.mdl"
    model_nest = "rna_full_length_structure.mdl"

    # (job id, sequence length) pairs of the kept alignments.  Keeping both
    # values in one entry fixes the old mismatch, where lengths were stored
    # for every alignment but job ids only for the 10-species ones.
    jobs = []
    for aln in aln_files:
        aln_path_full = os.path.join(aln_files_folder, aln)
        gene_in_aln, length_in_aln = dh.aln_info(aln_path_full)
        if 10 == len(gene_in_aln):
            # only those gene shared in 10 species
            job_id = aln.split(".")[0]
            jobs.append((job_id, length_in_aln[0]))
            input_file_path_full = os.path.join(target_path, job_id + ".input")
            shutil.copyfile(aln_path_full, os.path.join(target_path, aln))
            # write input file
            dh.aln2inputNogap(aln_path_full, input_file_path_full)

    # make bf file for full length no gap gtr and empirical model
    os.chdir(target_path)
    bf_maker_nest = BfH.HYPHYBatchFile(species_name="ecoli",
                                       model_file=model_nest,
                                       bf_template_file="templateNoGap")
    bf_maker_gtr = BfH.HYPHYBatchFile(species_name="ecoli",
                                      model_file=model_gtr,
                                      bf_template_file="templateNoGap")

    # Full length first: no partition has been set on the writers yet.
    for job_id, _ in jobs:
        bf_maker_gtr.write_batch_file(dot_input=job_id + ".input",
                                      dot_aln=job_id + ".aln",
                                      hyphy_result_file=job_id + "gtr.result",
                                      hyphy_batch_file=job_id + "gtr.bf")
        bf_maker_nest.write_batch_file(dot_input=job_id + ".input",
                                       dot_aln=job_id + ".aln",
                                       hyphy_result_file=job_id + "nest.result",
                                       hyphy_batch_file=job_id + "nest.bf")

    # Part 1 shares one partition across all jobs, so it is set only once.
    bf_maker_gtr.set_partition(0, 51)
    bf_maker_nest.set_partition(0, 51)
    for job_id, _ in jobs:
        bf_maker_gtr.write_batch_file(dot_input=job_id + ".input",
                                      dot_aln=job_id + ".aln",
                                      hyphy_result_file=job_id + "gtrp1.result",
                                      hyphy_batch_file=job_id + "gtrp1.bf")
        bf_maker_nest.write_batch_file(dot_input=job_id + ".input",
                                       dot_aln=job_id + ".aln",
                                       hyphy_result_file=job_id + "nestp1.result",
                                       hyphy_batch_file=job_id + "nestp1.bf")

    # Part 2 depends on the length of each individual alignment.
    for job_id, job_length in jobs:
        bf_maker_gtr.set_partition(51, job_length)
        bf_maker_nest.set_partition(51, job_length)
        bf_maker_gtr.write_batch_file(dot_input=job_id + ".input",
                                      dot_aln=job_id + ".aln",
                                      hyphy_result_file=job_id + "gtrp2.result",
                                      hyphy_batch_file=job_id + "gtrp2.bf")
        bf_maker_nest.write_batch_file(dot_input=job_id + ".input",
                                       dot_aln=job_id + ".aln",
                                       hyphy_result_file=job_id + "nestp2.result",
                                       hyphy_batch_file=job_id + "nestp2.bf")