def do_astral(input, output):
    """
    Run the bundled ASTRAL jar on ``input`` and store the result in ``output``.

    :param input: value handed to ASTRAL's ``-i`` option
    :param output: directory that receives ``combine.tree``; created if absent
    """
    # getlocalpath() is defined elsewhere; presumably the folder holding the
    # bundled third-party tools -- confirm against its definition.
    tool_dir = getlocalpath()
    if not os.path.exists(output):
        os.mkdir(output)
    jar_file = tool_dir + "/astral.5.6.3.jar"
    result_file = output + "/" + "combine.tree"
    command = "".join(["java -jar ", jar_file, " -i ", input, " -o ", result_file])
    subprocess.call(command, shell=True)
def do_supertree(input, output):
    """
    Feed ``input`` to ``spr_supertree`` on stdin and capture its stdout.

    :param input: file redirected into the program's standard input
    :param output: directory that receives ``spr_supertree.tree``; created if absent
    """
    # getlocalpath() is defined elsewhere; presumably the folder holding the
    # bundled third-party tools -- confirm against its definition.
    tool_dir = getlocalpath()
    if not os.path.exists(output):
        os.mkdir(output)
    result_file = output + "/" + "spr_supertree.tree"
    command = "".join([tool_dir, "/spr_supertree", " < ", input, " > ", result_file])
    subprocess.call(command, shell=True)
def domuscle_file(indata_files, outdata, musclepara):
    """
    Align every file found in a directory with muscle.

    :param indata_files: directory whose entries are each aligned in turn
    :param outdata: path whose parent directory hosts the timestamped
        ``temp/alignment<timestamp>`` result folder
    :param musclepara: extra command-line options appended to each muscle call
    :return: path of the folder holding the alignments
    """
    extra_args = musclepara.lstrip()
    tool_dir = getlocalpath()
    stamp = str(time.strftime('%Y%m%d%H%M%S'))
    muscle_dir = os.path.join(os.path.dirname(outdata), 'temp/alignment' + stamp)
    entries = os.listdir(indata_files)
    if not os.path.exists(muscle_dir):
        os.makedirs(muscle_dir)
    for entry in entries:
        source = os.path.join(indata_files, entry)
        # Result is named after the part of the file name before the first dot.
        target = os.path.join(muscle_dir, entry.split('.')[0])
        command = "".join([tool_dir, "/muscle -in ", source, " -out ", target, " ", extra_args])
        subprocess.call(command, shell=True)
    logdomuscle.info("Multiple sequence alignment by Muscle was completed.")
    return muscle_dir
def domafft_file(indata_files, outdata, mafftparas, thread):
    """
    Align every file found in a directory with mafft, running jobs in parallel.

    :param indata_files: directory whose entries are each aligned
    :param outdata: path whose parent directory hosts the timestamped
        ``temp/alignment<timestamp>`` result folder
    :param mafftparas: extra command-line options placed before each input file
    :param thread: number of worker processes in the pool
    :return: path of the folder holding the alignments
    """
    mafftparas = mafftparas.lstrip()
    mapath = getlocalpath()
    out_path = os.path.dirname(outdata)
    timeinfo = str(time.strftime('%Y%m%d%H%M%S'))
    mafft_dir = os.path.join(out_path, 'temp/alignment' + timeinfo)
    pro_name = os.listdir(indata_files)
    if not os.path.exists(mafft_dir):
        os.makedirs(mafft_dir)
    all_cmd = []
    for i in pro_name:
        # Result is named after the part of the file name before the first dot.
        out_alg = os.path.join(mafft_dir, i.split('.')[0])
        each_pro = os.path.join(indata_files, i)
        all_cmd.append(mapath + "/mafft " + mafftparas + " " + each_pro + " > " + out_alg)
    pool = multiprocessing.Pool(processes=thread)
    try:
        # run_cmd is defined elsewhere; presumably it executes one shell command.
        pool.map(run_cmd, all_cmd)
    finally:
        # Always release the worker processes, even when a job raises.
        pool.close()
        pool.join()
    logmafft.info("Multiple sequence alignment by mafft was completed.")
    return mafft_dir
def dogblocks(indata, gblockpara):
    """
    Run Gblocks on an alignment and return the path of its result file.

    :param indata: alignment file handed to Gblocks
    :param gblockpara: Gblocks options; the ``-e=`` option, when present,
        supplies the extension appended to the result file name
    :return: ``indata`` with the Gblocks extension appended
    """
    gblockparas = gblockpara.lstrip()
    # Find the first option carrying the output extension ("-e=<ext>").
    ext_option = next(
        (token for token in gblockparas.split(" ") if '-e=' in token), None)
    if ext_option is None:
        # Without -e= Gblocks uses its own default extension; previously this
        # case crashed on an unbound variable.
        outdata = '-gb'
    else:
        outdata = ext_option.split('=')[1]
    gblockpath = getlocalpath()
    alg_name = os.path.basename(indata)
    out_path = os.path.dirname(indata)
    gblock_data = os.path.join(out_path, alg_name + outdata)
    cmd = gblockpath + "/Gblocks " + indata + " " + gblockparas
    subprocess.call(cmd, shell=True)
    loggblocks.info('Select conserved blocks by Gblocks was completed')
    loggblocks.debug('Gblocks path:{0}'.format(gblock_data))
    return gblock_data
def domafft_file(indata_files, outdata, mafftparas):
    """
    Align every file found in a directory with mafft, one after another.

    :param indata_files: directory whose entries are each aligned in turn
    :param outdata: path whose parent directory hosts the timestamped
        ``temp/alignment<timestamp>`` result folder
    :param mafftparas: extra command-line options placed before each input file
    :return: path of the folder holding the alignments
    """
    extra_args = mafftparas.lstrip()
    tool_dir = getlocalpath()
    stamp = str(time.strftime('%Y%m%d%H%M%S'))
    mafft_dir = os.path.join(os.path.dirname(outdata), 'temp/alignment' + stamp)
    entries = os.listdir(indata_files)
    if not os.path.exists(mafft_dir):
        os.makedirs(mafft_dir)
    for entry in entries:
        source = os.path.join(indata_files, entry)
        # Result is named after the part of the file name before the first dot.
        target = os.path.join(mafft_dir, entry.split('.')[0])
        command = "".join([tool_dir, "/mafft ", extra_args, " ", source, " > ", target])
        subprocess.call(command, shell=True)
    logmafft.info("Multiple sequence alignment by mafft was completed.")
    return mafft_dir
def doraxml(inputfile, outputfile, raxmlpara, thread):
    """
    Call RAxML to construct a species tree.

    Any ``-T <n>`` pair in ``raxmlpara`` is dropped and replaced by ``thread``.

    :param inputfile: alignment passed to RAxML's ``-s`` option
    :param outputfile: directory passed to RAxML's ``-w`` option; created if absent
    :param raxmlpara: extra RAxML options
    :param thread: thread count passed to ``-T``
    """
    raxmlparas = raxmlpara.lstrip()
    raxmlparalist = raxmlparas.split(" ")
    tpara = '-T'
    if tpara in raxmlparalist:
        index = raxmlparalist.index(tpara)
        # Delete by position: list.remove(value) would drop the FIRST token
        # equal to the thread count (e.g. the "4" of "-N 4"), and indexing past
        # a trailing "-T" would raise IndexError.
        del raxmlparalist[index:index + 2]
        raxmlpararet = ' '.join(raxmlparalist)
    else:
        raxmlpararet = raxmlpara
    threadtostr = str(thread)
    raxmlpath = getlocalpath()
    if not os.path.exists(outputfile):
        os.mkdir(outputfile)
    strs = raxmlpath + "/raxmlHPC-PTHREADS-AVX " + "-T " + threadtostr + " " + raxmlpararet
    cmd = strs + " -s " + inputfile + " -w " + outputfile
    subprocess.call(cmd, shell=True)
    logdoraxml.info("Phylogenetic species tree reconstructed by RAxML was completed")
def doclustalw_file(indata_files, outdata, clustalwpara):
    """
    Align every file found in a directory with clustalw2 (protein mode).

    :param indata_files: directory whose entries are each aligned in turn
    :param outdata: path whose parent directory hosts the result folder
    :param clustalwpara: extra clustalw2 options, or None for none
    :return: path of the folder holding the alignments
    """
    options = "-TYPE=PROTEIN"
    if clustalwpara is not None:
        options = options + " " + clustalwpara.lstrip()
    tool_dir = getlocalpath()
    # timeformat() is defined elsewhere; presumably it appends a timestamp.
    sub_dir = str(timeformat('temp/hcp_alignment'))
    clustalw_dir = os.path.join(os.path.dirname(outdata), sub_dir)
    entries = os.listdir(indata_files)
    if not os.path.exists(clustalw_dir):
        os.makedirs(clustalw_dir)
    for entry in entries:
        source = os.path.join(indata_files, entry)
        target_opt = "-OUTFILE=" + os.path.join(clustalw_dir, entry)
        command = "".join([
            tool_dir, "/clustalw2 ", "-INFILE=", source,
            " -OUTPUT=FASTA -ALIGN ", target_opt, " ", options,
        ])
        subprocess.call(command, shell=True)
    logdoclustalw.info("Multiple sequence alignment by Clustalw2 was completed.")
    return clustalw_dir
def dotrimal(indata, trimalpara):
    """
    Trim an alignment with trimAl, writing both PHYLIP and FASTA results.

    Two files are produced next to ``indata``: ``trimal.phy`` (PHYLIP) and
    ``trimal`` (FASTA, kept for FastTree).

    :param indata: alignment file handed to trimAl's ``-in`` option
    :param trimalpara: extra trimAl options
    :return: path of the PHYLIP result (``trimal.phy``)
    """
    trimalparas = trimalpara.lstrip()
    trimalpath = getlocalpath()
    out_path = os.path.dirname(indata)
    trimal_data = os.path.join(out_path, "trimal.phy")
    cmd = trimalpath + "/trimal " + " -in " + indata + " -out " + trimal_data + " " + trimalparas + " -phylip"
    subprocess.call(cmd, shell=True)
    # FASTA copy for FastTree. Build the name from the directory rather than
    # stripping '.phy' from the whole path, which would also mangle any
    # directory whose name contains '.phy'.
    trimal_data2 = os.path.join(out_path, "trimal")
    cmd2 = trimalpath + "/trimal " + " -in " + indata + " -out " + trimal_data2 + " " + trimalparas + " -fasta"
    subprocess.call(cmd2, shell=True)
    loggtrimal.info('Select conserved blocks by trimal was completed')
    loggtrimal.debug('trimal path:{0}'.format(trimal_data))
    return trimal_data
def doclustalw_file(indata_files, outdata, clustalwpara):
    """
    Run clustalw2 in protein mode over each entry of a directory.

    :param indata_files: directory whose entries are each aligned in turn
    :param outdata: path whose parent directory hosts the result folder
    :param clustalwpara: extra clustalw2 options, or None for none
    :return: path of the folder holding the alignments
    """
    seq_type = "-TYPE=PROTEIN"
    if clustalwpara is None:
        all_options = seq_type
    else:
        all_options = " ".join([seq_type, clustalwpara.lstrip()])
    binary = getlocalpath() + "/clustalw2 "
    parent = os.path.dirname(outdata)
    # timeformat() is defined elsewhere; presumably it appends a timestamp.
    clustalw_dir = os.path.join(parent, str(timeformat('temp/hcp_alignment')))
    names = os.listdir(indata_files)
    if not os.path.exists(clustalw_dir):
        os.makedirs(clustalw_dir)
    for name in names:
        in_opt = "-INFILE=" + os.path.join(indata_files, name)
        out_opt = "-OUTFILE=" + os.path.join(clustalw_dir, name)
        command = binary + in_opt + " -OUTPUT=FASTA -ALIGN " + out_opt + " " + all_options
        subprocess.call(command, shell=True)
    logdoclustalw.info(
        "Multiple sequence alignment by Clustalw2 was completed.")
    return clustalw_dir
def docontree(input, output, rule):
    """
    Build a consensus tree with RAxML's ``-J`` mode.

    :param input: tree file passed to the ``-z`` option
    :param output: directory passed to the ``-w`` option; created if absent
    :param rule: consensus rule passed to the ``-J`` option
    """
    tool_dir = getlocalpath()
    # Make sure the result directory exists before RAxML writes into it.
    if not os.path.exists(output):
        os.mkdir(output)
    command = "".join([
        tool_dir, "/raxmlHPC-PTHREADS-AVX ", " -J ", rule,
        " -m GTRCAT -z ", input, " -w ", output, " -n T1",
    ])
    subprocess.call(command, shell=True)
def doiqtree(inputfile, outputfile, iqtreepara, thread):
    """
    Call iqtree to construct a species tree.

    :param inputfile: alignment passed to the ``-s`` option
    :param outputfile: directory for results; created if absent. Output files
        use the prefix ``<outputfile>/iqtree.tree``
    :param iqtreepara: extra iqtree options
    :param thread: thread count; ``-nt`` is only passed when it is not 1
    """
    iqtreePath = getlocalpath()
    thread_to_str = str(thread)
    out_tree_name = os.path.join(outputfile, "iqtree.tree")
    if not os.path.exists(outputfile):
        os.mkdir(outputfile)
    cmd = iqtreePath + "/iqtree " + "-s " + inputfile + " -pre " + out_tree_name
    # Compare by value: `is '1'` tested object identity, which only works by
    # accident of string caching.
    if thread_to_str != '1':
        cmd += " -nt " + thread_to_str
    # Explicit separator so the options never fuse with the preceding value.
    cmd += " " + iqtreepara
    subprocess.call(cmd, shell=True)
    logdoiqtree.info("Phylogenetic species tree reconstructed by iqtree was completed")
def doFastTree(inputfile, outputfile, FastTreepara, thread):
    """
    Call FastTree (or FastTreeMP when multi-threaded) to build a species tree.

    The FASTA input is derived from ``inputfile`` by removing ``.phy`` from its
    file name.

    :param inputfile: path of the PHYLIP alignment; the FASTA sibling is used
    :param outputfile: directory for results; created if absent. The tree is
        written to ``<outputfile>/FastTree.tree``
    :param FastTreepara: extra FastTree options
    :param thread: thread count; values other than 1 select FastTreeMP and set
        ``OMP_NUM_THREADS``
    """
    # Strip '.phy' from the file name only, never from directory components.
    in_dir, in_name = os.path.split(inputfile)
    input_fasta = os.path.join(in_dir, in_name.replace('.phy', ''))
    FastTreePath = getlocalpath()
    thread_to_str = str(thread)
    out_tree_name = os.path.join(outputfile, "FastTree.tree")
    if not os.path.exists(outputfile):
        os.mkdir(outputfile)
    # Compare by value: `is '1'` tested object identity.
    if thread_to_str == '1':
        binary = "/FastTree "
    else:
        os.environ["OMP_NUM_THREADS"] = thread_to_str
        binary = "/FastTreeMP "
    # The " " keeps the last option from fusing with the input path.
    cmd = FastTreePath + binary + FastTreepara + " " + input_fasta + " >" + out_tree_name
    subprocess.call(cmd, shell=True)
    logdofasttree.info("Phylogenetic species tree reconstructed by FastTree was completed")
def doiqtree(inputfile, outputfile, iqtreepara, thread):
    """
    Call iqtree to construct a species tree.

    :param inputfile: alignment passed to the ``-s`` option
    :param outputfile: directory for results; created if absent. Output files
        use the prefix ``<outputfile>/iqtree.tree``
    :param iqtreepara: extra iqtree options
    :param thread: thread count; ``-nt`` is only passed when it is not 1
    """
    iqtreePath = getlocalpath()
    thread_to_str = str(thread)
    out_tree_name = os.path.join(outputfile, "iqtree.tree")
    if not os.path.exists(outputfile):
        os.mkdir(outputfile)
    pieces = [iqtreePath + "/iqtree " + "-s " + inputfile, "-pre " + out_tree_name]
    # Value comparison replaces the former identity test (`is '1'`).
    if thread_to_str != '1':
        pieces.append("-nt " + thread_to_str)
    # Joining with a space guarantees a separator in front of the options.
    pieces.append(iqtreepara)
    cmd = " ".join(pieces)
    subprocess.call(cmd, shell=True)
    logdoiqtree.info(
        "Phylogenetic species tree reconstructed by iqtree was completed")
def doFastTree(inputfile, outputfile, FastTreepara, thread):
    """
    Call FastTree (or FastTreeMP when multi-threaded) to build a species tree.

    The FASTA input is derived from ``inputfile`` by removing ``.phy`` from its
    file name.

    :param inputfile: path of the PHYLIP alignment; the FASTA sibling is used
    :param outputfile: directory for results; created if absent. The tree is
        written to ``<outputfile>/FastTree.tree``
    :param FastTreepara: extra FastTree options
    :param thread: thread count; values other than 1 select FastTreeMP and set
        ``OMP_NUM_THREADS``
    """
    # Strip '.phy' from the file name only, never from directory components.
    in_dir, in_name = os.path.split(inputfile)
    input_fasta = os.path.join(in_dir, in_name.replace('.phy', ''))
    FastTreePath = getlocalpath()
    thread_to_str = str(thread)
    out_tree_name = os.path.join(outputfile, "FastTree.tree")
    if not os.path.exists(outputfile):
        os.mkdir(outputfile)
    # Compare by value: `is '1'` tested object identity.
    if thread_to_str == '1':
        binary = "/FastTree "
    else:
        os.environ["OMP_NUM_THREADS"] = thread_to_str
        binary = "/FastTreeMP "
    cmd = FastTreePath + binary + FastTreepara + " " + input_fasta + " >" + out_tree_name
    subprocess.call(cmd, shell=True)
    logdofasttree.info(
        "Phylogenetic species tree reconstructed by FastTree was completed")
def doclustalw(indata, outdata, clustalwpara):
    """
    Call clustalw2 (DNA mode) to align a single FASTA file.

    :param indata: a FASTA file, or a directory; for a directory only the first
        entry returned by ``os.listdir`` is aligned
    :param outdata: path whose parent directory hosts the result folder
    :param clustalwpara: extra clustalw2 options, or None for none
    :return: path of the aligned file, or None when ``indata`` is neither a
        directory nor a file
    """
    logdoclustalw.debug("clustalw input data:{0}".format(indata))
    seq_type = "-TYPE=DNA"
    if clustalwpara is None:
        clustalwparas = seq_type
    else:
        clustalwparas = seq_type + " " + clustalwpara.lstrip()
    clu_path = getlocalpath()
    out_path = os.path.dirname(outdata)
    # timeformat() is defined elsewhere; presumably it appends a timestamp.
    doclu_subdir = str(timeformat('temp/rna_sequence'))
    clustalw_dir = os.path.join(out_path, doclu_subdir)
    if os.path.isdir(indata):
        file_name = os.listdir(indata)[0]
        infile = os.path.join(indata, file_name)
    elif os.path.isfile(indata):
        # Keep only the file name: joining the full path would drop
        # clustalw_dir for absolute paths and point the output at the input.
        file_name = os.path.basename(indata)
        infile = indata
    else:
        return None
    if not os.path.exists(clustalw_dir):
        os.makedirs(clustalw_dir)
    out_alg = os.path.join(clustalw_dir, file_name)
    out_file = "-OUTFILE=" + out_alg
    cmd = clu_path + "/clustalw2 " + "-INFILE=" + infile + " -OUTPUT=FASTA -ALIGN " + out_file + " " + clustalwparas
    subprocess.call(cmd, shell=True)
    logdoclustalw.info(
        "Multiple sequence alignment by Clustalw2 was completed.")
    return out_alg
def doclustalw(indata, outdata, clustalwpara):
    """
    Call clustalw2 (DNA mode) to align a single FASTA file.

    :param indata: a FASTA file, or a directory; for a directory only the first
        entry returned by ``os.listdir`` is aligned
    :param outdata: path whose parent directory hosts the result folder
    :param clustalwpara: extra clustalw2 options, or None for none
    :return: path of the aligned file, or None when ``indata`` is neither a
        directory nor a file
    """
    logdoclustalw.debug("clustalw input data:{0}".format(indata))
    seq_type = "-TYPE=DNA"
    if clustalwpara is None:
        clustalwparas = seq_type
    else:
        clustalwparas = seq_type + " " + clustalwpara.lstrip()
    clu_path = getlocalpath()
    out_path = os.path.dirname(outdata)
    # timeformat() is defined elsewhere; presumably it appends a timestamp.
    doclu_subdir = str(timeformat('temp/rna_sequence'))
    clustalw_dir = os.path.join(out_path, doclu_subdir)
    if os.path.isdir(indata):
        file_name = os.listdir(indata)[0]
        infile = os.path.join(indata, file_name)
    elif os.path.isfile(indata):
        # Keep only the file name: joining the full path would drop
        # clustalw_dir for absolute paths and point the output at the input.
        file_name = os.path.basename(indata)
        infile = indata
    else:
        return None
    if not os.path.exists(clustalw_dir):
        os.makedirs(clustalw_dir)
    out_alg = os.path.join(clustalw_dir, file_name)
    out_file = "-OUTFILE=" + out_alg
    cmd = clu_path + "/clustalw2 " + "-INFILE=" + infile + " -OUTPUT=FASTA -ALIGN " + out_file + " " + clustalwparas
    subprocess.call(cmd, shell=True)
    logdoclustalw.info("Multiple sequence alignment by Clustalw2 was completed.")
    return out_alg
def domuscle(indata, outdata, musclepara):
    """
    Call muscle to align a single FASTA file.

    :param indata: a FASTA file, or a directory; for a directory only the first
        entry returned by ``os.listdir`` is aligned
    :param outdata: path whose parent directory hosts the timestamped
        ``temp/rna_alignment<timestamp>`` result folder
    :param musclepara: extra muscle options
    :return: path of the aligned file, or None when ``indata`` is neither a
        directory nor a file
    """
    muscleparas = musclepara.lstrip()
    mupath = getlocalpath()
    out_path = os.path.dirname(outdata)
    timeinfo = str(time.strftime('%Y%m%d%H%M%S'))
    muscle_dir = os.path.join(out_path, 'temp/rna_alignment' + timeinfo)
    if os.path.isdir(indata):
        file_name = os.listdir(indata)[0]
        each_pro = os.path.join(indata, file_name)
    elif os.path.isfile(indata):
        # Keep only the file name: joining the full path would drop muscle_dir
        # for absolute paths and point the output at the input file.
        file_name = os.path.basename(indata)
        each_pro = indata
    else:
        return None
    if not os.path.exists(muscle_dir):
        os.makedirs(muscle_dir)
    out_alg = os.path.join(muscle_dir, file_name)
    cmd = mupath + "/muscle -in " + each_pro + " -out " + out_alg + " " + muscleparas
    subprocess.call(cmd, shell=True)
    logdomuscle.debug('muscle result path:{0}'.format(out_alg))
    logdomuscle.info(
        "Multiple sequence alignment by Muscle was completed.")
    return out_alg
def domafft(indata, outdata, mafftparas):
    """
    Call mafft to align a single FASTA file.

    :param indata: a FASTA file, or a directory; for a directory only the first
        entry returned by ``os.listdir`` is aligned
    :param outdata: path whose parent directory hosts the timestamped
        ``temp/rna_alignment<timestamp>`` result folder
    :param mafftparas: extra mafft options placed before the input file
    :return: path of the aligned file, or None when ``indata`` is neither a
        directory nor a file
    """
    mafftparas = mafftparas.lstrip()
    mapath = getlocalpath()
    out_path = os.path.dirname(outdata)
    timeinfo = str(time.strftime('%Y%m%d%H%M%S'))
    mafft_dir = os.path.join(out_path, 'temp/rna_alignment' + timeinfo)
    if os.path.isdir(indata):
        file_name = os.listdir(indata)[0]
        each_pro = os.path.join(indata, file_name)
    elif os.path.isfile(indata):
        # Keep only the file name: joining the full path would drop mafft_dir
        # for absolute paths, and the shell redirect below would then truncate
        # the input file itself.
        file_name = os.path.basename(indata)
        each_pro = indata
    else:
        return None
    if not os.path.exists(mafft_dir):
        os.makedirs(mafft_dir)
    out_alg = os.path.join(mafft_dir, file_name)
    cmd = mapath + "/mafft " + mafftparas + " " + each_pro + " > " + out_alg
    subprocess.call(cmd, shell=True)
    logmafft.debug('mafft result path:{0}'.format(out_alg))
    logmafft.info("Multiple sequence alignment by mafft was completed.")
    return out_alg