def runFasta(seq_dir, genomes_dir, fasta_dir):
    """
    Run FASTA on protein sequences between new genome against all in house genomes

    FASTA searches a protein or DNA sequence data bank
    version 35.04 Aug. 25, 2009
    W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448

    Every file in ``genomes_dir`` whose name contains '.faa' is treated as a
    reference genome and gets its own results sub-directory
    ``<fasta_dir>/<file name up to the first dot>``. Every file in ``seq_dir``
    whose name contains '.faa' is a query searched with ``fasta35`` against
    each reference genome.

    When the module-level ``IS_LSF`` flag is true, the query files are first
    renamed to ``mygenome_<N>.faa`` so that one job array per reference genome
    can address them through ``${LSB_JOBINDEX}``. Otherwise one ``fasta35``
    command per (query, reference) pair is run through ``util.runProcess``.

    Args:
        seq_dir: directory holding the new genome protein sequences (*.faa).
        genomes_dir: directory holding the in-house reference genomes (*.faa).
        fasta_dir: directory receiving the FASTA result files.
    """
    util.createDir(fasta_dir)
    # List of in-house genomes
    util.checkDir(genomes_dir)
    genome_files = []
    logger.info("Create fasta results directory for each in-house reference genome")
    for genome_file in os.listdir(genomes_dir):
        if '.faa' in genome_file:
            genome_files.append(genome_file)
            # Create fasta results directory for each in-house genome
            util.createDir("%s/%s" % (fasta_dir, genome_file.split(".")[0]))
            logger.info(genome_file)
    util.checkDir(seq_dir)
    if IS_LSF:
        # Rename new genome sequences for job array to be mygenome_1.faa mygenome_2.faa ...
        # The listing is sorted so the numbering does not depend on the
        # arbitrary order returned by os.listdir.
        seq_files = sorted(name for name in os.listdir(seq_dir) if '.faa' in name)
        # Size of the job array: every sequence file, renamed or not.
        seq_num = len(seq_files)
        next_index = 0
        for seq_file in seq_files:
            if 'mygenome_' in seq_file:
                # Already renamed by a previous run
                continue
            next_index += 1
            seq_newfilepath = "%s/mygenome_%s.faa" % (seq_dir, next_index)
            # Never rename onto an existing file: os.rename would silently
            # overwrite a sequence numbered by an earlier (partial) run.
            while os.path.exists(seq_newfilepath):
                next_index += 1
                seq_newfilepath = "%s/mygenome_%s.faa" % (seq_dir, next_index)
            seq_filepath = "%s/%s" % (seq_dir, seq_file)
            os.rename(seq_filepath, seq_newfilepath)
        # Submit bsub job array on mygenome_${LSB_JOBINDEX}.faa against one refgenome at a time
        bsub_dir = "bsub"
        util.checkDir(bsub_dir)
        for genome_file in genome_files:
            res_dir = "%s/%s" % (fasta_dir, genome_file.split(".")[0])
            cmd = "fasta35 -z 1 -Q -H -S -m 10 %s/mygenome_${LSB_JOBINDEX}.faa %s/%s > %s/mygenome_${LSB_JOBINDEX}.fa" % (seq_dir, genomes_dir, genome_file, res_dir)
            util.submitJobArray(jobname="genepy-fasta", jobnum=seq_num, jobdir=bsub_dir, cmd=cmd)
        util.submitJobDependency('genepy-fasta')
        logger.info("Fasta on LSF finished")
    else:
        # List of new genome sequences
        for seq_file in os.listdir(seq_dir):
            if '.faa' not in seq_file:
                continue
            res_file = seq_file.split(".")[0] + ".fa"
            for genome_file in genome_files:
                res_dir = "%s/%s" % (fasta_dir, genome_file.split(".")[0])
                cmd = "fasta35 -z 1 -Q -H -S -m 10 %s/%s %s/%s > %s/%s" % (seq_dir, seq_file, genomes_dir, genome_file, res_dir, res_file)
                util.runProcess(cmd)
            logger.info(seq_file)
        logger.info("Fasta finished")
def runReciprocalFasta(seq_dir, genome_file, fasta_dir):
    """
    Run FASTA between extracted in-house protein sequences against new genome

    FASTA searches a protein or DNA sequence data bank
    version 35.04 Aug. 25, 2009
    W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448

    Every file in ``seq_dir`` whose name contains '.faa' is a query searched
    with ``fasta35`` against ``genome_file``; results are written directly
    into ``fasta_dir``.

    When the module-level ``IS_LSF`` flag is true, the query files are first
    renamed to ``refgenome_<N>.faa`` so that a single job array can address
    them through ``${LSB_JOBINDEX}``. Otherwise one ``fasta35`` command per
    query file is run through ``util.runProcess``.

    Args:
        seq_dir: directory holding the extracted reference sequences (*.faa).
        genome_file: path of the new genome protein file searched against.
        fasta_dir: directory receiving the FASTA result files.
    """
    util.createDir(fasta_dir)
    # Check new genome
    util.checkFile(genome_file)
    # Check ref genome extracted sequences
    util.checkDir(seq_dir)
    res_dir = fasta_dir
    if IS_LSF:
        # Rename extracted sequences for job array to be refgenome_1.faa refgenome_2.faa ...
        # The listing is sorted so the numbering does not depend on the
        # arbitrary order returned by os.listdir.
        seq_files = sorted(name for name in os.listdir(seq_dir) if '.faa' in name)
        # Size of the job array: every sequence file, renamed or not.
        seq_num = len(seq_files)
        next_index = 0
        for seq_file in seq_files:
            if 'refgenome_' in seq_file:
                # Already renamed by a previous run
                continue
            next_index += 1
            seq_newfilepath = "%s/refgenome_%s.faa" % (seq_dir, next_index)
            # Never rename onto an existing file: os.rename would silently
            # overwrite a sequence numbered by an earlier (partial) run.
            while os.path.exists(seq_newfilepath):
                next_index += 1
                seq_newfilepath = "%s/refgenome_%s.faa" % (seq_dir, next_index)
            seq_filepath = "%s/%s" % (seq_dir, seq_file)
            os.rename(seq_filepath, seq_newfilepath)
        # Submit bsub job array on refgenome_${LSB_JOBINDEX}.faa against mygenome
        bsub_dir = "bsub"
        util.checkDir(bsub_dir)
        cmd = "fasta35 -z 1 -Q -H -S -m 10 %s/refgenome_${LSB_JOBINDEX}.faa %s > %s/refgenome_${LSB_JOBINDEX}.fa" % (seq_dir, genome_file, res_dir)
        util.submitJobArray(jobname="genepy-recipfasta", jobnum=seq_num, jobdir=bsub_dir, cmd=cmd)
        util.submitJobDependency('genepy-recipfasta')
        logger.info("Reciprocal Fasta on LSF finished")
    else:
        # List of inhouse extracted genome sequences
        for seq_file in os.listdir(seq_dir):
            if '.faa' not in seq_file:
                continue
            res_file = seq_file.split(".")[0] + ".fa"
            cmd = "fasta35 -z 1 -Q -H -S -m 10 %s/%s %s > %s/%s" % (seq_dir, seq_file, genome_file, res_dir, res_file)
            util.runProcess(cmd)
            logger.info(seq_file)
        logger.info("Reciprocal Fasta finished")
def runHamapScan(seq_dir, hamap_dir):
    """
    HAMAP: High-quality Automated and Manual Annotation of microbial Proteomes
    ftp download site: ftp://ftp.expasy.org/databases/hamap/

    pfscan compares a protein or nucleic acid sequence against a profile
    library. The result is an unsorted list of profile-sequence matches.
    download site: http://www.isrec.isb-sib.ch/ftp-server/pftools/pft2.3/

    Every file in ``seq_dir`` whose name contains '.faa' is scanned with
    ``pfscan -klf`` against the ``hamap/hamap.prf`` profile located next to
    this module; one ``.out`` file per sequence is written to ``hamap_dir``.

    When the module-level ``IS_LSF`` flag is true, the sequence files are
    first renamed to ``mygenome_<N>.faa`` so that a single job array can
    address them through ``${LSB_JOBINDEX}``. Otherwise one ``pfscan``
    command per sequence file is run through ``util.runProcess``.

    Args:
        seq_dir: directory holding the new genome protein sequences (*.faa).
        hamap_dir: directory receiving the pfscan result files.
    """
    util.createDir(hamap_dir)
    util.checkDir(seq_dir)
    # Resolve against the absolute module location: a bare relative __file__
    # has an empty dirname, which would otherwise yield "/hamap/hamap.prf".
    module_dir = os.path.dirname(os.path.abspath(__file__))
    hamap_profile_file = os.path.join(module_dir, "hamap", "hamap.prf")
    if IS_LSF:
        # Rename new genome sequences for job array to be mygenome_1.faa mygenome_2.faa ...
        # The listing is sorted so the numbering does not depend on the
        # arbitrary order returned by os.listdir.
        seq_files = sorted(name for name in os.listdir(seq_dir) if '.faa' in name)
        # Size of the job array: every sequence file, renamed or not.
        seq_num = len(seq_files)
        next_index = 0
        for seq_file in seq_files:
            if 'mygenome_' in seq_file:
                # Already renamed by a previous run
                continue
            next_index += 1
            seq_newfilepath = "%s/mygenome_%s.faa" % (seq_dir, next_index)
            # Never rename onto an existing file: os.rename would silently
            # overwrite a sequence numbered by an earlier (partial) run.
            while os.path.exists(seq_newfilepath):
                next_index += 1
                seq_newfilepath = "%s/mygenome_%s.faa" % (seq_dir, next_index)
            seq_filepath = "%s/%s" % (seq_dir, seq_file)
            os.rename(seq_filepath, seq_newfilepath)
        # Submit bsub job array on mygenome_${LSB_JOBINDEX}.faa against hamap profile
        bsub_dir = "bsub"
        util.checkDir(bsub_dir)
        cmd = "pfscan -klf %s/mygenome_${LSB_JOBINDEX}.faa %s > %s/mygenome_${LSB_JOBINDEX}.out" % (seq_dir, hamap_profile_file, hamap_dir)
        util.submitJobArray(jobname='genepy-hamap', jobnum=seq_num, jobdir=bsub_dir, cmd=cmd)
        util.submitJobDependency('genepy-hamap')
        logger.info("HAMAP scan on LSF finished")
    else:
        # List of new genome sequences
        for seq_file in os.listdir(seq_dir):
            if '.faa' not in seq_file:
                continue
            res_file = seq_file.split(".")[0] + ".out"
            cmd = "pfscan -klf %s/%s %s > %s/%s" % (seq_dir, seq_file, hamap_profile_file, hamap_dir, res_file)
            util.runProcess(cmd)
        logger.info("HAMAP scan finished")