def main():
    """
    Run the selected annotator on every organism listed in the input list file.

    The annotator name comes from options.run and is validated by checkDoRun().
    The list file (options.list) holds one 'common_name||sequence_file' entry
    per line; lines starting with '!' and lines without '||' are skipped.
    For each entry the annotator's doRun() is started through 'python -c',
    either as a submitted job when util.isLsf() is true or as a local process.
    Any exception raised while processing is logged and not propagated.
    """
    options = mainUsage()
    script = options.run
    checkDoRun(script)
    try:
        # Read organism common name and related fasta sequence file
        list_file = options.list
        util.checkFile(list_file)
        # 'with' guarantees the list file is closed, even if a job fails
        with open(list_file, "r") as list_handle:
            for line in list_handle:
                # Lines starting with '!' are comments
                if line[0] == '!':
                    continue
                # Lines without the '||' separator carry no entry
                if line.count('||') < 1:
                    continue
                # ! common_name||sequence_file
                common_name, input_file = util.splitLine(line)
                util.checkFile(input_file)
                # Run command
                cmd = 'python -c "from genepy.annotators.%s import doRun; doRun()" -o %s -i %s' % (script, common_name, input_file)
                if util.isLsf():
                    job_name = "%s.%s" % (common_name, ANNOTATOR_EXTENSION[script])
                    util.submitJob(job_name, cmd, ANNOTATOR_QUEUE[script])
                else:
                    util.runProcess(cmd)
    except Exception as e:
        log.error(e)
def fasta2embl(infasta):
    """
    Transform sequence file format in fasta to embl using EMBOSS seqret

    Usage: seqret
    Online documentation: http://emboss.open-bio.org/wiki/Appdoc:Seqret
      Standard (Mandatory) qualifiers:
      [-sequence] seqall     (Gapped) sequence(s) filename and optional
                             format, or reference (input USA)
      [-outseq]   seqoutall  [<sequence>.<format>] Sequence set(s) filename
                             and optional format (output USA)
    The basic USA syntax is one of:
      "file" "file:entry" "format::file" "format::file:entry"
      "database:entry" "database" "@file"

    infasta -- path of the fasta file to convert; checked with util.checkFile

    Returns the name of created embl file, built from the part of infasta
    located before its first '.' followed by '.embl'
    """
    util.checkFile(infasta)
    # NOTE: everything after the FIRST dot is dropped, including dots found in directory names
    outembl = infasta.split(".")[0] + ".embl"
    # Create EMBOSS seqret command line
    cmd = "seqret -sequence fasta::%s -outseq embl::%s " % (infasta, outembl)
    # Call the subprocess using convenience method
    util.runProcess(cmd)
    # Format the message before logging: extra positional arguments are only
    # valid when the message contains matching % placeholders
    logger.info("File %s created" % outembl)
    return outembl
def splitSeq(dir, embl, type):
    """
    Extract the features of one type from an embl file into separate fasta
    files written under dir/, using EMBOSS extractfeat

    Online documentation: http://emboss.open-bio.org/wiki/Appdoc:Extractfeat

    Qualifiers passed on the command line built here:
      -sequence embl::<embl>   input sequence, read as embl
      -type <type>             feature type to extract
      -featinname YES          add the feature type to the output sequence ID
      -outseq fasta::          output written as fasta
      -osextension2 ffn        extension of the output files
      -ossingle2 Yes           separate file for each entry
      -osdirectory2 <dir>      output directory

    dir  -- output directory, created with util.createDir
    embl -- input embl file, checked with util.checkFile
    type -- feature type handed to extractfeat
    """
    # Input must be there before anything gets created
    util.checkFile(embl)
    # Output directory
    util.createDir(dir)
    extract_args = (embl, type, dir)
    extract_cmd = "extractfeat -sequence embl::%s -type %s -featinname YES -outseq fasta:: -osextension2 ffn -ossingle2 Yes -osdirectory2 %s" % extract_args
    util.runProcess(extract_cmd)
    logger.info("Sequences extracted into %s" % dir)
def concatSeq(genome_file, dir):
    """
    Merge every .faa file found in dir into the single file genome_file

    genome_file -- path of the merged file; removed first when it already exists
    dir         -- directory holding the .faa files, checked with util.checkDir
    """
    util.checkDir(dir)
    # Drop the output of any previous run
    already_there = os.path.exists(genome_file)
    if already_there:
        os.remove(genome_file)
    # Shell glob + redirection do the actual concatenation
    cat_cmd = "cat %s/*.faa > %s" % (dir, genome_file)
    util.runProcess(cat_cmd)
    logger.info("concatSeq finished")
def runFasta(seq_dir, genomes_dir, fasta_dir):
    """
    Run FASTA (fasta35) on each protein sequence of the new genome against
    every in-house genome

    FASTA searches a protein or DNA sequence data bank
    version 35.04 Aug. 25, 2009
    W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448

    seq_dir     -- directory of the new genome .faa sequence files
    genomes_dir -- directory of the in-house genome .faa files
    fasta_dir   -- results directory; one sub-directory per in-house genome

    When IS_LSF is set the sequence files are renamed mygenome_<n>.faa and
    one job array per in-house genome is submitted; otherwise fasta35 is run
    locally for every sequence/genome pair.
    """
    util.createDir(fasta_dir)

    # In-house genomes: every directory entry whose name contains '.faa'
    util.checkDir(genomes_dir)
    logger.info("Create fasta results directory for each in-house reference genome")
    genome_files = [entry for entry in os.listdir(genomes_dir) if '.faa' in entry]
    for reference in genome_files:
        # One results directory per in-house genome
        util.createDir("%s/%s" % (fasta_dir, reference.split(".")[0]))
        logger.info(reference)

    util.checkDir(seq_dir)

    if not IS_LSF:
        # Local run: every new genome sequence against every in-house genome
        for query in os.listdir(seq_dir):
            if '.faa' not in query:
                continue
            res_file = query.split(".")[0] + ".fa"
            for reference in genome_files:
                res_dir = "%s/%s" % (fasta_dir, reference.split(".")[0])
                cmd = "fasta35 -z 1 -Q -H -S -m 10 %s/%s %s/%s > %s/%s" % (seq_dir, query, genomes_dir, reference, res_dir, res_file)
                util.runProcess(cmd)
            logger.info(query)
        logger.info("Fasta finished")
        return

    # Rename new genome sequences for job array to be mygenome_1.faa mygenome_2.faa ...
    # Every .faa file takes a slot in the numbering, already renamed ones included
    queries = [entry for entry in os.listdir(seq_dir) if '.faa' in entry]
    for position, query in enumerate(queries, 1):
        if 'mygenome_' in query:
            continue
        old_path = "%s/%s" % (seq_dir, query)
        new_path = "%s/mygenome_%s.faa" % (seq_dir, position)
        os.rename(old_path, new_path)
    seq_num = len(queries)

    # Submit bsub job array on mygenome_${LSB_JOBINDEX}.faa against one refgenome at a time
    bsub_dir = "bsub"
    util.checkDir(bsub_dir)
    for reference in genome_files:
        res_dir = "%s/%s" % (fasta_dir, reference.split(".")[0])
        cmd = "fasta35 -z 1 -Q -H -S -m 10 %s/mygenome_${LSB_JOBINDEX}.faa %s/%s > %s/mygenome_${LSB_JOBINDEX}.fa" % (seq_dir, genomes_dir, reference, res_dir)
        util.submitJobArray(jobname="genepy-fasta", jobnum=seq_num, jobdir=bsub_dir, cmd=cmd)
    util.submitJobDependency('genepy-fasta')
    logger.info("Fasta on LSF finished")
def translateSeq(dir):
    """
    Translate each nucleic acid fasta file (.ffn) of dir into a protein
    fasta file (.faa) in the same directory using EMBOSS transeq

    Online documentation: http://emboss.open-bio.org/wiki/Appdoc:Transeq

    Qualifiers passed on the command line built here:
      -sequence fasta::<dir>/<name>.ffn   nucleotide input
      -outseq fasta::<dir>/<name>.faa     protein output
      -table 11                           genetic code 11 (Bacterial)

    dir -- directory scanned for .ffn files, checked with util.checkDir
    """
    util.checkDir(dir)
    for entry in os.listdir(dir):
        # Only names containing '.ffn' are translated
        if '.ffn' not in entry:
            continue
        # Output name keeps what precedes the first dot of the input name
        protein_name = entry.split(".")[0] + ".faa"
        transeq_cmd = "transeq -sequence fasta::%s/%s -outseq fasta::%s/%s -table 11" % (dir, entry, dir, protein_name)
        util.runProcess(transeq_cmd)
    logger.info("Sequences translated.")
def concatFeatures(embl, features):
    """
    Concat CDS features in embl format into embl sequence file
    - the first two lines of embl sequence containing ID & XX lines
    - the CDS features file containing FT lines
    - the rest of embl sequence containing SQ lines

    embl     -- embl sequence file, checked with util.checkFile
    features -- features file, checked with util.checkFile

    Returns the name of created embl sequence file, built from the part of
    embl located before its first '.' followed by '_with_cds.embl'
    """
    util.checkFile(embl)
    util.checkFile(features)
    outembl = embl.split(".")[0] + "_with_cds.embl"
    # Header lines of the sequence file, then the features
    head_cmd = "head -n 2 %s > %s; cat %s >> %s;" % (embl, outembl, features, outembl)
    util.runProcess(head_cmd)
    # Remainder of the sequence file, from line 3 onwards.
    # 'tail -n +3' is the POSIX form; the obsolete 'tail +3' is read as a file
    # name by conforming implementations. Appending directly also avoids the
    # fixed-name scratch file 'tail' in the working directory.
    tail_cmd = "tail -n +3 %s >> %s" % (embl, outembl)
    util.runProcess(tail_cmd)
    # Format the message before logging: extra positional arguments are only
    # valid when the message contains matching % placeholders
    logger.info("File %s created" % outembl)
    return outembl
def runReciprocalFasta(seq_dir, genome_file, fasta_dir):
    """
    Run FASTA (fasta35) on each extracted in-house protein sequence against
    the new genome

    FASTA searches a protein or DNA sequence data bank
    version 35.04 Aug. 25, 2009
    W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448

    seq_dir     -- directory of the extracted in-house .faa sequence files
    genome_file -- new genome file, checked with util.checkFile
    fasta_dir   -- results directory, created with util.createDir

    When IS_LSF is set the sequence files are renamed refgenome_<n>.faa and
    a single job array is submitted; otherwise fasta35 is run locally for
    every sequence file.
    """
    util.createDir(fasta_dir)
    # New genome
    util.checkFile(genome_file)
    # Extracted sequences of the reference genome
    util.checkDir(seq_dir)

    if not IS_LSF:
        # Local run over the in-house extracted genome sequences
        for query in os.listdir(seq_dir):
            if '.faa' not in query:
                continue
            res_file = query.split(".")[0] + ".fa"
            cmd = "fasta35 -z 1 -Q -H -S -m 10 %s/%s %s > %s/%s" % (seq_dir, query, genome_file, fasta_dir, res_file)
            util.runProcess(cmd)
            logger.info(query)
        logger.info("Reciprocal Fasta finished")
        return

    # Rename sequences for job array to be refgenome_1.faa refgenome_2.faa ...
    # Every .faa file takes a slot in the numbering, already renamed ones included
    queries = [entry for entry in os.listdir(seq_dir) if '.faa' in entry]
    for position, query in enumerate(queries, 1):
        if 'refgenome_' in query:
            continue
        old_path = "%s/%s" % (seq_dir, query)
        new_path = "%s/refgenome_%s.faa" % (seq_dir, position)
        os.rename(old_path, new_path)
    seq_num = len(queries)

    # Submit bsub job array on refgenome_${LSB_JOBINDEX}.faa against mygenome
    bsub_dir = "bsub"
    util.checkDir(bsub_dir)
    cmd = "fasta35 -z 1 -Q -H -S -m 10 %s/refgenome_${LSB_JOBINDEX}.faa %s > %s/refgenome_${LSB_JOBINDEX}.fa" % (seq_dir, genome_file, fasta_dir)
    util.submitJobArray(jobname="genepy-recipfasta", jobnum=seq_num, jobdir=bsub_dir, cmd=cmd)
    util.submitJobDependency('genepy-recipfasta')
    logger.info("Reciprocal Fasta on LSF finished")
def runHamapScan(seq_dir, hamap_dir):
    """
    Scan each protein sequence of the new genome against the HAMAP profile
    library with pfscan

    HAMAP: High-quality Automated and Manual Annotation of microbial Proteomes
    ftp download site: ftp://ftp.expasy.org/databases/hamap/

    pfscan compares a protein or nucleic acid sequence against a profile
    library. The result is an unsorted list of profile-sequence matches.
    download site: http://www.isrec.isb-sib.ch/ftp-server/pftools/pft2.3/

    seq_dir   -- directory of the new genome .faa sequence files
    hamap_dir -- results directory, created with util.createDir

    When IS_LSF is set the sequence files are renamed mygenome_<n>.faa and
    a single job array is submitted; otherwise pfscan is run locally for
    every sequence file.
    """
    util.createDir(hamap_dir)
    util.checkDir(seq_dir)
    # Profile library is looked up next to this module
    module_dir = os.path.dirname(__file__)
    hamap_profile_file = "%s/hamap/hamap.prf" % module_dir

    if not IS_LSF:
        # Local run over the new genome sequences
        for query in os.listdir(seq_dir):
            if '.faa' not in query:
                continue
            res_file = query.split(".")[0] + ".out"
            cmd = "pfscan -klf %s/%s %s > %s/%s" % (seq_dir, query, hamap_profile_file, hamap_dir, res_file)
            util.runProcess(cmd)
        logger.info("HAMAP scan finished")
        return

    # Rename new genome sequences for job array to be mygenome_1.faa mygenome_2.faa ...
    # Every .faa file takes a slot in the numbering, already renamed ones included
    queries = [entry for entry in os.listdir(seq_dir) if '.faa' in entry]
    for position, query in enumerate(queries, 1):
        if 'mygenome_' in query:
            continue
        old_path = "%s/%s" % (seq_dir, query)
        new_path = "%s/mygenome_%s.faa" % (seq_dir, position)
        os.rename(old_path, new_path)
    seq_num = len(queries)

    # Submit bsub job array on mygenome_${LSB_JOBINDEX}.faa against hamap profile
    bsub_dir = "bsub"
    util.checkDir(bsub_dir)
    cmd = "pfscan -klf %s/mygenome_${LSB_JOBINDEX}.faa %s > %s/mygenome_${LSB_JOBINDEX}.out" % (seq_dir, hamap_profile_file, hamap_dir)
    util.submitJobArray(jobname='genepy-hamap', jobnum=seq_num, jobdir=bsub_dir, cmd=cmd)
    util.submitJobDependency('genepy-hamap')
    logger.info("HAMAP scan on LSF finished")