예제 #1
0
파일: annotate.py 프로젝트: pajanne/rococo
def main():
    """
    Run the selected annotator on every genome listed in the input list file.

    The annotator name is read from ``options.run`` and the list file path
    from ``options.list``, both taken from the object returned by mainUsage().
    Each useful line of the list file has the form
    ``common_name||sequence_file``; lines starting with '!' and lines without
    '||' are skipped.

    For each entry the annotator's doRun() is launched through a
    ``python -c`` command line: submitted as a job named
    ``<common_name>.<ANNOTATOR_EXTENSION[script]>`` on the queue
    ``ANNOTATOR_QUEUE[script]`` when util.isLsf() is true, run as a local
    process otherwise.

    Any exception raised while processing is logged with log.error and not
    re-raised, so one failing entry stops the remaining ones.
    """
    options = mainUsage()

    script = options.run
    checkDoRun(script)

    try:
        # Read organism common name and related fasta sequence file
        list_file = options.list
        util.checkFile(list_file)
        # Context manager so the list file is closed even when an entry fails
        with open(list_file, "r") as list_handle:
            for line in list_handle:
                # Lines starting with '!' are comments
                if line.startswith('!'):
                    continue
                if line.count('||') < 1:
                    continue
                # ! common_name||sequence_file
                common_name, input_file = util.splitLine(line)
                util.checkFile(input_file)
                # Run command
                cmd = 'python -c "from genepy.annotators.%s import doRun; doRun()" -o %s -i %s' % (script, common_name, input_file)
                if util.isLsf():
                    job_name = "%s.%s" % (common_name, ANNOTATOR_EXTENSION[script])
                    util.submitJob(job_name, cmd, ANNOTATOR_QUEUE[script])
                else:
                    util.runProcess(cmd)
    except Exception as e:
        log.error(e)
예제 #2
0
def fasta2embl(infasta):
    """
    Transform sequence file format in fasta to embl using EMBOSS seqret
    Returns the name of created embl file

    The output file name is the part of infasta before its first '.'
    followed by ".embl". infasta is validated with util.checkFile first.

    Usage: seqret
    Online documentation: http://emboss.open-bio.org/wiki/Appdoc:Seqret

      Standard (Mandatory) qualifiers:
      [-sequence]          seqall     (Gapped) sequence(s) filename and optional
                                      format, or reference (input USA)
      [-outseq]            seqoutall  [<sequence>.<format>] Sequence set(s)
                                      filename and optional format (output USA)

      The basic USA syntax is one of:
        "file"
        "file:entry"
        "format::file"
        "format::file:entry"
        "database:entry"
        "database"
        "@file"
    """
    util.checkFile(infasta)
    outembl = infasta.split(".")[0] + ".embl"
    # Create EMBOSS seqret command line
    cmd = "seqret -sequence fasta::%s -outseq embl::%s " % (infasta, outembl)
    # Call the subprocess using convenience method
    util.runProcess(cmd)
    # Build one formatted message: passing the pieces as separate positional
    # arguments does not put the file name into the log output
    logger.info("File %s created" % outembl)
    return outembl
예제 #3
0
def splitSeq(dir, embl, type):
    """
    Extract features of the given type from an EMBL file into the directory
    dir, using EMBOSS extractfeat.

    Online documentation: http://emboss.open-bio.org/wiki/Appdoc:Extractfeat

    Arguments:
      dir   output directory, created through util.createDir
      embl  input sequence file in EMBL format, validated with util.checkFile
      type  value handed to the extractfeat -type qualifier; according to the
            EMBOSS usage it may be wildcarded with '*' and several types can
            be separated with '|'

    Qualifiers set on the extractfeat command line:
      -sequence embl::<embl>   input sequence (USA syntax "format::file")
      -type <type>             feature type to extract
      -featinname YES          add the feature type to the ID name of each
                               output sequence
      -outseq fasta::          write the output in fasta format
      -osextension2 ffn        associated -outseq qualifier, set to "ffn"
      -ossingle2 Yes           separate file for each entry
      -osdirectory2 <dir>      associated -outseq qualifier, set to dir
    """
    util.checkFile(embl)
    # Make sure the output directory is there before extractfeat writes to it
    util.createDir(dir)
    extract_template = "extractfeat -sequence embl::%s -type %s -featinname YES -outseq fasta:: -osextension2 ffn -ossingle2 Yes -osdirectory2 %s"
    extract_cmd = extract_template % (embl, type, dir)
    util.runProcess(extract_cmd)
    logger.info("Sequences extracted into %s" % dir)
예제 #4
0
def concatSeq(genome_file, dir):
    """
    Merge every *.faa file found in dir into the single file genome_file.

    dir is validated with util.checkDir. An already existing genome_file is
    deleted before the shell ``cat`` command is run.
    """
    util.checkDir(dir)
    # Start from a clean slate: drop the output of a previous run
    previous_output_present = os.path.exists(genome_file)
    if previous_output_present:
        os.remove(genome_file)
    concat_cmd = "cat %s/*.faa > %s" % (dir, genome_file)
    util.runProcess(concat_cmd)
    logger.info("concatSeq finished")
예제 #5
0
def runFasta(seq_dir, genomes_dir, fasta_dir):
    """
    Run fasta35 for the new genome protein sequences in seq_dir against every
    in-house genome found in genomes_dir.

    FASTA searches a protein or DNA sequence data bank
     version 35.04 Aug. 25, 2009
     W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448

    Only entries whose name contains '.faa' are considered, in both
    directories. Results go to fasta_dir/<genome>/, where <genome> is the
    genome file name up to its first '.'.

    When IS_LSF is set, the query files are renamed to mygenome_<n>.faa and
    one job array per in-house genome is submitted; otherwise every
    query/genome pair is run as a local process.
    """
    util.createDir(fasta_dir)
    # Collect the in-house genomes
    util.checkDir(genomes_dir)
    reference_genomes = []
    logger.info("Create fasta results directory for each in-house reference genome")
    for candidate in os.listdir(genomes_dir):
        if '.faa' not in candidate:
            continue
        reference_genomes.append(candidate)
        # One results directory per in-house genome
        util.createDir("%s/%s" % (fasta_dir, candidate.split(".")[0]))
        logger.info(candidate)

    util.checkDir(seq_dir)

    if not IS_LSF:
        # Local run: every new genome sequence against every in-house genome
        for query_name in os.listdir(seq_dir):
            if '.faa' not in query_name:
                continue
            result_name = query_name.split(".")[0] + ".fa"
            for reference in reference_genomes:
                result_dir = "%s/%s" % (fasta_dir, reference.split(".")[0])
                fasta_cmd = "fasta35 -z 1 -Q -H -S -m 10 %s/%s %s/%s > %s/%s" % (seq_dir, query_name, genomes_dir, reference, result_dir, result_name)
                util.runProcess(fasta_cmd)
            logger.info(query_name)
        logger.info("Fasta finished")
        return

    # Job arrays address their input by index, so the new genome sequences
    # are renamed to mygenome_1.faa mygenome_2.faa ...
    query_count = 0
    for query_name in os.listdir(seq_dir):
        if '.faa' not in query_name:
            continue
        query_count += 1
        # Already carries the job array name: counted but left untouched
        if 'mygenome_' in query_name:
            continue
        current_path = "%s/%s" % (seq_dir, query_name)
        indexed_path = "%s/mygenome_%s.faa" % (seq_dir, query_count)
        os.rename(current_path, indexed_path)

    # Submit bsub job array on mygenome_${LSB_JOBINDEX}.faa against one refgenome at a time
    job_dir = "bsub"
    util.checkDir(job_dir)
    for reference in reference_genomes:
        result_dir = "%s/%s" % (fasta_dir, reference.split(".")[0])
        array_cmd = "fasta35 -z 1 -Q -H -S -m 10 %s/mygenome_${LSB_JOBINDEX}.faa %s/%s > %s/mygenome_${LSB_JOBINDEX}.fa" % (seq_dir, genomes_dir, reference, result_dir)
        util.submitJobArray(jobname="genepy-fasta", jobnum=query_count, jobdir=job_dir, cmd=array_cmd)
    util.submitJobDependency('genepy-fasta')
    logger.info("Fasta on LSF finished")
예제 #6
0
def translateSeq(dir):
    """
    Translate the nucleic acid fasta files of a directory into protein
    sequences with EMBOSS transeq.

    Online documentation: http://emboss.open-bio.org/wiki/Appdoc:Transeq

    Every entry of dir whose name contains '.ffn' is translated; the result
    is written next to it, named after the part of the input name before its
    first '.' with the extension ".faa".

    Qualifiers set on the transeq command line:
      -sequence fasta::<dir>/<input>   nucleotide sequence (USA syntax
                                       "format::file")
      -outseq fasta::<dir>/<output>    protein sequence output
      -table 11                        genetic code to use; 11 is listed as
                                       (Bacterial) in the transeq usage
    """
    util.checkDir(dir)
    for entry_name in os.listdir(dir):
        if '.ffn' not in entry_name:
            continue
        protein_name = entry_name.split(".")[0] + ".faa"
        transeq_cmd = "transeq -sequence fasta::%s/%s -outseq fasta::%s/%s -table 11" % (dir, entry_name, dir, protein_name)
        util.runProcess(transeq_cmd)
    logger.info("Sequences translated.")
예제 #7
0
def concatFeatures(embl, features):
    """
    Concat CDS features in embl format into embl sequence file
      - the first two lines of embl sequence containing ID & XX lines
      - the CDS features file containing FT lines
      - the rest of embl sequence containing SQ lines
    Returns the name of created embl sequence file

    Both input files are validated with util.checkFile. The output file name
    is the part of embl before its first '.' followed by "_with_cds.embl".
    """
    util.checkFile(embl)
    util.checkFile(features)
    outembl = embl.split(".")[0] + "_with_cds.embl"
    # Header lines of the sequence file, then the feature table
    head_cmd = "head -n 2 %s > %s; cat %s >> %s;" % (embl, outembl, features, outembl)
    util.runProcess(head_cmd)
    # Remaining lines of the sequence file, from line 3 onwards. "-n +3" is
    # the portable spelling of the obsolete "tail +3"; appending directly
    # avoids a scratch file named "tail" in the working directory.
    tail_cmd = "tail -n +3 %s >> %s;" % (embl, outembl)
    util.runProcess(tail_cmd)
    # Build one formatted message: passing the pieces as separate positional
    # arguments does not put the file name into the log output
    logger.info("File %s created" % outembl)
    return outembl
예제 #8
0
def runReciprocalFasta(seq_dir, genome_file, fasta_dir):
    """
    Run fasta35 for the extracted in-house protein sequences in seq_dir
    against the new genome file genome_file.

    FASTA searches a protein or DNA sequence data bank
     version 35.04 Aug. 25, 2009
     W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448

    Only entries of seq_dir whose name contains '.faa' are considered. All
    results are written directly into fasta_dir.

    When IS_LSF is set, the query files are renamed to refgenome_<n>.faa and
    a single job array is submitted; otherwise each query is run as a local
    process.
    """
    util.createDir(fasta_dir)
    # The new genome must exist
    util.checkFile(genome_file)
    # So must the directory of extracted reference sequences
    util.checkDir(seq_dir)
    result_dir = fasta_dir

    if not IS_LSF:
        # Local run over the in-house extracted genome sequences
        for query_name in os.listdir(seq_dir):
            if '.faa' not in query_name:
                continue
            result_name = query_name.split(".")[0] + ".fa"
            fasta_cmd = "fasta35 -z 1 -Q -H -S -m 10 %s/%s %s > %s/%s" % (seq_dir, query_name, genome_file, result_dir, result_name)
            util.runProcess(fasta_cmd)
            logger.info(query_name)
        logger.info("Reciprocal Fasta finished")
        return

    # Job arrays address their input by index, so the sequences are renamed
    # to refgenome_1.faa refgenome_2.faa ...
    query_count = 0
    for query_name in os.listdir(seq_dir):
        if '.faa' not in query_name:
            continue
        query_count += 1
        # Already carries the job array name: counted but left untouched
        if 'refgenome_' in query_name:
            continue
        current_path = "%s/%s" % (seq_dir, query_name)
        indexed_path = "%s/refgenome_%s.faa" % (seq_dir, query_count)
        os.rename(current_path, indexed_path)

    # Submit bsub job array on refgenome_${LSB_JOBINDEX}.faa against mygenome
    job_dir = "bsub"
    util.checkDir(job_dir)
    array_cmd = "fasta35 -z 1 -Q -H -S -m 10 %s/refgenome_${LSB_JOBINDEX}.faa %s > %s/refgenome_${LSB_JOBINDEX}.fa" % (seq_dir, genome_file, result_dir)
    util.submitJobArray(jobname="genepy-recipfasta", jobnum=query_count, jobdir=job_dir, cmd=array_cmd)
    util.submitJobDependency('genepy-recipfasta')
    logger.info("Reciprocal Fasta on LSF finished")
예제 #9
0
def runHamapScan(seq_dir, hamap_dir):
    """
    Scan the protein sequences in seq_dir against the HAMAP profile library
    with pfscan, writing one .out file per sequence into hamap_dir.

    HAMAP: High-quality Automated and Manual Annotation of microbial Proteomes
    ftp download site: ftp://ftp.expasy.org/databases/hamap/

    pfscan compares a protein or nucleic acid sequence against a profile
    library. The result is an unsorted list of profile-sequence matches.
    download site: http://www.isrec.isb-sib.ch/ftp-server/pftools/pft2.3/

    The profile file is hamap/hamap.prf, located relative to the directory
    of this module. Only entries of seq_dir whose name contains '.faa' are
    considered.

    When IS_LSF is set, the query files are renamed to mygenome_<n>.faa and
    a single job array is submitted; otherwise each query is run as a local
    process.
    """
    util.createDir(hamap_dir)
    util.checkDir(seq_dir)
    module_dir = os.path.dirname(__file__)
    profile_path = "%s/hamap/hamap.prf" % module_dir

    if not IS_LSF:
        # Local run over the new genome sequences
        for query_name in os.listdir(seq_dir):
            if '.faa' not in query_name:
                continue
            result_name = query_name.split(".")[0] + ".out"
            scan_cmd = "pfscan -klf %s/%s %s > %s/%s" % (seq_dir, query_name, profile_path, hamap_dir, result_name)
            util.runProcess(scan_cmd)
        logger.info("HAMAP scan finished")
        return

    # Job arrays address their input by index, so the new genome sequences
    # are renamed to mygenome_1.faa mygenome_2.faa ...
    query_count = 0
    for query_name in os.listdir(seq_dir):
        if '.faa' not in query_name:
            continue
        query_count += 1
        # Already carries the job array name: counted but left untouched
        if 'mygenome_' in query_name:
            continue
        current_path = "%s/%s" % (seq_dir, query_name)
        indexed_path = "%s/mygenome_%s.faa" % (seq_dir, query_count)
        os.rename(current_path, indexed_path)

    # Submit bsub job array on mygenome_${LSB_JOBINDEX}.faa against hamap profile
    job_dir = "bsub"
    util.checkDir(job_dir)
    array_cmd = "pfscan -klf %s/mygenome_${LSB_JOBINDEX}.faa %s > %s/mygenome_${LSB_JOBINDEX}.out" % (seq_dir, profile_path, hamap_dir)
    util.submitJobArray(jobname='genepy-hamap', jobnum=query_count, jobdir=job_dir, cmd=array_cmd)
    util.submitJobDependency('genepy-hamap')
    logger.info("HAMAP scan on LSF finished")