예제 #1
0
def runFasta(seq_dir, genomes_dir, fasta_dir):
    """
    Run FASTA on protein sequences between new genome against all in house genomes
    
    FASTA searches a protein or DNA sequence data bank
     version 35.04 Aug. 25, 2009
     W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448
    """
    util.createDir(fasta_dir)
    # List of in-house genomes
    util.checkDir(genomes_dir)
    genome_files = []
    logger.info("Create fasta results directory for each in-house reference genome")
    for genome_file in os.listdir(genomes_dir):
        if '.faa' in genome_file:
            genome_files.append(genome_file)
            # Create fasta results directory for each in-house genome
            util.createDir("%s/%s" % (fasta_dir, genome_file.split(".")[0]))
            logger.info(genome_file)

    util.checkDir(seq_dir)
    if IS_LSF:
        # Rename new genome sequences for job array to be mygenome_1.faa mygenome_2.faa ...
        seq_num = 0
        for seq_file in os.listdir(seq_dir):
            if not '.faa' in seq_file:
                continue
            seq_num += 1
            if 'mygenome_' in seq_file and '.faa' in seq_file:
                continue
            seq_newfilepath = "%s/mygenome_%s.faa" % (seq_dir, seq_num)
            seq_filepath = "%s/%s" % (seq_dir, seq_file)
            os.rename(seq_filepath, seq_newfilepath)
        # Submit bsub job array on mygenome_${LSB_JOBINDEX}.faa against one refgenome at a time
        bsub_dir = "bsub"
        util.checkDir(bsub_dir)
        for genome_file in genome_files:
            res_dir = "%s/%s" % (fasta_dir, genome_file.split(".")[0])
            cmd = "fasta35 -z 1 -Q -H -S -m 10 %s/mygenome_${LSB_JOBINDEX}.faa %s/%s > %s/mygenome_${LSB_JOBINDEX}.fa" % (seq_dir, genomes_dir, genome_file, res_dir)
            util.submitJobArray(jobname="genepy-fasta", jobnum=seq_num, jobdir=bsub_dir, cmd=cmd)
        util.submitJobDependency('genepy-fasta')
        logger.info("Fasta on LSF finished")
    else:
        # List of new genome sequences
        for seq_file in os.listdir(seq_dir):
            if not '.faa' in seq_file:
                continue
            res_file = seq_file.split(".")[0] + ".fa"
            for genome_file in genome_files:
                res_dir = "%s/%s" % (fasta_dir, genome_file.split(".")[0])
                cmd = "fasta35 -z 1 -Q -H -S -m 10 %s/%s %s/%s > %s/%s" % (seq_dir, seq_file, genomes_dir, genome_file, res_dir, res_file)
                util.runProcess(cmd)
            logger.info(seq_file)
        logger.info("Fasta finished")
예제 #2
0
def runReciprocalFasta(seq_dir, genome_file, fasta_dir):
    """
    Run FASTA between extracted in-house protein sequences against new genome 
    
    FASTA searches a protein or DNA sequence data bank
     version 35.04 Aug. 25, 2009
     W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448
    """
    util.createDir(fasta_dir)
    # Check new genome
    util.checkFile(genome_file)
    # Check ref genome extracted sequences
    util.checkDir(seq_dir)
    res_dir = fasta_dir
    if IS_LSF:
        # Rename new genome sequences for job array to be refgenome_1.faa refgenome_2.faa ...
        seq_num = 0
        for seq_file in os.listdir(seq_dir):
            if not '.faa' in seq_file:
                continue
            seq_num += 1
            if 'refgenome_' in seq_file and '.faa' in seq_file:
                continue
            seq_newfilepath = "%s/refgenome_%s.faa" % (seq_dir, seq_num)
            seq_filepath = "%s/%s" % (seq_dir, seq_file)
            os.rename(seq_filepath, seq_newfilepath)
        # Submit bsub job array on refgenome_${LSB_JOBINDEX}.faa against mygenome
        bsub_dir = "bsub"
        util.checkDir(bsub_dir)
        cmd = "fasta35 -z 1 -Q -H -S -m 10 %s/refgenome_${LSB_JOBINDEX}.faa %s > %s/refgenome_${LSB_JOBINDEX}.fa" % (seq_dir, genome_file, res_dir)
        util.submitJobArray(jobname="genepy-recipfasta", jobnum=seq_num, jobdir=bsub_dir, cmd=cmd)
        util.submitJobDependency('genepy-recipfasta')
        logger.info("Reciprocal Fasta on LSF finished")
    else:
        # List of inhouse extracted genome sequences
        for seq_file in os.listdir(seq_dir):
            if not '.faa' in seq_file:
                continue
            res_file = seq_file.split(".")[0] + ".fa"
            cmd = "fasta35 -z 1 -Q -H -S -m 10 %s/%s %s > %s/%s" % (seq_dir, seq_file, genome_file, res_dir, res_file)
            util.runProcess(cmd)
            logger.info(seq_file)
        logger.info("Reciprocal Fasta finished")
예제 #3
0
def runHamapScan(seq_dir, hamap_dir):
    """
    HAMAP: High-quality Automated and Manual Annotation of microbial Proteomes
    ftp download site: ftp://ftp.expasy.org/databases/hamap/
     
    pfscan compares a protein or nucleic acid sequence against a profile 
    library. The result is an unsorted list of profile-sequence matches.
    download site: http://www.isrec.isb-sib.ch/ftp-server/pftools/pft2.3/
    """
    util.createDir(hamap_dir)
    util.checkDir(seq_dir)
    hamap_profile_file = "%s/hamap/hamap.prf" % os.path.dirname(__file__)
    if IS_LSF:
        # Rename new genome sequences for job array to be mygenome_1.faa mygenome_2.faa ...
        seq_num = 0
        for seq_file in os.listdir(seq_dir):
            if not '.faa' in seq_file:
                continue
            seq_num += 1
            if 'mygenome_' in seq_file and '.faa' in seq_file:
                continue
            seq_newfilepath = "%s/mygenome_%s.faa" % (seq_dir, seq_num)
            seq_filepath = "%s/%s" % (seq_dir, seq_file)
            os.rename(seq_filepath, seq_newfilepath)
        # Submit bsub job array on mygenome_${LSB_JOBINDEX}.faa against hamap profile
        bsub_dir = "bsub"
        util.checkDir(bsub_dir)
        cmd = "pfscan -klf %s/mygenome_${LSB_JOBINDEX}.faa %s > %s/mygenome_${LSB_JOBINDEX}.out" % (seq_dir, hamap_profile_file, hamap_dir)
        util.submitJobArray(jobname='genepy-hamap', jobnum=seq_num, jobdir=bsub_dir, cmd=cmd)
        util.submitJobDependency('genepy-hamap')
        logger.info("HAMAP scan on LSF finished")
    else:
        # List of new genome sequences
        for seq_file in os.listdir(seq_dir):
            if not '.faa' in seq_file:
                continue
            res_file = seq_file.split(".")[0] + ".out"
            cmd = "pfscan -klf %s/%s %s > %s/%s" % (seq_dir, seq_file, hamap_profile_file, hamap_dir, res_file)
            util.runProcess(cmd)
        logger.info("HAMAP scan finished")