예제 #1
0
    def __init__(self, user=None, passwd=None, host=None, port=None, check=True):
        '''
        Resolve the locations of the external tools and databases.

        All five arguments are forwarded unchanged to ConfigurationReader;
        every path/value stored on the instance is read through that reader.

        @param user:   forwarded to ConfigurationReader
        @param passwd: forwarded to ConfigurationReader
        @param host:   forwarded to ConfigurationReader
        @param port:   forwarded to ConfigurationReader
        @param check:  forwarded to ConfigurationReader
        '''
        self.configReader = ConfigurationReader(user, passwd, host, port, check)

        # blast tools
        self.fastacmd = self.configReader.get_path('fastacmd')
        # This used to read the 'mafft' entry, which left self.blastall
        # pointing at the mafft location; the 'blastall' entry is the one
        # that is also used for self.blastp just below.
        self.blastall = self.configReader.get_path('blastall')

        # blastp is run through the blastall binary; str() keeps the
        # concatenation valid when the reader hands back a number.
        self.blastp = self.configReader.get_path('blastall')
        self.blastp += " -p blastp -e " + str(self.configReader.get_value('blastp_e_value'))

        # ensembl database
        self.ensembldb = self.configReader.get_path('ensembl_fasta')

        # Smith-Waterman
        self.sw_sharp = self.configReader.get_path('sw#')
        # hacked blast, to be used by swsharp to align exons while respecting the boundaries
        self.blosum_matrix = "{0}/{1}".format(self.configReader.get_path('resources'),
                                              self.configReader.get_value('blosum_hacked'))
        # mafft
        self.mafft = self.configReader.get_path('mafft')
예제 #2
0
    def __init__(self, user=None, passwd=None, host=None, port=None, check=True):
        """
        Resolve the locations of the external tools and databases.

        The arguments are handed straight to ConfigurationReader. Most of the
        resolved paths are passed to self._check_exists immediately after
        being read (the blastp command line, the blosum matrix and the three
        maxent scan entries are not).
        """
        reader = ConfigurationReader(user, passwd, host, port, check)
        self.configReader = reader

        # --- blast tools ---
        self.fastacmd = reader.get_path("fastacmd")
        self._check_exists(self.fastacmd)

        self.blastall = reader.get_path("blastall")
        self._check_exists(self.blastall)

        # blastp is invoked through the blastall binary with a fixed e-value
        blast_binary = reader.get_path("blastall")
        e_value = str(reader.get_value("blastp_e_value"))
        self.blastp = blast_binary + " -p blastp -e " + e_value

        # --- ensembl database ---
        self.ensembldb = reader.get_path("ensembl_fasta")
        self._check_exists(self.ensembldb)

        # --- Smith-Waterman ---
        self.sw_sharp = reader.get_path("sw#")
        self._check_exists(self.sw_sharp)

        # --- usearch ---
        self.usearch = reader.get_path("usearch")
        self._check_exists(self.usearch)

        # hacked blast, to be used by swsharp to align exons while respecting the boundaries
        resources_dir = reader.get_path("resources")
        matrix_name = reader.get_value("blosum_hacked")
        self.blosum_matrix = "{0}/{1}".format(resources_dir, matrix_name)

        # --- mafft ---
        self.mafft = reader.get_path("mafft")
        self._check_exists(self.mafft)

        # --- maxent scan ---
        self.scan3 = reader.get_path("score3")
        self.scan5 = reader.get_path("score5")
        self.maxent_homedir = reader.get_path("maxentscan")
예제 #3
0
class AlignmentCommandGenerator(object):
    '''
    Builds command-line strings for the external utilities that are used
    (blastall/blastp, fastacmd, formatdb, Smith-Waterman via sw#, mafft).

    The generate_* methods only return strings; nothing is executed here.
    '''

    def __init__(self, user=None, passwd=None, host=None, port=None, check=True):
        '''
        Resolve the locations of the external tools and databases.

        All five arguments are forwarded unchanged to ConfigurationReader;
        every path/value stored on the instance is read through that reader.
        '''
        self.configReader = ConfigurationReader(user, passwd, host, port, check)

        # blast tools
        self.fastacmd = self.configReader.get_path('fastacmd')
        # This used to read the 'mafft' entry, which left self.blastall
        # pointing at the mafft location.
        self.blastall = self.configReader.get_path('blastall')

        # blastp is run through the blastall binary; str() keeps the
        # concatenation valid when the reader hands back a number.
        self.blastp = self.configReader.get_path('blastall')
        self.blastp += " -p blastp -e " + str(self.configReader.get_value('blastp_e_value'))

        # ensembl database
        self.ensembldb = self.configReader.get_path('ensembl_fasta')

        # Smith-Waterman
        self.sw_sharp = self.configReader.get_path('sw#')
        # hacked blast, to be used by swsharp to align exons while respecting the boundaries
        self.blosum_matrix = "{0}/{1}".format(self.configReader.get_path('resources'),
                                              self.configReader.get_value('blosum_hacked'))
        # mafft
        self.mafft = self.configReader.get_path('mafft')

    def generate_fastacmd_gene_command(self, species, seq_name, fasta_db_file,
                                       strand=None,
                                       sequence_start=None, sequence_stop=None,
                                       output_file_path=None):
        '''
        Build a fastacmd call that extracts (part of) a nucleotide sequence.

        @param species: directory name under the ensembl fasta root
        @param seq_name: sequence identifier; when isinteger() accepts it,
                         it is wrapped as 'lcl|<id>'
        @param fasta_db_file: database file name inside <species>/dna
        @param strand: None or 1 gives "-S 1", anything else gives "-S 2"
        @param sequence_start: with sequence_stop, emitted as "-L start,stop";
                               the range is only added when both are truthy
        @param sequence_stop: see sequence_start
        @param output_file_path: optional "-o" target
        @return: the command line as a string
        '''
        # isinteger is a helper that lives outside this class
        if isinteger(seq_name):
            seq_id_cmd = "-s 'lcl|%s' " % seq_name
        else:
            seq_id_cmd = "-s %s" % seq_name

        data_type_cmd = "-p F"
        database = "-d {0}/{1}/dna/{2}".format(self.ensembldb, species, fasta_db_file)

        if strand is None or int(strand) == 1:
            strand_cmd = "-S 1"
        else:
            strand_cmd = "-S 2"

        if sequence_start and sequence_stop:
            location_cmd = "-L %s,%s" % (sequence_start, sequence_stop)
        else:
            location_cmd = ""

        if output_file_path:
            output_cmd = "-o %s" % output_file_path
        else:
            output_cmd = ""

        fastacmd_cmd_line = self.fastacmd
        fastacmd_cmd_line += " {0} {1} {2} {3} {4} {5}".format(database, seq_id_cmd,
                                                               data_type_cmd, strand_cmd,
                                                               location_cmd, output_cmd)
        return fastacmd_cmd_line

    def generate_fastacmd_plain(self, database, seq_id, output_file_path=''):
        '''
        Build a fastacmd call with only database, sequence id and an
        optional output file.

        @return: the command line as a string
        '''
        database_cmd = "-d %s" % database
        if isinteger(seq_id):
            seq_id_cmd = "-s 'lcl|%s' " % seq_id
        else:
            seq_id_cmd = "-s %s" % seq_id
        if output_file_path:
            output_cmd = "-o %s" % output_file_path
        else:
            output_cmd = ""

        fastacmd_cmd_line = self.fastacmd
        fastacmd_cmd_line += " {0} {1} {2} ".format(database_cmd, seq_id_cmd, output_cmd)
        return fastacmd_cmd_line

    def generate_fastacmd_protein_command(self, protein_id, species_name, protein_type, output_file_path):
        '''
        Build a fastacmd call that extracts one protein from the species'
        peptide database (located via _generate_proteindb_file_name).

        @return: the command line as a string
        '''
        data_type_cmd = "-p T"
        prot_id_cmd = "-s %s" % protein_id
        database = "-d %s" % self._generate_proteindb_file_name(species_name, protein_type)
        if output_file_path:
            output_cmd = "-o %s" % output_file_path
        else:
            output_cmd = ""
        # Use the configured binary, like the other fastacmd builders, instead
        # of relying on a bare "fastacmd" being on PATH.
        return "{0} {1} {2} {3} {4}".format(self.fastacmd, prot_id_cmd, data_type_cmd,
                                            database, output_cmd)

    def generate_blastp_plain(self, database, input_file, output_file):
        '''
        Same as generate_blastp_command, with the "-m 8" output-format flag
        appended to the blastp invocation.
        '''
        blastp = self.blastp + " -m 8 "
        if output_file:
            cmd = "{0} -d {1} -i {2} -o {3}".format(blastp, database, input_file, output_file)
        else:
            cmd = "{0} -d {1} -i {2} ".format(blastp, database, input_file)

        return cmd

    def generate_blastp_command(self, database, input_file, output_file):
        '''
        Build a blastp call; "-o" is only added when output_file is truthy.
        '''
        if output_file:
            cmd = "{0} -d {1} -i {2} -o {3}".format(self.blastp, database, input_file, output_file)
        else:
            cmd = "{0} -d {1} -i {2} ".format(self.blastp, database, input_file)

        return cmd

    def generate_SW_nt(self, query_sequence_file, target_fasta_db_file,
                       output_file, supress_stdout=True):
        '''
        Build a sw# call for nucleotide sequences; stdout is redirected to
        /dev/null unless supress_stdout is false.
        '''
        # Matija's current implementation is switching the order
        cmd = "{0} -j {1} -i {2} --out {3}".format(self.sw_sharp, query_sequence_file,
                                                   target_fasta_db_file, output_file)
        if supress_stdout:
            cmd += " > /dev/null"
        return cmd

    def generate_SW_peptide(self, query_sequence_file, target_fasta_db_file,
                            output_file=None):
        '''
        Build a sw# call for peptide sequences using the configured
        (hacked) blosum matrix file.
        '''
        cmd = "{0} --verbose 0 --matrix-file {1}  ".format(self.sw_sharp, self.blosum_matrix)
        cmd += " -j {0} -i {1} ".format(query_sequence_file, target_fasta_db_file)
        cmd += " --out-type 1 --gap-open 3.0   "
        if output_file:
            cmd += " --out {0} ".format(output_file)
        return cmd

    def generate_formatdb_command(self, input_db_file, sequence_type):
        '''
        Build a formatdb call; "-p T" for sequence_type "protein" or "P",
        "-p F" for everything else.
        '''
        if sequence_type == "protein" or sequence_type == "P":
            cmd = "formatdb -i {0} -p T".format(input_db_file)
        else:
            cmd = "formatdb -i {0} -p F".format(input_db_file)

        return cmd

    def generate_mafft_command(self, input_file, output_file=None):
        '''
        If there are unusual characters (e.g., U as selenocysteine in protein sequence),
        use the --anysymbol option. It accepts any printable characters (U, O, #, $, %, etc.;
        0x21-0x7e in the ASCII code), except for > (0x3e).
        They are scored equivalently to X.  Gap is - (0x2d), as in the default mode.
        '''
        if output_file:
            return "{0} --quiet --anysymbol {1} > {2}".format(self.mafft, input_file, output_file)
        else:
            return "{0} --quiet --anysymbol {1} ".format(self.mafft, input_file)

    def _generate_genedb_file_name(self, species, sequence_type, sequence_id, masked):
        '''
        @param species: species name
        @param sequence_type: scaffold / chromosome...
        @param sequence_id: ensembl sequence ID
        @param masked: 0 if dna should not be masked, 1 if it should
        @return: path of the dna fasta file; when the expected file is missing
                 and a *dna.toplevel.fa file exists in the same directory,
                 that file is returned instead
        @raise IndexError: when no file containing ".dna" is in the directory
        '''
        # Local import: the module-level import block is not visible from here.
        import glob

        dna_dir = "{0}/{1}/dna".format(self.ensembldb, species.lower())
        # get the template name (dependent on the assembly);
        # the last matching directory entry wins
        tmp_file = ""
        for f in os.listdir(dna_dir):
            if re.search(r"\.dna", f):
                tmp_file = f
        m = re.findall(r"(.*)\.dna", tmp_file)
        if not m:
            raise IndexError("no .dna file found in %s" % dna_dir)

        if masked != 0:
            file_name = "%s/%s.dna_rm." % (dna_dir, m[0])
        else:
            file_name = "%s/%s.dna." % (dna_dir, m[0])
        if sequence_type == 'chromosome':
            file_name = "%schromosome.%s.fa" % (file_name, sequence_id)
        else:
            file_name = "%stoplevel.fa" % (file_name)

        if not os.path.exists(file_name):
            # find *dna.toplevel.fa file; glob yields nothing when there is no
            # match, whereas the output of a failing "ls" would be an error
            # message that ended up being returned as the file name
            candidates = sorted(glob.glob(dna_dir + "/*dna.toplevel.fa"))
            if candidates:
                file_name = candidates[0]
            # otherwise just let the thing fail, I have no better idea
        return file_name

    def _generate_proteindb_file_name(self, species, protein_type):
        '''
        @param species: species name (ensembl)
        @param protein_type: all / abinitio
        @return: protein database name
        @raise IndexError: when no file containing ".pep" is in the directory
        '''
        pep_dir = "%s/%s/pep" % (self.ensembldb, species.lower())
        # the last matching directory entry wins
        tmp_file = ""
        for f in os.listdir(pep_dir):
            if re.search(r"\.pep", f):
                tmp_file = f
        m = re.findall(r"(.*)\.pep", tmp_file)
        if not m:
            raise IndexError("no .pep file found in %s" % pep_dir)

        if protein_type == "all":
            file_name = "%s/%s.pep.all.fa" % (pep_dir, m[0])
        else:
            file_name = "%s/%s.pep.abinitio.fa" % (pep_dir, m[0])

        return file_name
예제 #4
0
class AlignmentCommandGenerator(object):
    """
    Builds command-line strings for the external utilities that are used
    (blastall/blastp, fastacmd, formatdb, usearch, Smith-Waterman via sw#,
    mafft, maxent scan).

    The generate_* methods only return strings; nothing is executed here.
    """

    def _check_exists(self, item):
        """
        Print a message and terminate with exit status 1 when the path
        `item` does not exist; do nothing otherwise.
        """
        if not os.path.exists(item):
            # A single pre-formatted argument prints the same text under the
            # Python 2 print statement and the Python 3 print function.
            print("{0}  not found ".format(item))
            # SystemExit is what exit(1) raises, without depending on the
            # site module having installed the exit() helper.
            raise SystemExit(1)

    def __init__(self, user=None, passwd=None, host=None, port=None, check=True):
        """
        Resolve the locations of the external tools and databases.

        All five arguments are forwarded unchanged to ConfigurationReader.
        Most resolved paths are verified with _check_exists, which terminates
        the process when a path is missing.
        """
        self.configReader = ConfigurationReader(user, passwd, host, port, check)
        # blast tools
        self.fastacmd = self.configReader.get_path("fastacmd")
        self._check_exists(self.fastacmd)

        self.blastall = self.configReader.get_path("blastall")
        self._check_exists(self.blastall)

        # blastp is run through the blastall binary
        self.blastp = self.configReader.get_path("blastall")
        self.blastp += " -p blastp -e " + str(self.configReader.get_value("blastp_e_value"))

        # ensembl database
        self.ensembldb = self.configReader.get_path("ensembl_fasta")
        self._check_exists(self.ensembldb)

        # Smith-Waterman
        self.sw_sharp = self.configReader.get_path("sw#")
        self._check_exists(self.sw_sharp)

        # usearch
        self.usearch = self.configReader.get_path("usearch")
        self._check_exists(self.usearch)

        # hacked blast, to be used by swsharp to align exons while respecting the boundaries
        self.blosum_matrix = "{0}/{1}".format(
            self.configReader.get_path("resources"), self.configReader.get_value("blosum_hacked")
        )
        # mafft
        self.mafft = self.configReader.get_path("mafft")
        self._check_exists(self.mafft)

        # maxent scan
        self.scan3 = self.configReader.get_path("score3")
        self.scan5 = self.configReader.get_path("score5")
        self.maxent_homedir = self.configReader.get_path("maxentscan")

    def generate_fastacmd_gene_command(
        self,
        species,
        seq_name,
        fasta_db_file,
        strand=None,
        sequence_start=None,
        sequence_stop=None,
        output_file_path=None,
    ):
        """
        Build a fastacmd call that extracts (part of) a nucleotide sequence.

        @param species: directory name under the ensembl fasta root
        @param seq_name: sequence identifier (string or integer); an
                         all-digit identifier is wrapped as 'lcl|<id>'
        @param fasta_db_file: database file name inside <species>/dna
        @param strand: None or 1 gives "-S 1", anything else gives "-S 2"
        @param sequence_start: "-L start,stop" is added when at least one of
                               start/stop is truthy; a missing side is left empty
        @param sequence_stop: see sequence_start
        @param output_file_path: optional "-o" target
        @return: the command line as a string
        """
        # str() lets integer identifiers through; calling .isdigit() on an
        # int raised AttributeError.
        if str(seq_name).isdigit():
            seq_id_cmd = "-s 'lcl|%s' " % seq_name
        else:
            seq_id_cmd = "-s %s" % seq_name

        data_type_cmd = "-p F"
        database = "-d {0}/{1}/dna/{2}".format(self.ensembldb, species, fasta_db_file)

        if strand is None or int(strand) == 1:
            strand_cmd = "-S 1"
        else:
            strand_cmd = "-S 2"

        if not sequence_start and not sequence_stop:
            location_cmd = ""
        else:
            location_cmd = "-L %s,%s" % (
                sequence_start if sequence_start else "",
                sequence_stop if sequence_stop else "",
            )

        if output_file_path:
            output_cmd = "-o %s" % output_file_path
        else:
            output_cmd = ""

        fastacmd_cmd_line = self.fastacmd
        fastacmd_cmd_line += " {0} {1} {2} {3} {4} {5}".format(
            database, seq_id_cmd, data_type_cmd, strand_cmd, location_cmd, output_cmd
        )
        return fastacmd_cmd_line

    def generate_fastacmd_plain(self, database, seq_id, output_file_path=""):
        """
        Build a fastacmd call with only database, sequence id and an
        optional output file.

        @return: the command line as a string
        """
        database_cmd = "-d %s" % database
        # isinteger is a helper that lives outside this class
        if isinteger(seq_id):
            seq_id_cmd = "-s 'lcl|%s' " % seq_id
        else:
            seq_id_cmd = "-s %s" % seq_id
        if output_file_path:
            output_cmd = "-o %s" % output_file_path
        else:
            output_cmd = ""

        fastacmd_cmd_line = self.fastacmd
        fastacmd_cmd_line += " {0} {1} {2} ".format(database_cmd, seq_id_cmd, output_cmd)
        return fastacmd_cmd_line

    def generate_fastacmd_protein_command(self, protein_id, species_name, protein_type, output_file_path):
        """
        Build a fastacmd call that extracts one protein from the species'
        peptide database (located via _generate_proteindb_file_name).

        @return: the command line as a string
        """
        data_type_cmd = "-p T"
        prot_id_cmd = "-s %s" % protein_id
        database = "-d %s" % self._generate_proteindb_file_name(species_name, protein_type)
        if output_file_path:
            output_cmd = "-o %s" % output_file_path
        else:
            output_cmd = ""
        # Use the configured (and existence-checked) binary, like the other
        # fastacmd builders, instead of a bare "fastacmd" looked up on PATH.
        return "{0} {1} {2} {3} {4}".format(
            self.fastacmd, prot_id_cmd, data_type_cmd, database, output_cmd
        )

    def generate_blastp_plain(self, database, input_file, output_file):
        """
        Same as generate_blastp_command, with the "-m 8" output-format flag
        appended to the blastp invocation.
        """
        blastp = self.blastp + " -m 8 "
        if output_file:
            cmd = "{0} -d {1} -i {2} -o {3}".format(blastp, database, input_file, output_file)
        else:
            cmd = "{0} -d {1} -i {2} ".format(blastp, database, input_file)

        return cmd

    def generate_blastp_command(self, database, input_file, output_file):
        """
        Build a blastp call; "-o" is only added when output_file is truthy.
        """
        if output_file:
            cmd = "{0} -d {1} -i {2} -o {3}".format(self.blastp, database, input_file, output_file)
        else:
            cmd = "{0} -d {1} -i {2} ".format(self.blastp, database, input_file)

        return cmd

    def generate_usearch_nt(
        self, query_sequence_file, target_fasta_db_file, output_file, identity="0.5", strand="plus"
    ):
        """
        Build a single-threaded usearch -search_local call.

        @param identity: value passed to -id
        @param strand: value passed to -strand ("plus"; can also be "both")
        """
        cmd = "{0} -threads 1   -search_local {1}  -db {2}  -alnout {3} -id {4} -strand {5}  ".format(
            self.usearch, query_sequence_file, target_fasta_db_file, output_file, identity, strand
        )
        return cmd

    def generate_SW_nt(self, query_sequence_file, target_fasta_db_file, output_file=None, supress_stdout=False):
        """
        Build a sw# call for nucleotide sequences. "--out" is added when
        output_file is not None; stdout is sent to /dev/null when
        supress_stdout is true.
        """
        # Matija's current implementation is switching the order
        cmd = "{0} --verbose 0 -j {1} -i {2}".format(self.sw_sharp, query_sequence_file, target_fasta_db_file)
        if output_file is not None:
            cmd += " --out " + output_file
        if supress_stdout:
            cmd += " > /dev/null"
        return cmd

    def generate_SW_peptide(self, query_sequence_file, target_fasta_db_file, output_file=None):
        """
        Build a sw# call for peptide sequences using the configured
        (hacked) blosum matrix file.
        """
        cmd = "{0} --verbose 0 --matrix-file {1}  ".format(self.sw_sharp, self.blosum_matrix)
        cmd += " -j {0} -i {1} ".format(query_sequence_file, target_fasta_db_file)
        cmd += " --out-type 1 --gap-open 3.0   "
        if output_file:
            cmd += " --out {0} ".format(output_file)
        return cmd

    def generate_formatdb_command(self, input_db_file, sequence_type):
        """
        Build a formatdb call; "-p T" for sequence_type "protein" or "P",
        "-p F" for everything else.
        """
        if sequence_type == "protein" or sequence_type == "P":
            cmd = "formatdb -i {0} -p T".format(input_db_file)
        else:
            cmd = "formatdb -i {0} -p F".format(input_db_file)

        return cmd

    def generate_mafft_command(self, input_file, output_file=None):
        """
        If there are unusual characters (e.g., U as selenocysteine in protein sequence),
        use the --anysymbol option. It accepts any printable characters (U, O, #, $, %, etc.;
        0x21-0x7e in the ASCII code), except for > (0x3e).
        They are scored equivalently to X.  Gap is - (0x2d), as in the default mode.
        """
        if output_file:
            return "{0} --quiet --anysymbol {1} > {2}".format(self.mafft, input_file, output_file)
        else:
            return "{0} --quiet --anysymbol {1} ".format(self.mafft, input_file)

    def generate_mafft_profile(self, input_file_1, input_file_2, output_file=None):
        """
        Build a call to "<mafft path>-profile" on two input files, optionally
        redirecting its output into output_file.
        """
        if output_file:
            return "{0}-profile {1} {2} > {3}".format(self.mafft, input_file_1, input_file_2, output_file)
        else:
            return "{0}-profile {1} {2} ".format(self.mafft, input_file_1, input_file_2)

    def generate_maxentscan_cmd(self, intron_side, input_file):
        """
        Build a maxent scan call for the 3' (intron_side == 3) or 5'
        (intron_side == 5) scorer; any other value yields an empty string.
        """
        if intron_side == 3:
            return "{0}  {1} {2}  ".format(self.scan3, self.maxent_homedir, input_file)
        elif intron_side == 5:
            return "{0}  {1} {2}  ".format(self.scan5, self.maxent_homedir, input_file)
        else:
            return ""

    def _generate_genedb_file_name(self, species, sequence_type, sequence_id, masked):
        """
        @param species: species name
        @param sequence_type: scaffold / chromosome...
        @param sequence_id: ensembl sequence ID
        @param masked: 0 if dna should not be masked, 1 if it should
        @return: path of the dna fasta file; when the expected file is missing
                 and a *dna.toplevel.fa file exists in the same directory,
                 that file is returned instead
        @raise IndexError: when no file containing ".dna" is in the directory
        """
        # Local import: the module-level import block is not visible from here.
        import glob

        dna_dir = "{0}/{1}/dna".format(self.ensembldb, species.lower())
        # get the template name (dependent on the assembly);
        # the last matching directory entry wins
        tmp_file = ""
        for f in os.listdir(dna_dir):
            if re.search(r"\.dna", f):
                tmp_file = f
        m = re.findall(r"(.*)\.dna", tmp_file)
        if not m:
            raise IndexError("no .dna file found in %s" % dna_dir)

        if masked != 0:
            file_name = "%s/%s.dna_rm." % (dna_dir, m[0])
        else:
            file_name = "%s/%s.dna." % (dna_dir, m[0])
        if sequence_type == "chromosome":
            file_name = "%schromosome.%s.fa" % (file_name, sequence_id)
        else:
            file_name = "%stoplevel.fa" % (file_name)

        if not os.path.exists(file_name):
            # find *dna.toplevel.fa file; glob yields nothing when there is no
            # match, whereas the output of a failing "ls" would be an error
            # message that ended up being returned as the file name
            candidates = sorted(glob.glob(dna_dir + "/*dna.toplevel.fa"))
            if candidates:
                file_name = candidates[0]
            # otherwise just let the thing fail, I have no better idea
        return file_name

    def _generate_proteindb_file_name(self, species, protein_type):
        """
        @param species: species name (ensembl)
        @param protein_type: all / abinitio
        @return: protein database name
        @raise IndexError: when no file containing ".pep" is in the directory
        """
        pep_dir = "%s/%s/pep" % (self.ensembldb, species.lower())
        # the last matching directory entry wins
        tmp_file = ""
        for f in os.listdir(pep_dir):
            if re.search(r"\.pep", f):
                tmp_file = f
        m = re.findall(r"(.*)\.pep", tmp_file)
        if not m:
            raise IndexError("no .pep file found in %s" % pep_dir)

        if protein_type == "all":
            file_name = "%s/%s.pep.all.fa" % (pep_dir, m[0])
        else:
            file_name = "%s/%s.pep.abinitio.fa" % (pep_dir, m[0])

        return file_name