示例#1
0
    def __init__(self, input_hmm: str, input_db: str, parameters: any,
                 outdir: str, **kwargs: dict):
        """
        Prepare a scan of a fasta database against an HMM profile.

        @param input_hmm: hmm profile filename
        @param input_db: name of the fasta to scan against the HMM profile
        @param parameters: Param instance
        @param outdir: output directory
        @param **kwargs: optional argument(s):
            - 'basename': desired name of the output file (its last extension is dropped)
            - 'domains': list of domain names to be fetched from input_hmm to a temporary new hmmdb
        """
        # The output name comes from 'basename' when given, otherwise from the
        # profile filename. splitext() drops only the last extension and,
        # unlike split('.')[0:-1], keeps a name that has no extension at all
        # (the former code produced an empty basename in that case).
        if 'basename' in kwargs:
            named_after = str(kwargs['basename'])
        else:
            named_after = os.path.basename(input_hmm)
        self.basename = os.path.splitext(named_after)[0]

        self.param = parameters
        self.input_db = input_db
        self.outdir = outdir

        # Insert a separator only when outdir is non-empty and lacks one, so
        # 'out' + 'name' no longer collapses into 'outname.domtblout'.
        separator = '' if not outdir or outdir.endswith('/') else '/'
        self.output = outdir + separator + self.basename + '.domtblout'

        self.hmmfetch = False
        if 'domains' in kwargs:
            # Only the requested domains are scanned: the value returned by
            # hmmfetch() replaces input_hmm as the profile to use.
            self.hmmfetch = True
            self.domains = kwargs['domains']
            self.input_hmm = hmmfetch(hmmdb=input_hmm,
                                      keys=self.domains,
                                      outdir=self.outdir)
        else:
            self.input_hmm = input_hmm

        self.hits = None

        self.logger = logHandler.Logger(name=__name__)
示例#2
0
    def __init__(self, _input: str, input_db: str, param: Param):
        """
        Initialise the state of an iterative HMM profile building run.

        @param _input: seed fasta filename
        @param input_db: name of the fasta to scan against the HMM profile for the enrichment process
        @param param: Param instance
        """
        self.input_hmmbuilder = _input
        # splitext() removes only the last extension and keeps an
        # extension-less filename intact (split('.')[0:-1] returned '' for it).
        self.basename = os.path.splitext(os.path.basename(_input))[0]
        self.input_db = input_db
        self.fasta_dict = seqio.get_fasta_dict(fasta_filename=input_db)
        self.param = param
        self.iter_index = 1
        self.outdir_base = self.param.outdirname
        self.convergence_status = Convergence(delta=self.param.delta,
                                              max_count=self.param.maxcount)
        self.outdir = None
        self.output_fasta_enriched = None
        self.is_usearch_on = True

        # Output filenames are unknown at construction time.
        self.output_fasta = None
        self.output_muscle = None
        self.output_hmm = None

        self.logger = logHandler.Logger(name=__name__)
示例#3
0
文件: path.py 项目: nchenche/cusProSe
    def __init__(self, protein: Protein):
        """
        Attach the path to a protein; the edge list starts empty.

        @param protein: instance of Protein
        """
        self.protein, self.edges = protein, list()

        self.logger = logHandler.Logger(name=__name__)
示例#4
0
    def __init__(self, rule: rule_parser.Rule):
        """
        Keep a rule together with an initially empty list of proteins.

        @param rule: instance of Rule
        """
        self.rule, self.proteins = rule, list()

        # no colors assigned yet
        self.domain_colors = None

        self.logger = logHandler.Logger(name=__name__)
示例#5
0
    def __init__(self, param):
        """
        Start with an empty list and a 'results/' directory under the run output directory.

        @param param: instance of prosecda.lib.parameters
        """
        self.param, self.list = param, list()
        self.outdir = ''.join((param.outdirname, 'results/'))

        self.logger = logHandler.Logger(name=__name__)
示例#6
0
    def __init__(self, _input: str, outdir: str) -> None:
        """
        Store the input fasta and derive the path of the '.clw' output file.

        @param _input: multiple fasta filename
        @param outdir: output directory (non-empty); a trailing '/' is appended when missing
        """

        self.input = _input
        self.outdir = outdir if outdir[-1] == '/' else outdir + '/'

        # Use the normalised directory: concatenating the raw `outdir` glued
        # the directory and file names together when the '/' was missing.
        # splitext() also keeps the stem of an extension-less input filename.
        stem = os.path.splitext(os.path.basename(self.input))[0]
        self.output = self.outdir + stem + '.clw'

        self.logger = logHandler.Logger(name=__name__)
示例#7
0
    def __init__(self, delta=1, max_count=3):
        """
        Set up the convergence tracker in its initial, non-converged state.

        @param delta: difference in sequences number used to consider a non-significant change between compared fasta files
        @param max_count: maximum number of times a non-significant change (delta) is accepted before considering a convergence
        """
        # nothing compared yet
        self.number_iter_i = self.number_iter_j = None
        self.is_converged = False

        # user-defined thresholds
        self.delta, self.max_count = delta, max_count
        self.counter = 0

        self.logger = logHandler.Logger(name=__name__)
示例#8
0
    def __init__(self, _input: str, identity: float, outdir: str) -> None:
        """
        Store the clustering settings and derive the path of the '_nr.fa' output file.

        @param _input: multiple fasta filename
        @param identity: maximum identity threshold used to cluster sequences
        @param outdir: output directory (non-empty); a trailing '/' is appended when missing
        """

        self.input = _input
        self.identity = identity
        self.outdir = outdir if outdir[-1] == '/' else outdir + '/'

        # Use the normalised directory: concatenating the raw `outdir` glued
        # the directory and file names together when the '/' was missing.
        # splitext() also keeps the stem of an extension-less input filename.
        stem = os.path.splitext(os.path.basename(self.input))[0]
        self.output = self.outdir + stem + '_nr.fa'

        self.logger = logHandler.Logger(name=__name__)
示例#9
0
    def __init__(self, _input: str, name: str, outdir: str) -> None:
        """
        Store the build settings and derive the path of the '.hmm' output file.

        @param _input: clustalw filename
        @param name: HMM profile output name
        @param outdir: output directory (non-empty); a trailing '/' is appended when missing
        """

        self.input = _input
        self.name = name
        self.outdir = outdir if outdir[-1] == '/' else outdir + '/'

        # Use the normalised directory: concatenating the raw `outdir` glued
        # the directory and file names together when the '/' was missing.
        # splitext() also keeps the stem of an extension-less input filename.
        stem = os.path.splitext(os.path.basename(self.input))[0]
        self.output = self.outdir + stem + '.hmm'

        self.logger = logHandler.Logger(name=__name__)
示例#10
0
    def __init__(self, name: str, rule_def: dict, co_ival=None):
        """
        Record a rule definition and derive its mandatory and forbidden domains.

        @param name: name of the rule (i.e. the protein "family" name)
        @param rule_def: dictionary containing criteria to define the rule; must hold a 'COMMENT' key
        @param co_ival: cutoff threshold of hmmer i-evalue
        """
        self.name, self.rule_def = name, rule_def
        self.comment = rule_def['COMMENT']
        self.co_ival = co_ival

        # The parsers run only once the attributes above are set, and in this order.
        self.mandatory_domains = self.parse_mandatory()
        self.forbidden_domains = self.parse_forbidden()

        self.logger = logHandler.Logger(name=__name__)
示例#11
0
def main():
    """
    Run the iterative HMM builder from the command-line arguments.

    When the fasta input is a directory, every '*.fa' file it contains is
    handed to run_iter_ondir(); otherwise a single IterHmmBuilder is run on
    the given fasta file.
    """
    param = parameters.Param(args=parameters.get_args())
    logger = logHandler.Logger(name='main', outpath=param.outdirname)
    param.description()

    if os.path.isdir(param.fasta_fname):
        # Keep the list returned by glob: a generator expression is always
        # truthy, so the emptiness check below could never skip a directory
        # that holds no '*.fa' file.
        fasta_filenames = glob.glob(param.fasta_fname + '/' + '*.fa')
        if fasta_filenames:
            run_iter_ondir(fasta_filenames=fasta_filenames,
                           param=param,
                           logger=logger)
    else:
        iterhmmbuilder = pipeline.IterHmmBuilder(_input=param.fasta_fname,
                                                 input_db=param.protdb,
                                                 param=param)
        logger.title("Running iterHmmBuilder")
        iterhmmbuilder.run()
示例#12
0
    def __init__(self, name: str, domains: list, check_duplicates=False):
        """
        Describe a protein through the domains found on it.

        @param name: name/id of the protein
        @param domains: list of domains (HmmerDomTbl instances) composing the protein
        @param check_duplicates: when true, the domains are first passed through rm_duplicates()
        """

        self.name = name
        self.domains = (self.rm_duplicates(domains=domains)
                        if check_duplicates else domains)
        self.architectures = list()
        self.best_architecture = None
        # the length is read from the first domain of the list as received
        self.length = domains[0].tlen
        self.sequence = None

        self.logger = logHandler.Logger(name=__name__)
示例#13
0
    def __init__(self, args):
        """
        Copy the command-line settings and create the run output directory.

        @param args: parsed arguments; the attributes fa, name, protdb, id, cov, cval, ival, acc, delta, maxcount and out are read
        """
        self.fasta_fname = args.fa
        # Default profile name: the fasta filename without its last extension.
        # splitext() keeps an extension-less filename, whereas
        # split('.')[0:-1] produced an empty name for it.
        self.hmm_name = args.name or os.path.splitext(
            os.path.basename(args.fa))[0]
        self.protdb = args.protdb
        self.id = args.id
        self.cov = args.cov
        self.cval = args.cval
        self.ival = args.ival
        self.acc = args.acc
        self.delta = args.delta
        self.maxcount = args.maxcount

        # The run directory lives under args.out and embeds the module-level `date`.
        outpath = args.out if args.out[-1] == '/' else args.out + '/'
        default_mainname = 'iterhmmbuild_' + date + '/'
        self.outdirname = outpath + default_mainname
        os.makedirs(self.outdirname, exist_ok=True)

        self.logger = logHandler.Logger(name=__name__)
示例#14
0
def main():
    """
    Run the whole workflow: hmmsearch, architecture search, then rule matching.
    """
    param = parameters.Param(parameters.get_arguments())
    logger = logHandler.Logger(name='prosecda', outpath=param.outdirname)
    param.description()

    rules = rule_parser.Parser(input_filename=param.yamlrules,
                               co_ival=param.ival)
    rules.description()

    # Runs hmmsearch and gets hits from its output (.domtblout format)
    logger.title('Running hmmsearch...')
    proteome = param.proteome_filename
    hmmsearch = external.HmmSearch(input_hmm=param.hmmdb,
                                   input_db=proteome,
                                   parameters=param,
                                   outdir=param.outdirname,
                                   basename=os.path.basename(proteome),
                                   domains=rules.list_alldomains())
    hmmsearch.run()
    proteins = hmmsearch.get_proteins()

    # only the sequences of the retrieved proteins are read
    hit_names = [hit.name for hit in proteins]
    fasta_dict = seqio.get_fasta_dict(fasta_filename=param.proteome_filename,
                                      protein_ids=hit_names)

    logger.title('Searching for possible domain architectures...')
    for protein in proteins:
        # take the sequence and drop its entry from fasta_dict in one step
        protein.sequence = fasta_dict.pop(protein.name)

        architecture_path = path.Path(protein=protein)
        architecture_path.search()
        protein.set_best_architecture()

    logger.title('Searching for proteins matching rules...')
    matches = matching.Matches(param=param)
    matches.search(rules=rules.rules, proteins=proteins)
    matches.report()
示例#15
0
    def __init__(self, args):
        """
        Copy the command-line settings, read the proteome and create the run output directory.

        @param args: return of argparse.ArgumentParser.parse_args()
        """
        self.proteome_filename = args.proteome
        self.fasta_dict = seqio.read_fasta(sequences=args.proteome)
        self.hmmdb, self.yamlrules = args.hmmdb, args.rules

        # run directory: <out>/prosecda_<date>/
        outpath = args.out
        if outpath[-1] != '/':
            outpath += '/'
        run_dirname = 'prosecda_' + date + '/'
        self.outdirname = outpath + run_dirname
        os.makedirs(self.outdirname, exist_ok=True)

        # thresholds and options taken as given on the command line
        self.score_co = args.score
        self.cov = args.cov
        self.cval, self.ival = args.cevalue, args.ievalue
        self.acc = args.acc
        self.nopdf = args.nopdf

        self.logger = logHandler.Logger(name=__name__)
示例#16
0
    def __init__(self, input_filename: str, co_ival=None):
        """
        Parse the rules file with parse_yaml() and keep the result in self.rules.

        @param input_filename: filename handed to parse_yaml()
        @param co_ival: value forwarded to parse_yaml() as co_ival
        """
        self.input_filename, self.co_ival = input_filename, co_ival
        self.rules = parse_yaml(input_filename=self.input_filename,
                                co_ival=self.co_ival)

        self.logger = logHandler.Logger(name=__name__)
示例#17
0
# -*- coding: utf-8 -*-
# import shutil
import lib.logHandler as logHandler

# Module-level logger, named after this module.
logger = logHandler.Logger(name=__name__)


def get_fasta_dict(fasta_filename: str = 'name', protein_ids: list = None):
    """

    Parse a protein fasta file to return a dictionary with protein ids as keys and sequences as values.

    @param fasta_filename: fasta filename
    @param protein_ids: optional, list of protein ids to read, ids not in this list will not be considered
    @return: a dictionary (keys=proteins ids, values=sequence)
    """
    fasta_dict = {}
    with open(fasta_filename, 'r') as fasta_file:
        if not protein_ids:
            for line in fasta_file:
                if line.startswith('>'):
                    protein_id = line.split()[0].split('>')[-1]
                    if protein_id not in fasta_dict:
                        fasta_dict[protein_id] = ''
                else:
                    sequence = line.strip().replace('*', '')
                    fasta_dict[protein_id] += sequence
        else:
            for line in fasta_file:
                if line.startswith('>'):
                    protein_id = line.split()[0].split('>')[-1]