def __init__(self, input_hmm: str, input_db: str, parameters: any, outdir: str, **kwargs: dict):
    """
    Configure an hmmsearch run of a fasta file against an HMM profile.

    @param input_hmm: hmm profile filename
    @param input_db: name of the fasta to scan against the HMM profile
    @param parameters: Param instance
    @param outdir: output directory
    @param **kwargs: optional argument(s):
                     - 'basename': desired name of the output file
                     - 'domains': list of domain names to be fetched from
                                  input_hmm to a temporary new hmmdb
    """
    # Name the output after 'basename' when given, else after the HMM file.
    if 'basename' in kwargs:
        raw_name = str(kwargs['basename'])
    else:
        raw_name = os.path.basename(input_hmm)
    # splitext() strips only the last extension and keeps an extension-less
    # name intact (joining split('.')[:-1] turned 'proteome' into '').
    self.basename = os.path.splitext(raw_name)[0]

    self.param = parameters
    self.input_db = input_db
    self.outdir = outdir

    # Join directory and file name with exactly one '/' even when the caller
    # passed the directory without a trailing slash. An empty directory is
    # left empty so the output lands in the current directory.
    prefix = outdir if not outdir or outdir.endswith('/') else outdir + '/'
    self.output = prefix + self.basename + '.domtblout'

    self.hmmfetch = False
    if 'domains' not in kwargs:
        self.input_hmm = input_hmm
    else:
        # Restrict the search to the requested domains; presumably hmmfetch()
        # returns the filename of the reduced hmmdb (defined elsewhere).
        self.hmmfetch = True
        self.domains = kwargs['domains']
        self.input_hmm = hmmfetch(hmmdb=input_hmm, keys=self.domains, outdir=self.outdir)

    self.hits = None
    self.logger = logHandler.Logger(name=__name__)
def __init__(self, _input: str, input_db: str, param: Param):
    """
    Prepare an iterative HMM-building run from a seed fasta file.

    @param _input: seed fasta filename
    @param input_db: name of the fasta to scan against the HMM profile
                     for the enrichment process
    @param param: Param instance
    """
    seed_filename = os.path.basename(_input)

    self.input_hmmbuilder = _input
    # Everything before the last '.' of the seed filename ('' if no dot).
    self.basename = seed_filename.rpartition('.')[0]

    self.input_db = input_db
    self.fasta_dict = seqio.get_fasta_dict(fasta_filename=input_db)

    self.param = param
    self.iter_index = 1
    self.outdir_base = param.outdirname
    self.convergence_status = Convergence(
        delta=param.delta,
        max_count=param.maxcount,
    )

    # Per-iteration paths start out unset.
    self.outdir = None
    self.output_fasta_enriched = None
    self.is_usearch_on = True
    self.output_fasta = None
    self.output_muscle = None
    self.output_hmm = None

    self.logger = logHandler.Logger(name=__name__)
def __init__(self, protein: Protein):
    """
    Store the protein and start with an empty list of edges.

    @param protein: instance of Protein
    """
    self.protein = protein
    self.edges = list()
    self.logger = logHandler.Logger(name=__name__)
def __init__(self, rule: rule_parser.Rule):
    """
    Store the rule; no proteins and no domain colors are set yet.

    @param rule: instance of Rule
    """
    self.rule = rule
    self.proteins = list()
    self.domain_colors = None
    self.logger = logHandler.Logger(name=__name__)
def __init__(self, param):
    """
    Start with an empty list and point the output at a 'results/'
    sub-directory of the run's output directory.

    @param param: instance of prosecda.lib.parameters
    """
    results_dir = param.outdirname + 'results/'

    self.param = param
    self.list = list()
    self.outdir = results_dir
    self.logger = logHandler.Logger(name=__name__)
def __init__(self, _input: str, outdir: str) -> None:
    """
    Store the input fasta and derive the '.clw' output path from it.

    @param _input: multiple fasta filename
    @param outdir: output directory (with or without a trailing '/';
                   an empty string means the current directory)
    """
    self.input = _input

    # Normalise to a trailing '/'. An empty string is left untouched:
    # indexing outdir[-1] on it raised IndexError.
    self.outdir = outdir if not outdir or outdir.endswith('/') else outdir + '/'

    # Build the output from the *normalised* directory: the raw argument
    # produced 'dirname.clw' whenever the trailing '/' was missing.
    # splitext() keeps an extension-less input name instead of emptying it.
    stem = os.path.splitext(os.path.basename(self.input))[0]
    self.output = self.outdir + stem + '.clw'

    self.logger = logHandler.Logger(name=__name__)
def __init__(self, delta=1, max_count=3):
    """
    Initialise the convergence tracker in its non-converged state.

    @param delta: difference in sequences number used to consider a
                  non-significant change between compared fasta files
    @param max_count: maximum number of times a non-significant change
                      (delta) is accepted before considering a convergence
    """
    # Thresholds supplied by the caller.
    self.delta, self.max_count = delta, max_count

    # Nothing has been compared yet.
    self.number_iter_i, self.number_iter_j = None, None
    self.is_converged = False
    self.counter = 0

    self.logger = logHandler.Logger(name=__name__)
def __init__(self, _input: str, identity: float, outdir: str) -> None:
    """
    Store the clustering settings and derive the '_nr.fa' output path.

    @param _input: multiple fasta filename
    @param identity: maximum identity threshold used to cluster sequences
    @param outdir: output directory (with or without a trailing '/';
                   an empty string means the current directory)
    """
    self.input = _input
    self.identity = identity

    # Normalise to a trailing '/'. An empty string is left untouched:
    # indexing outdir[-1] on it raised IndexError.
    self.outdir = outdir if not outdir or outdir.endswith('/') else outdir + '/'

    # Build the output from the *normalised* directory: the raw argument
    # produced 'dirname_nr.fa' whenever the trailing '/' was missing.
    # splitext() keeps an extension-less input name instead of emptying it.
    stem = os.path.splitext(os.path.basename(self.input))[0]
    self.output = self.outdir + stem + '_nr.fa'

    self.logger = logHandler.Logger(name=__name__)
def __init__(self, _input: str, name: str, outdir: str) -> None:
    """
    Store the HMM-building settings and derive the '.hmm' output path.

    The output filename is derived from the input filename, not from `name`.

    @param _input: clustalw filename
    @param name: HMM profile output name
    @param outdir: output directory (with or without a trailing '/';
                   an empty string means the current directory)
    """
    self.input = _input
    self.name = name

    # Normalise to a trailing '/'. An empty string is left untouched:
    # indexing outdir[-1] on it raised IndexError.
    self.outdir = outdir if not outdir or outdir.endswith('/') else outdir + '/'

    # Build the output from the *normalised* directory: the raw argument
    # produced 'dirname.hmm' whenever the trailing '/' was missing.
    # splitext() keeps an extension-less input name instead of emptying it.
    stem = os.path.splitext(os.path.basename(self.input))[0]
    self.output = self.outdir + stem + '.hmm'

    self.logger = logHandler.Logger(name=__name__)
def __init__(self, name: str, rule_def: dict, co_ival=None):
    """
    Build a rule from its raw definition.

    @param name: name of the rule (i.e. the protein "family" name)
    @param rule_def: dictionary containing criteria to define the rule;
                     must provide a 'COMMENT' key
    @param co_ival: cutoff threshold of hmmer i-evalue
    """
    comment = rule_def['COMMENT']

    self.name, self.rule_def = name, rule_def
    self.comment = comment
    self.co_ival = co_ival

    # The parse_* helpers run only after the attributes above are set,
    # and before the logger is attached (same order as before).
    self.mandatory_domains = self.parse_mandatory()
    self.forbidden_domains = self.parse_forbidden()

    self.logger = logHandler.Logger(name=__name__)
def main():
    """
    Entry point: build HMM profiles iteratively from one seed fasta file,
    or from every '*.fa' file of a directory.
    """
    param = parameters.Param(args=parameters.get_args())
    logger = logHandler.Logger(name='main', outpath=param.outdirname)

    param.description()

    if os.path.isdir(param.fasta_fname):
        # Keep the list returned by glob.glob(): wrapping it in a generator
        # expression made the emptiness test below always true, so an empty
        # directory was never detected.
        fasta_filenames = glob.glob(param.fasta_fname + '/' + '*.fa')
        if fasta_filenames:
            run_iter_ondir(fasta_filenames=fasta_filenames, param=param, logger=logger)
    else:
        iterhmmbuilder = pipeline.IterHmmBuilder(_input=param.fasta_fname,
                                                 input_db=param.protdb,
                                                 param=param)

        logger.title("Running iterHmmBuilder")
        iterhmmbuilder.run()
def __init__(self, name: str, domains: list, check_duplicates=False):
    """
    Describe a protein by its name and the domains found on it.

    @param name: name/id of the protein
    @param domains: list of domains (HmmerDomTbl instances) composing the protein
    @param check_duplicates: when true, the domains are passed through
                             rm_duplicates() before being stored
    """
    self.name = name
    self.domains = self.rm_duplicates(domains=domains) if check_duplicates else domains

    self.architectures = list()
    self.best_architecture = None

    # Read from the first domain of the list as passed in.
    first_domain = domains[0]
    self.length = first_domain.tlen

    self.sequence = None
    self.logger = logHandler.Logger(name=__name__)
def __init__(self, args):
    """
    Collect the run parameters from the parsed command-line arguments and
    create the dated output directory.

    @param args: parsed arguments; the attributes read are fa, name, protdb,
                 id, cov, cval, ival, acc, delta, maxcount and out
    """
    self.fasta_fname = args.fa

    if args.name:
        self.hmm_name = args.name
    else:
        # Default to the fasta filename without its last extension.
        # splitext() keeps an extension-less name intact, whereas joining
        # split('.')[:-1] reduced it to an empty string.
        self.hmm_name = os.path.splitext(os.path.basename(args.fa))[0]

    self.protdb = args.protdb

    self.id = args.id
    self.cov = args.cov
    self.cval = args.cval
    self.ival = args.ival
    self.acc = args.acc

    self.delta = args.delta
    self.maxcount = args.maxcount

    # Ensure a single trailing '/'. An empty path is kept empty (current
    # directory); indexing args.out[-1] on it raised IndexError.
    out = args.out
    outpath = out if not out or out.endswith('/') else out + '/'

    # `date` is a module-level name defined outside this block.
    default_mainname = 'iterhmmbuild_' + date + '/'
    self.outdirname = outpath + default_mainname
    os.makedirs(self.outdirname, exist_ok=True)

    self.logger = logHandler.Logger(name=__name__)
def main():
    """
    Entry point: parse the rules, run hmmsearch on the proteome, search the
    domain architecture of every hit protein and report the proteins that
    match the rules.
    """
    param = parameters.Param(parameters.get_arguments())
    logger = logHandler.Logger(name='prosecda', outpath=param.outdirname)
    param.description()

    rules = rule_parser.Parser(input_filename=param.yamlrules, co_ival=param.ival)
    rules.description()

    # Run hmmsearch and get the hits from its output (.domtblout format).
    logger.title('Running hmmsearch...')
    searcher = external.HmmSearch(
        input_hmm=param.hmmdb,
        input_db=param.proteome_filename,
        parameters=param,
        outdir=param.outdirname,
        basename=os.path.basename(param.proteome_filename),
        domains=rules.list_alldomains(),
    )
    searcher.run()

    # Proteins hit by the search, and the sequences of those proteins only.
    proteins = searcher.get_proteins()
    hit_ids = [hit.name for hit in proteins]
    fasta_dict = seqio.get_fasta_dict(fasta_filename=param.proteome_filename,
                                      protein_ids=hit_ids)

    logger.title('Searching for possible domain architectures...')
    for protein in proteins:
        # Move the sequence out of fasta_dict and onto the protein.
        protein.sequence = fasta_dict.pop(protein.name)

        architecture_path = path.Path(protein=protein)
        architecture_path.search()
        protein.set_best_architecture()

    logger.title('Searching for proteins matching rules...')
    matches = matching.Matches(param=param)
    matches.search(rules=rules.rules, proteins=proteins)
    matches.report()
def __init__(self, args):
    """
    Collect the run parameters from the parsed command-line arguments, read
    the proteome and create the dated output directory.

    @param args: return of argparse.ArgumentParser.parse_args()
    """
    self.proteome_filename = args.proteome
    self.fasta_dict = seqio.read_fasta(sequences=self.proteome_filename)
    self.hmmdb = args.hmmdb
    self.yamlrules = args.rules

    # Ensure a single trailing '/'. An empty path is kept empty (current
    # directory); indexing args.out[-1] on it raised IndexError.
    out = args.out
    outpath = out if not out or out.endswith('/') else out + '/'

    # `date` is a module-level name defined outside this block.
    default_mainname = 'prosecda_' + date + '/'
    self.outdirname = outpath + default_mainname
    os.makedirs(self.outdirname, exist_ok=True)

    self.score_co = args.score
    self.cov = args.cov
    self.cval = args.cevalue
    self.ival = args.ievalue
    self.acc = args.acc
    self.nopdf = args.nopdf

    self.logger = logHandler.Logger(name=__name__)
def __init__(self, input_filename: str, co_ival=None):
    """
    Parse the rules file as soon as the parser is created.

    @param input_filename: rules filename handed to parse_yaml()
    @param co_ival: cutoff threshold of hmmer i-evalue, forwarded to parse_yaml()
    """
    self.input_filename, self.co_ival = input_filename, co_ival

    parsed_rules = parse_yaml(input_filename=input_filename, co_ival=co_ival)
    self.rules = parsed_rules

    self.logger = logHandler.Logger(name=__name__)
# -*- coding: utf-8 -*- # import shutil import lib.logHandler as logHandler logger = logHandler.Logger(name=__name__) def get_fasta_dict(fasta_filename: str = 'name', protein_ids: list = None): """ Parse a protein fasta file to return a dictionary with protein ids as keys and sequences as values. @param fasta_filename: fasta filename @param protein_ids: optional, list of protein ids to read, ids not in this list will not be considered @return: a dictionary (keys=proteins ids, values=sequence) """ fasta_dict = {} with open(fasta_filename, 'r') as fasta_file: if not protein_ids: for line in fasta_file: if line.startswith('>'): protein_id = line.split()[0].split('>')[-1] if protein_id not in fasta_dict: fasta_dict[protein_id] = '' else: sequence = line.strip().replace('*', '') fasta_dict[protein_id] += sequence else: for line in fasta_file: if line.startswith('>'): protein_id = line.split()[0].split('>')[-1]