示例#1
0
def get_metagenome_cluster_prokka(prokka_dir=None, dest_dir=None, suffix='ffn'):
    """Copy the cluster prediction files and normalise their header lines.

    Every file in the source directory whose name ends with ``suffix`` is
    copied into ``dest_dir``. In each copy the file stem is prepended to the
    '>' header lines and all spaces are replaced with underscores '_'.

    Note: *_dir args should be 'APath' instances.

    :param prokka_dir: source directory; defaults to CONFIG [paths] prokka_dir.
    :param dest_dir: destination directory; defaults to CONFIG [paths] working_dir.
    :param suffix: file name ending used to select the files to copy.
    :return: list with the value returned by ``shutil.copyfile`` for each file.
    :raises FileNotFoundError: when no file in the source directory matches.
    """
    #TODO: ensure all files named after their cluster (w/o _,-..?) !!
    srce_dir = prokka_dir or APath(CONFIG.get('paths').get('prokka_dir'))
    dest_dir = dest_dir or APath(CONFIG.get('paths').get('working_dir'))
    log.info('Copying and processing Prokka ffn files '
             f'from {srce_dir} into {dest_dir}')

    pattern = '*' + suffix
    # next() without a default raises a bare StopIteration on an empty glob,
    # which hides the intended message; check explicitly instead.
    if next(iter(srce_dir.glob(pattern)), None) is None:
        raise FileNotFoundError(
            f'No matching files in the dir "{srce_dir.abspath}"')

    dest_files = []
    for ffn in srce_dir.glob(pattern):
        log.info(f'Copying {ffn.name}')
        try:
            dst_fn = dest_dir / ffn.name
            dest_files.append(shutil.copyfile(ffn, dst_fn))
            log.info(f'Prepending "{ffn.stem}" into sequence headers')
            sed_inplace(dst_fn, r'^>', f'>{ffn.stem}_')
            replace_spaces(dst_fn, '_')
        except IOError as e:
            log.error(f'IOError, copying "{e.filename}" to "{e.filename2}": {e}')
            raise
        except Exception as e:
            log.error(f'Error: {e}')
            raise
    return dest_files
示例#2
0
def make_blacklists(filepath, suffix='fasta'):
    """Make a blacklist fasta file for each matching file in a directory.

    :param filepath: directory holding the genome bin fasta files.
    :param suffix: file name ending used to select the fasta files; it is
        also forwarded to ``make_blacklist`` so both use the same selection.
    :return: list of the values returned by ``make_blacklist`` per file.
    """
    log.info('in function make_blacklists')
    try:
        fpath = APath(filepath)
        # Snapshot the directory listing before any blacklist is written:
        # make_blacklist() creates 'blacklist.<name>' files, and if those land
        # in this same directory a lazy glob could pick them up as bins.
        bins = [fa for fa in fpath.glob('*' + suffix)
                if not fa.name.startswith('blacklist.')]
        return [make_blacklist(fpath, fa.name, suffix=suffix) for fa in bins]
    except Exception as e:
        log.error(f'Error: {e}')
        raise
示例#3
0
def generate_musicc_regex(musiccs=None, begin_regex=None):
    """Generate a compiled regex for matching MUSiCC patterns.

    The pattern is ``<begin_regex>(<p1>|<p2>|...)``. The entries are joined
    verbatim (not escaped), so they may themselves be regular expressions.

    :param musiccs: list of patterns; defaults to CONFIG [filters] musicc_list.
    :param begin_regex: character class or other `re` placed at the beginning
        of the pattern; defaults to CONFIG [filters] begin_regex.
    :return: compiled ``re`` pattern object.
    :raises Exception: any error is logged and re-raised unchanged.
    """
    try:
        musiccs = musiccs or CONFIG.get('filters').get('musicc_list')
        # Fall back to an empty prefix: formatting None into the pattern
        # would produce the literal text 'None(' at its start.
        bgn = begin_regex or CONFIG.get('filters').get('begin_regex') or ''
        log.debug(f'MUSiCC check list: "{musiccs}"')
        mpatt = f'{bgn}(' + '|'.join(musiccs) + ')'
        log.debug(f'MUSiCC check pattern: "{mpatt}"')
        return re.compile(mpatt)
    except Exception as e:
        log.error(f'Generating MUSiCC regex match: {e}')
        raise
示例#4
0
def filter_probe_seqs(dbname, cluster_id, table_name=None):
    """Create db view onto blast results table, limiting on (config values):
        - dupes (only qseqid values that occur exactly once are kept)
        - pct_identity
        - within GC min>max
        - =probe length
        - hit on this cluster (qseqid starts with cluster_id)
        - not match tRNA names

    :param dbname: database name; defaults to DB_CFG 'name'.
    :param cluster_id: cluster identifier used as the qseqid prefix.
    :param table_name: source table; defaults to DB_CFG probes_table name.
    :return: result of ``Sdb.exec_ddl`` for the CREATE VIEW statement.
    :raises Exception: any error is logged and re-raised unchanged.
    """
    # Resolved before the try so the error handler can always name the db.
    db = dbname or DB_CFG.get('name')
    try:
        log.info(f'Filtering headers in db view for {db}')

        table_name = table_name or DB_CFG.get('probes_table').get('name')
        filter_view = DB_CFG.get('probes_view').get('name')
        field_sql = ', '.join(DB_CFG.get('probes_view').get('cols'))

        gc_min = CONFIG.get('gc_percent').get('min_percent')
        gc_max = CONFIG.get('gc_percent').get('max_percent')
        probe_length = CONFIG.get('catch').get('probe_length')
        pct_identity = CONFIG.get('filters').get('pct_identity')
        trna_list = CONFIG.get('filters').get('trna_list') or []

        # NOTE(review): values are interpolated straight into the SQL text;
        # they come from configuration and the cluster name - confirm these
        # are trusted before exposing this to external input.
        wheres = [f'gc_pct between "{gc_min}" and "{gc_max}"',
                  f'pident={pct_identity}',
                  f'length={probe_length}',
                  f'qseqid like "{cluster_id}%"',
                  ]
        # Each tRNA exclusion is added once; an empty list adds nothing
        # (a parenthesised empty group would be invalid SQL).
        wheres += [f'sseqid NOT LIKE "%{t}%"' for t in trna_list]
        where_def = ' AND '.join(wheres)
        group_def = 'qseqid HAVING count(qseqid)=1'

        ddl_drop = f'DROP VIEW IF EXISTS {filter_view};'
        ddl_view = (f'CREATE VIEW {filter_view} AS'
                    f' SELECT {field_sql} FROM {table_name}'
                    f' WHERE {where_def} GROUP BY {group_def};')
        # log.debug(f'filtering view query: "{ddl_view}"')
        # Run the DROP as its own statement so a view left over from an
        # earlier run does not make the CREATE fail.
        Sdb.exec_ddl(db, ddl_drop)
        return Sdb.exec_ddl(db, ddl_view)
    except Exception as e:
        log.error(f'Writing to db "{db}": {e}')
        raise
示例#5
0
def blast_clust_probes_on_genome(probe_file, blastdb):
    """Run 'blastn' of cluster's probe fasta on genome blastdb.

    Note: probe_file be 'APath' instance, blastdb param is string of filename
    or filepath.

    :param probe_file: fasta file of probes used as the blast query.
    :param blastdb: blast database to search.
    :return: list of rows, each row being the list of comma-separated fields
        of one output line; if ``probe_file`` is not a file, the warning
        message string is returned instead.
    :raises Exception: any error is logged and re-raised unchanged.
    """
    log.info(f'Blasting cluster\'s probes ({probe_file}) on genome db {blastdb}')
    try:
        blast_cfg = CONFIG.get('blastn')
        blastn = CONFIG.get('APPS').get('blastn')
        # Coerce to str: configuration values may be numbers, while the
        # command line must be built from strings (same approach as the
        # catch options in catch_design_probes).
        dust = str(blast_cfg.get('dust', 'no'))
        evalue = str(blast_cfg.get('evalue', '10'))
        numaln = str(blast_cfg.get('num_alignments', '250'))
        numcpu = str(blast_cfg.get('num_threads', '1'))
        outfmt = str(blast_cfg.get('outfmt', '10'))

        # Required db fields first, then any extra configured fields.
        fields = DB_CFG.get('blastn').get('fields').copy()
        extras = blast_cfg.get('fields') or []
        fields += [f for f in extras if f not in fields]
        field_fmt = ' '.join(fields)

        if not probe_file.is_file():
            err_msg = f'Path: "{probe_file.abspath}" is not a file?!'
            log.warning(err_msg)
            return err_msg

        cmd = [blastn,
               '-task', 'blastn',
               '-query', probe_file.abspath,
               '-db', blastdb,
               '-dust', dust,
               '-evalue', evalue,
               '-num_alignments', numaln,
               '-num_threads', numcpu,
               '-outfmt', f'{outfmt} {field_fmt}',
               ]
        output = run_cmd(cmd, only_stdout=True)
        log.notice('blast output: ' + output[0:100])

        # blast_rows is rows of all output: here conv'd to list of list-per-line
        # (splitting on ',' presumes a comma-separated outfmt - TODO confirm)
        blast_rows = [row.split(',') for row in output.splitlines()]
        # log.notice(f'show blast_rows[0]: {blast_rows[0]}')
        log.info(f'Number of blast matches: {len(blast_rows)}')

    except Exception as e:
        log.error(f'Error: {e}')
        raise
    else:
        return blast_rows
示例#6
0
def makeblastdb(fastaname, blast_db=None):
    """Build a nucleotide blast database from a fasta file.

    Requires: [makeblastdb]

    :param fastaname: fasta file used as input; the log file is written
        next to it as '<fastaname>.makeblastdb.log'.
    :param blast_db: output database name; the fasta name is used when empty.
    :return: whatever ``run_cmd`` returns for the command.
    """
    log.info(f'Making blastdb for {fastaname}')
    try:
        app = CONFIG.get('APPS').get('blastdb')
        target = blast_db if blast_db else fastaname
        logfile = fastaname + '.makeblastdb.log'
        return run_cmd([
            app,
            '-dbtype', 'nucl',
            '-in', fastaname,
            '-out', target,
            '-logfile', logfile,
        ])
    except Exception as err:
        log.error(f'Error: {err}')
        raise err
示例#7
0
def catch_design_probes(gbin, dest_dir=None, reuse_existing=False):
    """Run the catch app on one genome bin and tag the resulting probes.

    The probes are written to '<stem>.probes.<ext>' in ``dest_dir`` and the
    bin's stem is then prepended to every '>' header line of that file.

    Requires: [catch]
    Note: file, dir args should be 'APath' instances

    :param gbin: genome bin fasta file.
    :param dest_dir: output directory; defaults to CONFIG [paths] working_dir.
    :param reuse_existing: when true and the probes file already exists,
        return it without running catch again.
    :return: path of the probes file.
    """
    log.info(f'Designing probes for {gbin.name}')

    if not dest_dir:
        dest_dir = APath(CONFIG.get('paths').get('working_dir'))
    try:
        app = CONFIG.get('APPS').get('catch')

        # output names carry '.probes' / '.probe_coverage_analysis' markers
        probes_path = dest_dir / f'{gbin.stem}.probes.{gbin.suffix[1:]}'
        analysis_tsv = dest_dir / f'{gbin.stem}.probe_coverage_analysis.tsv'

        if reuse_existing and probes_path.exists():
            log.info(f'Using pre-existing cluster probes file "{probes_path}"')
            return probes_path

        catch_cfg = CONFIG.get('catch')
        length_opt = str(catch_cfg.get('probe_length'))
        stride_opt = str(CONFIG.get('catch').get('probe_stride'))
        run_cmd([
            app,
            '--write-analysis-to-tsv', analysis_tsv.abspath,
            '--probe-length', length_opt,
            '--probe-stride', stride_opt,
            '--output-probes', probes_path.abspath,
            gbin.abspath,
        ])

        log.info(f'Prepending clusterID to seq headers in {probes_path}')
        sed_inplace(probes_path, r'^>', f'>{gbin.stem}_')
        return probes_path
    except Exception as err:
        log.error(f'Error: {err}')
        raise err
示例#8
0
def make_blacklist(fasta_path, gbin_name, suffix='fasta'):
    """Make blacklist fasta file of all 'unwanted' seqs,
    i.e. all but the single genome bin fasta.

    The matching fasta files are concatenated into 'blacklist.<gbin_name>',
    which is (re)created in the current working directory.

    :param fasta_path: directory holding the fasta files.
    :param gbin_name: name of the genome bin to leave out; any file whose
        name contains this text is skipped.
    :param suffix: file name ending used to select the fasta files.
    :return: file name of the blacklist that was written.
    :raises Exception: any error is logged and re-raised unchanged.
    """
    log.info(f'Making blacklist for {gbin_name}')
    try:
        fpath = APath(fasta_path)
        # Skip blacklists generated by earlier calls: when the working
        # directory is also the fasta directory they match the glob and
        # would otherwise be concatenated into this bin's blacklist.
        blacks = [f for f in fpath.glob('*' + suffix)
                  if gbin_name not in f.name
                  and not f.name.startswith('blacklist.')]
        blacklist = 'blacklist.' + gbin_name
        # Mode 'w' truncates any previous blacklist of the same name.
        with open(blacklist, mode='w') as blck:
            for b in blacks:
                with open(b) as bff:
                    # stream line by line rather than loading whole files
                    blck.writelines(bff)
        return blacklist
    except Exception as e:
        log.error(f'Error: {e}')
        raise
示例#9
0
def main_pipe(*, config_file:'c'=None, debug=False):
    """Execute the steps of the targeted probe design pipeline

    Steps: read/merge configuration, validate options, obtain a blast
    database (pre-existing or built from the prediction files), design
    probes for every genome bin, then tidy up intermediate files.

    :param config_file: non-default TOML configuration file to set modified options.
    :param debug: show internal debugging messages and configuration.
    """
    try:
        log.name = 'Targeted_Pipeline'
        log.info('Beginning execution of the targeted design probe pipeline.')

        if debug:
            log.level_name = 'DEBUG'
            for lh in log.handlers:
                lh.level_name = 'DEBUG'

        if config_file:
            log.name = 'Targeted:Read Config Options'
            user_cfg = read_config_file(config_file)
            # Only sections already known to CONFIG are merged in.
            for k in CONFIG:
                if k in user_cfg:
                    CONFIG[k].update(user_cfg[k])
        else:
            log.notice('Using default configuration. (Install module "clize" for command args.)')

        log.name = 'Targeted:Check Config Options'
        check_options()

        log.name = 'Targeted_Pipeline'
        working_dir = APath(CONFIG.get('paths').get('working_dir'))
        gbin_dir = APath(CONFIG.get('paths').get('genome_bins'))
        gbin_suff = CONFIG.get('general').get('genome_bins_suffix')

        # Make blast dbs for all ffn, if no preexisting designated use_blastdb
        log.name = 'Targeted:blastdb'
        use_blastdb = CONFIG.get('paths').get('use_blastdb', None)

        if use_blastdb:
            try:
                use_blastdb_path = APath(use_blastdb)
                blastdb_name = use_blastdb_path.name
                # resolve(strict=True) raises if the path does not exist.
                # It is called directly: path objects are not context
                # managers (pathlib removed that support in Python 3.13;
                # assumes APath is pathlib-based - TODO confirm).
                use_blastdb_path.resolve(strict=True)
                log.info(f'Using pre-existing blastdb: {use_blastdb_path.abspath}')
                blast_all_clusters = use_blastdb_path.abspath
                prokka_files = [] # for final cleanup
            except Exception:
                log.error(f'Unable to use pre-existing blastdb: {use_blastdb}')
                raise
        else:
            blastdb_name = DB_CFG.get('blastdb').get('name')
            blastdb_path = working_dir / blastdb_name
            try:
                # Copy cluster prediction files and make blast dbs for each
                # log.name = 'Targeted:GetMwgsProkka'
                prokka_dir = APath(CONFIG.get('paths').get('prokka_dir'))
                prokka_suff = CONFIG.get('general').get('prokka_prediction_suffix')
                prokka_files = get_metagenome_cluster_prokka(prokka_dir, working_dir, suffix=prokka_suff)

                log.info(f'Creating blastdb: {blastdb_path.abspath}')
                # concat all clusters' prokka_files into one for blasting
                blast_all_clusters = concatenate_files(
                    working_dir.abspath,
                    blastdb_path.abspath,
                    suffix=prokka_suff,
                    clobber=True
                )
                makeblastdb(blast_all_clusters)
            except Exception:
                log.error(f'Unable to create blastdb: {blastdb_name}')
                raise

        # Design probes for genome bin fastas
        #TODO: run in parallel, use multiprocessing.Pool ??
        probe_fastas = []
        for gbin in gbin_dir.glob('*'+gbin_suff):
            # reset each round; the called step may rename the logger
            # (presumably - verify against targeted_genome_bin_probes)
            log.name = 'Targeted Pipeline'
            probe_file = targeted_genome_bin_probes(gbin, blastdb=blast_all_clusters)
            probe_fastas.append(probe_file)
    except Exception as e:
        log.error(f'Error. {e.args}')
        raise

    else:
        log.name = 'Targeted Pipeline'
        log.notice(f'''Completed this run of targeted probe pipeline!
                   \nConfig options used: {tomlkit.dumps(CONFIG)}''')
        if debug:
            log.notice(f'''\nDatabase Config options used: {tomlkit.dumps(DB_CFG)}''')

        log.info('Finalize by tidying up intermediate files.')
        finalize_outfiles(working_dir,
                          blastdb=blastdb_name,
                          annots=prokka_files,
                          probes=probe_fastas)
示例#10
0
def finalize_outfiles(working_dir='', blastdb=None, annots=None, probes=None):
    """Check CONFIG settings, delete or compress the intermediate files, then compress logs.

    File types listed in CONFIG [general] keep_files are kept (and compressed
    according to CONFIG [general] compress_files); every other type found in
    ``config.TMP_FILE_GLOBS`` is tidied with ``keep=False``.

    :param working_dir: string of path to work in (current directory if empty)
    :param blastdb: name of blastdb created
    :param annots: list of modified annotation/prediction files
    :param probes: list of initial probe fastas created by 'catch'
    """
    log.name = 'Finalizing'
    if not working_dir:
        working_dir = os.getcwd()
    # None defaults avoid sharing one mutable list between calls.
    annots = [] if annots is None else annots
    probes = [] if probes is None else probes

    keepers = CONFIG.get('general').get('keep_files').copy()
    compress = CONFIG.get('general').get('compress_files')
    file_globs = config.TMP_FILE_GLOBS.copy()

    # special cases: files tracked by explicit list instead of a glob
    argfiles = dict(
        blast_db=blastdb,
        annotation_mods=[p.abspath for p in annots],
        catch_probes=[p.abspath for p in probes],
    )

    if CONFIG.get('paths').get('use_blastdb', None):
        # A pre-existing blastdb was used: leave it and the annotation
        # files alone. A missing key is simply ignored.
        log.info('Keeping blastdb')
        for ftype in ['blast_db', 'annotation_mods']:
            file_globs.pop(ftype, None)
            argfiles.pop(ftype, None)

    for ftype, flist in argfiles.items():
        if ftype not in keepers:
            continue
        try:
            log.info(f'Tidying up {ftype}')
            file_globs.pop(ftype, None)
            tidy_up_files(flist, working_dir, True, compress)
        except Exception as e:
            # best effort: report the problem and carry on with the rest
            log.warning(f'Could not tidy up {ftype}: {e}')

    if 'target_dbs' in keepers:
        db_name = file_globs.get('target_dbs')
        if db_name:
            dbs_glob = '_'.join(['*', db_name])
            try:
                log.info('Vacuuming databases')
                for db in APath(working_dir).glob(dbs_glob):
                    Sdb.exec_ddl(db.abspath, 'VACUUM;')
                    log.debug(f'Vacuumed database: {db}')
            except Exception as e:
                # best effort: vacuuming is only an optimisation
                log.warning(f'Could not vacuum databases: {e}')

    for k, glb in file_globs.items():
        log.info(f'Tidying up {k}')
        # Types without a glob fall back to the explicit file list.
        targets = ''.join(['*', glb]) if glb else argfiles.get(k)
        if targets is None:
            log.warning(f'Nothing known to tidy up for {k}')
            continue
        if k in keepers:
            tidy_up_files(targets, working_dir, True, compress)
        else:
            tidy_up_files(targets, working_dir, keep=False)

    # ...and finally compress the logfile
    gzip_compress(log.filename)
示例#11
0
def check_options():
    """Check validity of CONFIG settings, try setup if needed.

    Path checks (CONFIG [paths]); each checked entry is replaced in CONFIG
    by its absolute path:
      - working_dir: created when it does not exist yet
      - genome_bins: must be an existing directory
      - use_blastdb: when set, must be an existing file
      - prokka_dir:  only checked when use_blastdb is not set; must be an
        existing directory

    Any failure in the path checks is logged and the program exits with
    status 1 (``sys.exit``). Afterwards every entry of CONFIG [APPS] is
    looked up on PATH; a missing application is only logged as a warning.
    """
    # Check [paths] options, either it exists or create it:
    try:
        log.info('Checking files and directories.')
        path_opts = CONFIG.get('paths')

        path = 'working_dir'
        log.info(f'Checking "{path}"')
        ppath = APath(path_opts.get(path))
        if ppath.is_dir():
            log.info(f'Path: "{ppath.name}" directory found.')
        else:
            log.warning(f'Path for "{ppath.name}" directory not found!')
            try:
                ppath.mkdir(parents=True, exist_ok=True)
                log.notice(f'Path: "{ppath.abspath}" directory created.')
            except FileExistsError:
                log.error('File/dir exists.')
                raise
        path_opts[path] = ppath.abspath

        # Explicit checks instead of assert: asserts are removed when
        # Python runs with -O, which would silently skip the validation.
        path = 'genome_bins'
        log.info(f'Checking "{path}"')
        ppath = APath(path_opts.get(path), '')
        if not ppath.is_dir():
            raise NotADirectoryError(f'Path "{ppath}" is not found!')
        log.info(f'Path: "{ppath.abspath}" directory found.')
        path_opts[path] = ppath.abspath

        path = 'use_blastdb'
        if path_opts.get(path):
            log.info(f'Checking "{path}"')
            ppath = APath(path_opts.get(path), '')
            if not ppath.is_file():
                raise FileNotFoundError(f'Path "{ppath}" is not a file!')
            log.info(f'Path: "{ppath.abspath}" file found.')
            path_opts[path] = ppath.abspath
        else:
            path = 'prokka_dir'
            log.info(f'Checking "{path}"')
            ppath = APath(path_opts.get(path), '')
            if not ppath.is_dir():
                raise NotADirectoryError(f'Path "{ppath}" is not found!')
            log.info(f'Path: "{ppath.abspath}" directory found.')
            path_opts[path] = ppath.abspath

    except Exception as e:
        log.error(e)
        sys.exit(1)

    # APP executable checks:
    apps = CONFIG.get('APPS')
    log.info('Checking applications usable.')
    log.debug(f'PATH=\"{os.environ.get("PATH")}\"')
    for opt, app in apps.items():
        log.notice(f'App for: "{opt}"')
        if shutil.which(app) is not None:
            log.info(f'App: "{app}" found.')
        else:
            log.warning(f'App: "{app}" is not found?!')