예제 #1
0
def make_blacklists(filepath, suffix='fasta'):
    """Make a blacklist fasta file for each matching fasta file in a directory.

    :param filepath: directory holding the fasta files.
    :param suffix: file name ending used to select the fasta files.
    :return: list holding the ``make_blacklist`` result for each matching file.
    :raises Exception: any error raised while globbing or building a
        blacklist is logged and re-raised.
    """
    log.info('in function make_blacklists')
    try:
        fpath = APath(filepath)
        # Forward ``suffix`` so every blacklist is assembled from the same set
        # of files that is iterated here (make_blacklist defaults to 'fasta').
        return [make_blacklist(fpath, fa.name, suffix=suffix)
                for fa in fpath.glob('*' + suffix)]
    except Exception as e:
        log.error(f'Error: {e}')
        raise
예제 #2
0
def filter_probe_seqs(dbname, cluster_id, table_name=None):
    """Create db view onto blast results table, limiting on (config values):

        - dupes (only query ids with exactly one remaining row are kept)
        - pct_identity equal to the configured value
        - GC% between the configured min and max
        - alignment length equal to the configured probe length
        - hit on this cluster (query id starts with ``cluster_id``)
        - subject id not matching any of the configured tRNA names

    Any existing view of the same name is dropped first so the function can be
    re-run on the same database.

    :param dbname: database to create the view in; falls back to the name
        configured in DB_CFG when empty.
    :param cluster_id: prefix the query ids (``qseqid``) must start with.
    :param table_name: table to select from; defaults to the configured
        probes table name.
    :return: whatever ``Sdb.exec_ddl`` returns for the CREATE VIEW statement.
    :raises Exception: any failure is logged and re-raised.
    """
    # Resolved before the try block so the error handler can always name it.
    db = dbname or DB_CFG.get('name')
    try:
        log.info(f'Filtering headers in db view for {dbname}')

        table_name = table_name or DB_CFG.get('probes_table').get('name')
        view_cfg = DB_CFG.get('probes_view')
        filter_view = view_cfg.get('name')
        field_sql = ', '.join(view_cfg.get('cols'))

        gc_min = CONFIG.get('gc_percent').get('min_percent')
        gc_max = CONFIG.get('gc_percent').get('max_percent')
        probe_length = CONFIG.get('catch').get('probe_length')
        pct_identity = CONFIG.get('filters').get('pct_identity')
        # An absent/empty tRNA list simply contributes no conditions.
        trna_list = CONFIG.get('filters').get('trna_list') or []

        wheres = [f'gc_pct between "{gc_min}" and "{gc_max}"',
                  f'pident={pct_identity}',
                  f'length={probe_length}',
                  f'qseqid like "{cluster_id}%"',
                  ]
        wheres.extend(f'sseqid NOT LIKE "%{t}%"' for t in trna_list)
        where_def = ' AND '.join(wheres)
        group_def = 'qseqid HAVING count(qseqid)=1'

        drop_view = f'DROP VIEW IF EXISTS {filter_view};'
        create_view = (f'CREATE VIEW {filter_view} AS'
                       f' SELECT {field_sql} FROM {table_name}'
                       f' WHERE {where_def} GROUP BY {group_def};')
        # Run the two statements separately: the drop must actually execute
        # before the view can be (re)created.
        Sdb.exec_ddl(db, drop_view)
        return Sdb.exec_ddl(db, create_view)
    except Exception as e:
        log.error(f'Writing to db "{db}": {e}')
        raise
예제 #3
0
def blast_clust_probes_on_genome(probe_file, blastdb):
    """Run 'blastn' of cluster's probe fasta on genome blastdb.

    Note: probe_file must be an 'APath' instance, blastdb param is string of
    filename or filepath.

    :param probe_file: fasta file of the cluster's probes, used as the query.
    :param blastdb: blast database to search.
    :return: list of blast matches, each a list of the comma-separated values
        of one output line. When ``probe_file`` is not a file, the warning
        message string is returned instead of a list.
    :raises Exception: any failure while building or running the command is
        logged and re-raised.
    """
    log.info(f'Blasting cluster\'s probes ({probe_file}) on genome db {blastdb}')
    try:
        blastn = CONFIG.get('APPS').get('blastn')
        blastn_opts = CONFIG.get('blastn')
        # Values read from the config need not be strings, whereas command
        # arguments must be; convert them like catch_design_probes does.
        dust = str(blastn_opts.get('dust', 'no'))
        evalue = str(blastn_opts.get('evalue', '10'))
        numaln = str(blastn_opts.get('num_alignments', '250'))
        numcpu = str(blastn_opts.get('num_threads', '1'))
        outfmt = blastn_opts.get('outfmt', '10')

        # Output columns: the database's default fields followed by any extra
        # configured fields that are not already present.
        fields = DB_CFG.get('blastn').get('fields').copy()
        extras = blastn_opts.get('fields') or []
        fields += [f for f in extras if f not in fields]
        field_fmt = ' '.join(fields)

        if not probe_file.is_file():
            err_msg = f'Path: "{probe_file.abspath}" is not a file?!'
            log.warning(err_msg)
            # NOTE(review): callers expecting a list must cope with this str.
            return err_msg

        cmd = [blastn,
               '-task', 'blastn',
               '-query', probe_file.abspath,
               '-db', blastdb,
               '-dust', dust,
               '-evalue', evalue,
               '-num_alignments', numaln,
               '-num_threads', numcpu,
               '-outfmt', f'{outfmt} {field_fmt}',
               ]
        output = run_cmd(cmd, only_stdout=True)
        log.notice('blast output: '+output[0:100])

        # blast_rows is rows of all output: converted to a list of
        # list-per-line by splitting every line on commas.
        blast_rows = [row.split(',') for row in output.splitlines()]
        log.info(f'Number of blast matches: {len(blast_rows)}')

    except Exception as e:
        log.error(f'Error: {e}')
        raise
    else:
        return blast_rows
예제 #4
0
def export_final_sets(dbname, cluster_id, final_probe_amount=1, randomly=True):
    """Export final sets of (possibly random) probe sequences into fasta format;
    one file for 'musicc', one for non.

    :param dbname: database holding the filter view to read probes from.
    :param cluster_id: cluster name, used as first part of the file names.
    :param final_probe_amount: number of probes wanted per set; when it is 0
        the configured 'final_probe_amount' is used.
    :param randomly: pick the probes at random; when false the configured
        'final_probe_random' decides.
    """
    log.info(f'Exporting probes for {cluster_id}')

    working_dir = APath(CONFIG.get('paths').get('working_dir'))
    final_amount = int(final_probe_amount) or int(CONFIG.get('general').get('final_probe_amount'))
    random_picks = randomly or CONFIG.get('general').get('final_probe_random')
    filter_view = DB_CFG.get('probes_view').get('name')

    # final_fields taken from config/database/probes_view_cols last words (post-space)
    final_fields = [col.split(' ')[-1] for col in DB_CFG.get('probes_view').get('cols')]

    for which, where in (('normal', '0'), ('musicc', '1')):
        export_bits = '.'.join([cluster_id, 'probes', 'final', which, 'fasta'])
        export_file = working_dir / export_bits
        whim = 'is_musicc=' + where

        count_rows = Sdb.iter_select(dbname, filter_view, where=whim, fields='count(*) as recs')
        record_count = next(count_rows).pop('recs')
        log.debug(f' ... record_count: {record_count}')

        if record_count == 0:
            log.notice(f'No filtered "{which}" probes for cluster "{cluster_id}".')
            write_out_file('', export_file, mode='a') # write empty file
            continue

        # Clamp per set: a small set must not reduce the amount picked for
        # the other one, so the requested amount itself is left untouched.
        pick_count = min(final_amount, record_count)
        if random_picks:
            row_nums = random.sample(range(record_count), k=pick_count)
        else:
            row_nums = list(range(pick_count))
        log.debug(f' ... row_nums: {row_nums}')
        wanted_rows = set(row_nums)

        probes_selector = Sdb.iter_select(dbname, filter_view, where=whim, fields=final_fields)

        log.info(f'Exporting to file {export_file}')
        for num, row in enumerate(probes_selector):
            if num not in wanted_rows:
                continue
            seq = row.pop('probe_seq') # NB: presumption of column name 'probe_seq' in filter view!!
            # fasta header is made of all remaining column values
            head = '>' + '|'.join([str(v) for v in row.values()])
            probe_fasta = os.linesep.join([head, seq, '']) # final '' elem appends EOL
            log.debug(f' ... writing to file {export_file}: "{probe_fasta}"')
            write_out_file(probe_fasta, export_file, mode='a')
예제 #5
0
def makeblastdb(fastaname, blast_db=None):
    """Make a nucleotide blast db from a fasta file.

    Requires: [makeblastdb]

    :param fastaname: fasta file to build the db from (string or path object).
    :param blast_db: name for the resulting db; defaults to ``fastaname``.
    :return: whatever ``run_cmd`` returns for the command.
    :raises Exception: any failure is logged and re-raised.
    """
    log.info(f'Making blastdb for {fastaname}')
    try:
        dest_db = blast_db or fastaname
        mkblastdb = CONFIG.get('APPS').get('blastdb')
        cmd = [mkblastdb,
               '-dbtype', 'nucl',
               '-in', fastaname,
               '-out', dest_db,
               # Formatted rather than concatenated with '+', so the log name
               # can also be derived when fastaname is not a plain string.
               '-logfile', f'{fastaname}.makeblastdb.log',
               ]
        output = run_cmd(cmd)
    except Exception as e:
        log.error(f'Error: {e}')
        raise
    else:
        return output
예제 #6
0
def get_metagenome_cluster_prokka(prokka_dir=None, dest_dir=None, suffix='ffn'):
    """Copy all cluster prediction files (by default 'ffn') from a directory.

    After copying, the file name (without suffix) is prepended on the front of
    each header line of the copy, and spaces in the copy are replaced with
    underscores '_'.

    Note: *_dir args should be 'APath' instances

    :param prokka_dir: directory to copy from; defaults to the configured
        'prokka_dir'.
    :param dest_dir: directory to copy into; defaults to the configured
        'working_dir'.
    :param suffix: file name ending selecting the files to copy.
    :return: list of the copied files' destinations.
    :raises AssertionError: when no file in the source directory matches.
    :raises Exception: copy/processing errors are logged and re-raised.
    """
    #TODO: ensure all files named after their cluster (w/o _,-..?) !!
    srce_dir = prokka_dir or APath(CONFIG.get('paths').get('prokka_dir'))
    dest_dir = dest_dir or APath(CONFIG.get('paths').get('working_dir'))
    log.info(f'Copying and processing Prokka ffn files '
             f'from {srce_dir} into {dest_dir}')

    # Glob once; an explicit check replaces `assert next(...)`, which raised
    # StopIteration (without the message) on an empty directory.
    srce_files = list(srce_dir.glob('*' + suffix))
    if not srce_files:
        raise AssertionError(f'No matching files in the dir "{srce_dir.abspath}"')

    dest_files = []
    for ffn in srce_files:
        log.info(f'Copying {ffn.name}')
        try:
            dst_fn = dest_dir / ffn.name
            dest_files.append(shutil.copyfile(ffn, dst_fn))
            log.info(f'Prepending "{ffn.stem}" into sequence headers')
            sed_inplace(dst_fn, r'^>', f'>{ffn.stem}_')
            replace_spaces(dst_fn, '_')
        except IOError as e:
            log.error(f'IOError, copying "{e.filename}" to "{e.filename2}": {e}')
            raise
        except Exception as e:
            log.error(f'Error: {e}')
            raise
    return dest_files
예제 #7
0
def make_blacklist(fasta_path, gbin_name, suffix='fasta'):
    """Build the blacklist fasta for one genome bin.

    The blacklist holds all 'unwanted' sequences: the content of every file
    in ``fasta_path`` ending in ``suffix`` whose name does not contain
    ``gbin_name``. It is written to 'blacklist.<gbin_name>' and that file
    name is returned. Errors are logged and re-raised.
    """
    log.info(f'Making blacklist for {gbin_name}')
    try:
        unwanted = []
        for candidate in APath(fasta_path).glob('*'+suffix):
            if gbin_name in candidate.name:
                continue
            unwanted.append(candidate)

        out_name = ''.join(('blacklist.', gbin_name))

        # Start afresh: remove a blacklist left over from an earlier run.
        try:
            os.remove(out_name)
        except FileNotFoundError:
            pass

        with open(out_name, mode='a') as sink:
            for fasta in unwanted:
                with open(fasta) as source:
                    content = source.read()
                sink.write(content)
    except Exception as err:
        log.error(f'Error: {err}')
        raise err
    else:
        return out_name
예제 #8
0
def catch_design_probes(gbin, dest_dir=None, reuse_existing=False):
    """Run the 'catch' app to design probes for one cluster genome bin.

    The probes are written to '<stem>.probes.<ext>' in ``dest_dir`` (default:
    the configured working dir), the coverage analysis next to it; afterwards
    the bin's stem is prepended to each header of the probes file. With
    ``reuse_existing`` an already existing probes file is returned untouched.

    Requires: [catch]
    Note: file, dir args should be 'APath' instances
    """
    log.info(f'Designing probes for {gbin.name}')

    if not dest_dir:
        dest_dir = APath(CONFIG.get('paths').get('working_dir'))
    try:
        catch_app = CONFIG.get('APPS').get('catch')

        # output names carry '.probes' between the bin's stem and extension
        extension = gbin.suffix[1:]
        probe_out = dest_dir / f'{gbin.stem}.probes.{extension}'
        catch_tsv = dest_dir / f'{gbin.stem}.probe_coverage_analysis.tsv'

        if reuse_existing and probe_out.exists():
            log.info(f'Using pre-existing cluster probes file "{probe_out}"')
            return probe_out

        catch_opts = CONFIG.get('catch')
        length_arg = str(catch_opts.get('probe_length'))
        stride_arg = str(catch_opts.get('probe_stride'))
        run_cmd([
            catch_app,
            '--write-analysis-to-tsv', catch_tsv.abspath,
            '--probe-length', length_arg,
            '--probe-stride', stride_arg,
            '--output-probes', probe_out.abspath,
            gbin.abspath,
        ])

        log.info(f'Prepending clusterID to seq headers in {probe_out}')
        sed_inplace(probe_out, r'^>', f'>{gbin.stem}_')
        return probe_out
    except Exception as err:
        log.error(f'Error: {err}')
        raise err
예제 #9
0
def main_pipe(*, config_file:'c'=None, debug=False):
    """Execute the steps of the targeted probe design pipeline

    :param config_file: non-default TOML configuration file to set modified options.
    :param debug: show internal debugging messages and configuration.
    """
    # Steps: merge user config, check options, obtain a blastdb (pre-existing
    # or built from the prediction files), design probes for each genome bin,
    # then tidy up the intermediate files. Errors are logged and re-raised.
    try:
        log.name = 'Targeted_Pipeline'
        log.info('Beginning execution of the targeted design probe pipeline.')

        if debug:
            log.level_name = 'DEBUG'
            for lh in log.handlers:
                lh.level_name = 'DEBUG'

        if config_file:
            log.name = 'Targeted:Read Config Options'
            user_cfg = read_config_file(config_file)
            # only sections already known to CONFIG are taken from the user file
            for k in CONFIG:
                if k in user_cfg:
                    CONFIG[k].update(user_cfg[k])
        else:
            log.notice('Using default configuration. (Install module "clize" for command args.)')

        log.name = 'Targeted:Check Config Options'
        check_options()

        log.name = 'Targeted_Pipeline'
        working_dir = APath(CONFIG.get('paths').get('working_dir'))
        gbin_dir = APath(CONFIG.get('paths').get('genome_bins'))
        gbin_suff = CONFIG.get('general').get('genome_bins_suffix')

        # Make blast dbs for all ffn, if no preexisting designated use_blastdb
        log.name = 'Targeted:blastdb'
        use_blastdb = CONFIG.get('paths').get('use_blastdb', None)

        if use_blastdb:
            try:
                use_blastdb_path = APath(use_blastdb)
                blastdb_name = use_blastdb_path.name
                # Called only for its existence check (assuming APath follows
                # pathlib, strict resolution raises for a missing path). The
                # result is not used as a context manager: pathlib removed
                # that support in Python 3.13.
                use_blastdb_path.resolve(strict=True)
                log.info(f'Using pre-existing blastdb: {use_blastdb_path.abspath}')
                blast_all_clusters = use_blastdb_path.abspath
                prokka_files = [] # for final cleanup
            except Exception as e:
                log.error(f'Unable to use pre-existing blastdb: {use_blastdb}')
                raise
        else:
            blastdb_name = DB_CFG.get('blastdb').get('name')
            blastdb_path = working_dir / blastdb_name
            try:
                # Copy cluster prediction files and make blast dbs for each
                prokka_dir = APath(CONFIG.get('paths').get('prokka_dir'))
                prokka_suff = CONFIG.get('general').get('prokka_prediction_suffix')
                prokka_files = get_metagenome_cluster_prokka(prokka_dir, working_dir, suffix=prokka_suff)

                log.info(f'Creating blastdb: {blastdb_path.abspath}')
                # concat all clusters' prokka_files into one for blasting
                blast_all_clusters = concatenate_files(
                    working_dir.abspath,
                    blastdb_path.abspath,
                    suffix=prokka_suff,
                    clobber=True
                )
                makeblastdb(blast_all_clusters)
            except Exception as e:
                log.error(f'Unable to create blastdb: {blastdb_name}')
                raise

        # Design probes for genome bin fastas
        #TODO: run in parallel, use multiprocessing.Pool ??
        probe_fastas = []
        for gbin in gbin_dir.glob('*'+gbin_suff):
            # reset the log name, which the per-bin processing changes
            log.name = 'Targeted Pipeline'
            probe_file = targeted_genome_bin_probes(gbin, blastdb=blast_all_clusters)
            probe_fastas.append(probe_file)
    except Exception as e:
        log.error(f'Error. {e.args}')
        raise

    else:
        log.name = 'Targeted Pipeline'
        log.notice(f'''Completed this run of targeted probe pipeline!
                   \nConfig options used: {tomlkit.dumps(CONFIG)}''')
        if debug:
            log.notice(f'''\nDatabase Config options used: {tomlkit.dumps(DB_CFG)}''')

        log.info('Finalize by tidying up intermediate files.')
        finalize_outfiles(working_dir,
                          blastdb=blastdb_name,
                          annots=prokka_files,
                          probes=probe_fastas)
예제 #10
0
def finalize_outfiles(working_dir='', blastdb=None, annots=[], probes=[]):
    """Check CONFIG settings, delete or compress the intermediate files, then compress logs.

    :param working_dir: string of path to work in; the current working
        directory is used when empty
    :param blastdb: name of blastdb created
    :param annots: list of modified annotation/prediction files
    :param probes: list of initial probe fastas created by 'catch'
    """
    # NOTE: the list defaults are only read, never mutated, in this function.
    log.name = 'Finalizing'
    if not working_dir:
        working_dir = os.getcwd()

    keepers = CONFIG.get('general').get('keep_files').copy()
    compress = CONFIG.get('general').get('compress_files')
    file_globs = config.TMP_FILE_GLOBS.copy()

    # special cases: files tracked via the arguments rather than by a glob
    argfiles = dict(
        blast_db = blastdb,
        annotation_mods = [p.abspath for p in annots],
        catch_probes = [p.abspath for p in probes],
    )

    if CONFIG.get('paths').get('use_blastdb', None):
        # A pre-existing blastdb was used: it (and the annotation files) must
        # not be tidied. pop() with a default tolerates an absent key.
        log.info('Keeping blastdb')
        for ftype in ('blast_db', 'annotation_mods'):
            file_globs.pop(ftype, None)
            argfiles.pop(ftype, None)

    for ftype, flist in argfiles.items():
        if ftype not in keepers:
            continue
        try:
            log.info(f'Tidying up {ftype}')
            file_globs.pop(ftype)
            tidy_up_files(flist, working_dir, True, compress)
        except Exception as e:
            # best effort: report the problem and carry on with the rest
            log.warning(f'Could not tidy up {ftype}: {e}')

    if 'target_dbs' in keepers:
        db_name = file_globs.get('target_dbs')
        dbs_glob = '_'.join(['*', db_name])
        try:
            log.info('Vacuuming databases')
            for db in APath(working_dir).glob(dbs_glob):
                Sdb.exec_ddl(db.abspath, 'VACUUM;')
                log.debug(f'Vacuumed database: {db}')
        except Exception as e:
            # best effort: a failed vacuum must not stop the tidying up
            log.warning(f'Could not vacuum databases: {e}')

    for k, glb in file_globs.items():
        log.info(f'Tidying up {k}')
        # entries without a glob fall back to the files given as arguments
        pattern = ''.join(['*', glb]) if glb else argfiles[k]
        if k in keepers:
            tidy_up_files(pattern, working_dir, True, compress)
        else:
            tidy_up_files(pattern, working_dir, keep=False)

    # ...and finally compress the logfile
    gzip_compress(log.filename)
예제 #11
0
def check_options():
    """Check validity of CONFIG settings, try setup if needed.

    Paths: 'working_dir' is created when missing; 'genome_bins' must be a
    directory; 'use_blastdb', when set, must be a file, otherwise 'prokka_dir'
    must be a directory. Each checked path is stored back into the config as
    its ``abspath``. Any failure is logged and ends the program with exit
    status 1.

    Applications: every configured app is looked up on PATH; a missing one is
    only warned about.
    """
    # Check [paths] options, either it exists or create it:
    try:
        log.info('Checking files and directories.')
        path_opts = CONFIG.get('paths')

        path = 'working_dir'
        log.info(f'Checking "{path}"')
        ppath = APath(path_opts.get(path))
        if ppath.is_dir():
            log.info(f'Path: "{ppath.name}" directory found.')
        else:
            log.warning(f'Path for "{ppath.name}" directory not found!')
            try:
                ppath.mkdir(parents=True, exist_ok=True)
                log.notice(f'Path: "{ppath.abspath}" directory created.')
            except FileExistsError:
                log.error('File/dir exists.')
                raise
        path_opts[path] = ppath.abspath

        # Explicit checks instead of `assert`, so the validation still runs
        # when Python is started with -O.
        path = 'genome_bins'
        log.info(f'Checking "{path}"')
        ppath = APath(path_opts.get(path), '')
        if not ppath.is_dir():
            raise NotADirectoryError(f'Path "{ppath}" is not found!')
        log.info(f'Path: "{ppath.abspath}" directory found.')
        path_opts[path] = ppath.abspath

        path = 'use_blastdb'
        if path_opts.get(path):
            log.info(f'Checking "{path}"')
            ppath = APath(path_opts.get(path), '')
            if not ppath.is_file():
                raise FileNotFoundError(f'Path "{ppath}" is not a file!')
            log.info(f'Path: "{ppath.abspath}" file found.')
            path_opts[path] = ppath.abspath
        else:
            # without a pre-existing blastdb the prediction files are needed
            path = 'prokka_dir'
            log.info(f'Checking "{path}"')
            ppath = APath(path_opts.get(path), '')
            if not ppath.is_dir():
                raise NotADirectoryError(f'Path "{ppath}" is not found!')
            log.info(f'Path: "{ppath.abspath}" directory found.')
            path_opts[path] = ppath.abspath

    except Exception as e:
        log.error(e)
        sys.exit(1)

    # APP executable checks:
    apps = CONFIG.get('APPS')
    log.info('Checking applications usable.')
    log.debug(f'PATH=\"{os.environ.get("PATH")}\"')
    for opt, app in apps.items():
        log.notice(f'App for: "{opt}"')
        if shutil.which(app) is not None:
            log.info(f'App: "{app}" found.')
        else:
            log.warning(f'App: "{app}" is not found?!')
예제 #12
0
def targeted_genome_bin_probes(genome_bin, blastdb=None):
    """Generate, process, filter and export probes for a cluster genome bin.

    Probes are designed with 'catch' and blasted against ``blastdb``; every
    blast match gets the GC% of its probe and a MUSiCC flag appended. The
    matches are written to a '.blasts.csv' file next to the probes file,
    imported into the cluster's database, filtered by a view, and the final
    probe sets are exported.

    :param genome_bin: genome bin fasta file ('APath' instance).
    :param blastdb: blast db to search; when not given one is made from
        ``genome_bin`` itself.
    :return: the probes file returned by ``catch_design_probes``.
    :raises FileNotFoundError: when the blast step reports (by returning a
        message string) that the probes file is not a file.
    """
    log.notice(f'Generating targeted probes for genome bin: {genome_bin.name}')
    working_dir = APath(CONFIG.get('paths').get('working_dir'))

    # Header: default blastn fields, extra non-default configured fields,
    # then the two columns computed below.
    blast_header = DB_CFG.get('blastn').get('fields').copy()
    for fld in CONFIG.get('blastn').get('fields'):
        if fld not in blast_header:
            blast_header.append(fld)
    blast_header.extend([ 'gc_pct', 'is_musicc' ])

    if not blastdb:
        # makeblastdb() returns its command output, not a db name; the db is
        # written under the name of the fasta it is given, so blast that.
        # (assumes `abspath` is the path as a string - TODO confirm)
        blastdb = genome_bin.abspath
        makeblastdb(blastdb)
    cluster_id = genome_bin.stem

    log.name = 'Probe:CatchDesign'
    reuse_existing_probes = CONFIG.get('catch').get('reuse_existing_probe_files')
    probes_file = catch_design_probes(genome_bin, reuse_existing=reuse_existing_probes)

    # probe_blasts is list of all blast matched records (as lists)
    log.name = 'Probes:Blast'
    probe_blasts = blast_clust_probes_on_genome(probes_file, blastdb)
    if isinstance(probe_blasts, str):
        # the blast step hands back its warning message instead of rows
        raise FileNotFoundError(probe_blasts)

    # Index the matches by query id, so every fasta record is paired with
    # its rows directly rather than by scanning all rows per record.
    rows_by_qid = {}
    for pb in probe_blasts:
        rows_by_qid.setdefault(pb[0], []).append(pb)
    probes_gc = {}

    # Calculate GC% for each matched seq in probes; append that and the
    # MUSiCC flag onto its probe_blasts rows.
    log.name = ('Probe:GC,MUSiCC')
    log.info('Processing blast match sequences for GC%, and the seq hits for MUSiCC')
    musicc_re = generate_musicc_regex()
    for header, seq in read_fasta(probes_file):
        qid = header.replace('>','')
        rows = rows_by_qid.get(qid)
        if not rows:
            continue
        log.info(f'Processing probe seq id: "{qid}"')
        if qid not in probes_gc:
            log.debug(f' ... Calc GC% on "{qid}"')
            probes_gc[qid] = pct_gc(seq)
        for pb in rows:
            pb.append( probes_gc[qid] )
            # the regex is tested against the row's second column
            pb.append( 1 if musicc_re.search(pb[1]) else 0 )

    # Write the matches to a csv file, with the list of fields as header
    blast_probe_file = probes_file.with_suffix('.blasts.csv')
    write_out_csv(blast_probe_file.abspath, [blast_header] + probe_blasts, append=False)

    if not probe_blasts:
        # nothing to import, filter or export without any blast match
        log.notice(f'No blast matches for genome bin: {genome_bin.name}')
        return probes_file

    # Convert Blast list into field:val dict for db import
    log.name = ('Probe:BlastListtPrepImport')
    log.info('Converting list of blast hits to dict for import to db')
    probe_fields = blast_header
    if len(probe_fields) == len(probe_blasts[0]):
        pseqs = [dict(zip(probe_fields, vals)) for vals in probe_blasts]
        log.debug(f'len pseqs: {len(pseqs)}')

        # import blast matches to cluster database
        log.name = 'Probe:ImportBlast'
        db_name = DB_CFG.get('clusterdb').get('name')
        clust_db = working_dir / '_'.join([cluster_id, db_name])
        log.info(f'Importing blast matches to db "{clust_db}"')
        import_blasts_to_db(pseqs, db_name=clust_db.abspath)

        # Filter resulting table to limits in CONFIG
        log.name = 'Probe:FilterView'
        filter_probe_seqs(clust_db.abspath, cluster_id)

        # Export the final probe sets from the filtered view
        log.name = 'Probe:ExportFinals'
        final_probe_amount = CONFIG.get('general').get('final_probe_amount')
        log.debug(f' ... final_probe_amount {final_probe_amount}')
        export_final_sets(clust_db.abspath, cluster_id, final_probe_amount=final_probe_amount)

    else:
        log.notice(f'Number of fieldnames({len(probe_fields)}) not equal to '
                   f'number of values({len(probe_blasts[0])})!')
    return probes_file