Example #1
def run_workflow(workflow, working_dir, db_dir, seqfile, include_groups, jobs,
                 min_score, provirus_off, hallmark_required_on_short,
                 max_orf_per_seq, min_length, tmpdir, verbose, profile, dryrun,
                 use_conda_off, snakemake_args):
    ''' Runs the virsorter main function to classify viral sequences

    By default all steps are executed. The "classify" workflow reruns
    only the classify step, skipping the previous steps that are
    computationally heavy. Most snakemake arguments can be appended to
    the command; for more info see 'snakemake --help'. For more
    details, see the project page on GitHub.
    '''
    os.makedirs(working_dir, exist_ok=True)
    config_f = os.path.join(working_dir, 'config.yaml')

    if min_score > 1 or min_score < 0:
        logging.critical('--min-score needs to be between 0 and 1')
        sys.exit(1)
    if min_length < 0:
        logging.critical('--min-length needs to be >= 0')
        sys.exit(1)
    if jobs < 0:
        logging.critical('--jobs needs to be >= 0')
        sys.exit(1)

    if provirus_off:
        provirus = False
    else:
        provirus = True
        max_orf_per_seq = -1

    if workflow == 'classify':
        target_f = '{working_dir}/{tmpdir}/all-fullseq-proba.tsv'.format(
            working_dir=working_dir,
            tmpdir=tmpdir,
        )
        try:
            subprocess.run(['touch', target_f], check=True)
        except subprocess.CalledProcessError as e:
            # log the error without the full traceback, then abort
            logging.critical(e)
            sys.exit(1)

    make_config(
        db_dir=db_dir,
        seqfile=seqfile,
        include_groups=include_groups,
        threads=jobs,
        config_f=config_f,
        provirus=provirus,
        hallmark_required_on_short=hallmark_required_on_short,
        max_orf_per_seq=max_orf_per_seq,
        tmpdir=tmpdir,
        min_length=min_length,
        min_score=min_score,
    )
    config = load_configfile(config_f)

    if db_dir is None:
        db_dir = config['DBDIR']

    cmd = ('snakemake --snakefile {snakefile} --directory {working_dir} '
           '--jobs {jobs} '
           '--configfile {config_file} --conda-prefix {conda_prefix} '
           '--rerun-incomplete {use_conda_off} --nolock --latency-wait 600'
           ' {profile} {dryrun} {verbose} '
           ' {target_rule} '
           ' {args} ').format(
               snakefile=get_snakefile(),
               working_dir=working_dir,
               jobs=jobs,
               config_file=config_f,
               profile='' if
               (profile is None) else '--profile {}'.format(profile),
               dryrun='--dryrun' if dryrun else '',
               use_conda_off='' if use_conda_off else '--use-conda',
               verbose='' if verbose else '--quiet',
               args=' '.join(snakemake_args),
               target_rule='-R {}'.format(workflow)
               if workflow != 'all' else workflow,
               conda_prefix=os.path.join(db_dir, 'conda_envs'))
    logging.info('Executing: %s' % cmd)
    try:
        subprocess.run(cmd, check=True, shell=True)
    except subprocess.CalledProcessError as e:
        # log the error without the full traceback, then abort
        logging.critical(e)
        sys.exit(1)
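
Every example in this collection leans on snakemake's load_configfile, which parses a YAML (or JSON) config file into a plain dict. A minimal round-trip sketch (the file name and keys below are illustrative, not taken from the code above):

import tempfile
from snakemake import load_configfile

with tempfile.NamedTemporaryFile('w', suffix='.yaml', delete=False) as f:
    f.write('DBDIR: /path/to/db\nMIN_LENGTH: 1500\n')  # invented keys
    config_path = f.name

config = load_configfile(config_path)  # returns a plain dict
print(config['DBDIR'], config['MIN_LENGTH'])  # -> /path/to/db 1500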
Example #2
==> FOR SLURM ONLY !!
'''

import os
import sys
from snakemake.utils import read_job_properties
from snakemake import load_configfile

jobscript = sys.argv[-1]
config = sys.argv[1] 
cluster_config = sys.argv[2]
#logger.info(f"INFO: {jobscript} {config} {cluster_config}")

#read_job_properties reads the job properties defined in a snakemake jobscript and returns a dict containing information about the job
job_properties = read_job_properties(jobscript)
config_properties = load_configfile(config)

cluster_properties = load_configfile(cluster_config)

rule = job_properties['rule']
jobid = job_properties['jobid']
log = rule

#logger.info("INFO job properties:")
#logger.info(job_properties)
#logger.info("INFO cluster properties : ")
#logger.info(cluster_properties)

#"cluster": {"cpus-per-task": 4, "ntasks": 1, "mem-per-cpu": "2", "partition": "normal", "output": "logs/stdout/run_flye/fastq=5percentB1-1", "error": "logs/error/run_flye/fastq=5percentB1-1"}}

# recover wildcards into the variables fastq, assemblers, busco_step
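
The script is truncated at this point. A hedged sketch of the wildcard recovery the last comment announces: the wildcard names come from that comment and the log paths mirror the "cluster" dict shown above, but none of this is the recovered original.

wildcards = job_properties.get('wildcards', {})
fastq = wildcards.get('fastq', '')
assemblers = wildcards.get('assemblers', '')
busco_step = wildcards.get('busco_step', '')
# per-job log paths shaped like the "output"/"error" entries above
log_out = 'logs/stdout/{}/fastq={}'.format(rule, fastq)
log_err = 'logs/error/{}/fastq={}'.format(rule, fastq)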
Example #3
def run_workflow(workflow, working_dir, db_dir, seqfile, include_groups, jobs,
                 min_score, hallmark_required, hallmark_required_on_short,
                 viral_gene_required, provirus_off, max_orf_per_seq,
                 min_length, prep_for_dramv, tmpdir, rm_tmpdir, verbose,
                 profile, dryrun, use_conda_off, snakemake_args, label,
                 keep_original_seq, high_confidence_only, exclude_lt2gene,
                 seqname_suffix_off, viral_gene_enrich_off):
    ''' Runs the virsorter main function to classify viral sequences

    This includes 3 steps: 1) preprocess, 2) feature extraction, and 3)
    classify. By default ("all") all steps are executed. The "classify"
    workflow runs only step 3) without the previous steps that are
    computationally heavy, which is useful for rerunning with different
    filtering options (--min-score, --high-confidence-only,
    --hallmark-required, --hallmark-required-on-short,
    --viral-gene-required, --exclude-lt2gene). Most snakemake arguments
    can be appended to the command; for more info see 'snakemake --help'.
    '''

    # hard coded, need to change all "iter-0" to Tmpdir in smk
    tmpdir = 'iter-0'

    os.makedirs(working_dir, exist_ok=True)
    config_f = os.path.join(working_dir, 'config.yaml')

    if min_score > 1 or min_score < 0:
        logging.critical('--min-score needs to be between 0 and 1')
        sys.exit(1)
    if min_length < 0:
        logging.critical('--min-length needs to be >= 0')
        sys.exit(1)
    if jobs < 0:
        logging.critical('--jobs needs to be >= 0')
        sys.exit(1)

    if provirus_off:
        provirus = False
        if max_orf_per_seq != -1 and prep_for_dramv:
            mes = ('--max-orf-per-seq CAN NOT be used with '
                   '--prep-for-dramv; '
                   'outputs with ORFs subsampled are NOT '
                   'compatible with DRAMv')
            logging.critical(mes)
            sys.exit(1)
    else:
        provirus = True
        max_orf_per_seq = -1

    if workflow == 'classify':
        if not os.path.exists(config_f):
            mes = 'No config.yaml detected from previous run'
            logging.critical(mes)
            sys.exit(1)

        config = load_configfile(config_f)
        min_length_prev = config['MIN_LENGTH']
        if min_length != min_length_prev:
            mes = (
                '--min-length has changed from '
                f'{min_length_prev} to {min_length}; '
                'but --min-length has no effect on the classify step; '
                'the whole pipeline has to be rerun if --min-length changes')
            logging.critical(mes)
            sys.exit(1)

        if provirus != config['PROVIRUS']:
            mes = (
                '--provirus-off setting change found; '
                'The whole pipeline has to be rerun if --provirus-off changes')
            logging.critical(mes)
            sys.exit(1)

        target_f = '{working_dir}/{tmpdir}/reclassify.trigger'.format(
            working_dir=working_dir,
            tmpdir=tmpdir,
        )
        try:
            subprocess.run(['touch', target_f], check=True)
        except subprocess.CalledProcessError as e:
            # log the error without the full traceback, then abort
            logging.critical(e)
            sys.exit(1)

        os.rename(config_f, os.path.join(working_dir, 'config.yaml.bak'))

    make_config(
        db_dir=db_dir,
        seqfile=seqfile,
        include_groups=include_groups,
        threads=jobs,
        config_f=config_f,
        provirus=provirus,
        hallmark_required=hallmark_required,
        hallmark_required_on_short=hallmark_required_on_short,
        viral_gene_required=viral_gene_required,
        prep_for_dramv=prep_for_dramv,
        max_orf_per_seq=max_orf_per_seq,
        tmpdir=tmpdir,
        min_length=min_length,
        min_score=min_score,
        label=label,
        keep_original_seq=keep_original_seq,
        high_confidence_only=high_confidence_only,
        exclude_lt2gene=exclude_lt2gene,
        seqname_suffix_off=seqname_suffix_off,
        viral_gene_enrich_off=viral_gene_enrich_off,
    )
    config = load_configfile(config_f)

    if db_dir is None:
        db_dir = config['DBDIR']

    cmd = ('snakemake --snakefile {snakefile} --directory {working_dir} '
           '--jobs {jobs} '
           '--configfile {config_file} '
           '--latency-wait 600 '
           '--rerun-incomplete --nolock '
           ' {conda_frontend} {conda_prefix} {use_conda_off} '
           ' {profile} {dryrun} {verbose} '
           ' {target_rule} '
           ' {args} ').format(
               snakefile=get_snakefile(),
               working_dir=working_dir,
               jobs=jobs,
               config_file=config_f,
               profile='' if
               (profile is None) else '--profile {}'.format(profile),
               dryrun='--dryrun' if dryrun else '',
               use_conda_off='' if use_conda_off else '--use-conda',
               conda_frontend=''
               if use_conda_off else '--conda-frontend mamba',
               verbose='' if verbose else '--quiet',
               args=' '.join(snakemake_args),
               target_rule='-R {}'.format(workflow)
               if workflow != 'all' else workflow,
               conda_prefix='' if use_conda_off else
               '--conda-prefix {}'.format(os.path.join(db_dir, 'conda_envs')))
    logging.info('Executing: %s' % cmd)
    try:
        subprocess.run(cmd, check=True, shell=True)
    except subprocess.CalledProcessError as e:
        # log the error without the full traceback, then abort
        logging.critical(e)
        sys.exit(1)

    to_remove = ['.snakemake']
    for di in to_remove:
        _path = os.path.join(working_dir, di)
        shutil.rmtree(_path, ignore_errors=True)
    if rm_tmpdir:
        to_remove = [tmpdir]
        for di in to_remove:
            _path = os.path.join(working_dir, di)
            shutil.rmtree(_path, ignore_errors=True)
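
A side note on the trigger pattern above: the code shells out to touch so the target's mtime is bumped and Snakemake reruns the classify rules. The same trick works without a subprocess; a dependency-free alternative sketch:

from pathlib import Path

trigger = Path(working_dir) / tmpdir / 'reclassify.trigger'
trigger.parent.mkdir(parents=True, exist_ok=True)
trigger.touch()  # updates mtime, creating the file if it does not exist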
Example #4
def run_snakemake(configfile,
                  debugdag,
                  filegraph,
                  workdir,
                  useconda,
                  procs,
                  skeleton,
                  unlock=None,
                  optionalargs=None):
    try:
        logid = scriptname + '.run_snakemake: '
        if skeleton:
            for subdir in [
                    'SubSnakes', 'RAW', 'GENOMES', 'FASTQ', 'LOGS', 'TMP'
            ]:  # Add RAW for nanopore preprocessing
                makeoutdir(subdir)
            sys.exit(
                'Skeleton directories created, please add files and rerun without --skeleton option'
            )
        else:
            for subdir in ['SubSnakes', 'LOGS',
                           'TMP']:  # Add RAW for nanopore preprocessing
                makeoutdir(subdir)

        subdir = 'SubSnakes'
        config = load_configfile(configfile)
        argslist = list()
        if useconda:
            argslist.append("--use-conda")
        else:
            log.warning(
                logid +
                'You are not making use of conda; be aware that this will most likely not work for the workflows provided in this repository! To change this, append the --use-conda option to your command line call. You can also preinstall all conda environments by appending the --use-conda and --create-envs-only arguments.'
            )
        if debugdag:
            argslist.append("--debug-dag")
        if filegraph:
            argslist.append("--filegraph|dot|display")
        if optionalargs and len(optionalargs) > 0:
            log.debug(logid + 'OPTIONALARGS: ' + str(optionalargs))
            argslist.extend(optionalargs)
            if '--profile' in optionalargs and 'snakes/slurm' in optionalargs:
                makeoutdir('LOGS/SLURM')

        if unlock:
            log.info(logid + 'Unlocking directory')
            jobtorun = 'snakemake --unlock -s {s} --configfile {c}'.format(
                s=os.path.abspath(
                    os.path.join('snakes', 'workflows', 'header.smk')),
                c=configfile)
            log.info(logid + 'RUNNING ' + str(jobtorun))
            job = runjob(jobtorun)
            log.debug(logid + 'JOB CODE ' + str(job))

        preprocess = subworkflows = postprocess = None

        if 'PREPROCESSING' in config:
            preprocess = config['PREPROCESSING'].split(
                ','
            )  # we keep this separate because not all preprocessing steps need extra configuration
            if len(preprocess) == 0 or preprocess[0] == '':
                preprocess = None
        if 'WORKFLOWS' in config:
            subworkflows = config['WORKFLOWS'].split(',')
            if len(subworkflows) == 0 or subworkflows[0] == '':
                subworkflows = None
        if 'POSTPROCESSING' in config:
            postprocess = config['POSTPROCESSING'].split(
                ','
            )  # we keep this separate because not all postprocessing steps need extra configuration
            if len(postprocess) == 0 or postprocess[0] == '':
                postprocess = None

        threads = min(int(config['MAXTHREADS']),
                      procs) if 'MAXTHREADS' in config else procs

        if preprocess:
            try:
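                # accessing config[x] raises KeyError if a step has no
                # config entry; the same pattern guards the two checks below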
                all([config[x] or x == '' for x in preprocess])
            except KeyError:
                log.warning(
                    logid +
                    'Not all required preprocessing steps have configuration in the config file'
                )

        if subworkflows:
            try:
                all([
                    config[x] or x == 'TRIMMING' or x == ''
                    for x in subworkflows
                ])
            except KeyError:
                log.warning(
                    logid +
                    'Not all required subworkflows have configuration in the config file'
                )

        if postprocess:
            try:
                all([config[x] or x == '' for x in postprocess])
            except KeyError:
                log.warning(
                    logid +
                    'Not all required postprocessing steps have configuration in the config file'
                )

        log.debug(logid + 'WORKFLOWS: ' +
                  str([preprocess, subworkflows, postprocess]))
        '''
        Fix conda path if needed
        '''
        condapath = re.compile(r'conda:\s+"')
        '''
        START TO PROCESS
        IF WE NEED TO DOWNLOAD FILES WE DO THIS NOW
        '''

        if preprocess and 'RAW' in preprocess:
            if 'RAW' not in config:
                log.error(
                    logid +
                    'No configuration with key \'RAW\' for file download found. Nothing to do!'
                )
            makeoutdir('FASTQ')
            makeoutdir('TMP')
            preprocess.remove('RAW')
            SAMPLES = download_samples(config)
            log.info(logid + 'PRESAMPLES: ' + str(SAMPLES))
            conditions = get_conditions(
                SAMPLES, config
            )  #[x.split(os.sep) for x in list(set([os.path.dirname(x) for x in samplecond(SAMPLES,config)]))]
            log.info(logid + 'PRECONDITIONS: ' + str(conditions))
            for condition in conditions:
                subconf = NestedDefaultDict()
                subwork = 'RAW'
                listoftools, listofconfigs = create_subworkflow(
                    config, subwork, [condition])
                if listoftools is None:
                    log.warning(logid + 'No entry fits condition ' +
                                str(condition) + ' for preprocessing step ' +
                                str(subwork))
                    continue
                toolenv, toolbin = map(str, listoftools[0])
                subconf.update(listofconfigs[0])
                subname = toolenv + '.smk'
                smkf = os.path.abspath(
                    os.path.join('snakes', 'workflows', 'header.smk'))
                smko = os.path.abspath(
                    os.path.join(
                        subdir, '_'.join([
                            '_'.join(condition), subwork, toolbin,
                            'subsnake.smk'
                        ])))
                if os.path.exists(smko):
                    os.rename(smko, smko + '.bak')
                with open(smko, 'a') as smkout:
                    with open(smkf, 'r') as smk:
                        smkout.write(
                            re.sub(condapath, 'conda:  "../', smk.read()))
                    smkout.write('\n\n')

                smkf = os.path.abspath(
                    os.path.join('snakes', 'workflows', subname))
                with open(smko, 'a') as smkout:
                    with open(smkf, 'r') as smk:
                        smkout.write(
                            re.sub(condapath, 'conda:  "../', smk.read()))
                    smkout.write('\n\n')

                confo = os.path.abspath(
                    os.path.join(
                        subdir, '_'.join([
                            '_'.join(condition), subwork, toolbin,
                            'subconfig.json'
                        ])))
                if os.path.exists(confo):
                    os.rename(confo, confo + '.bak')
                with open(confo, 'a') as confout:
                    json.dump(subconf, confout)

                jobtorun = 'snakemake -j {t} --use-conda -s {s} --configfile {c} --directory {d} --printshellcmds --show-failed-logs {rest}'.format(
                    t=threads,
                    s=os.path.abspath(
                        os.path.join(
                            subdir, '_'.join([
                                '_'.join(condition), subwork, toolbin,
                                'subsnake.smk'
                            ]))),
                    c=os.path.abspath(
                        os.path.join(
                            subdir, '_'.join([
                                '_'.join(condition), subwork, toolbin,
                                'subconfig.json'
                            ]))),
                    d=workdir,
                    rest=' '.join(argslist))
                log.info(logid + 'RUNNING ' + str(jobtorun))
                job = runjob(jobtorun)
                log.debug(logid + 'JOB CODE ' + str(job))
        '''
        ONCE FILES ARE DOWNLOAD WE CAN START PROCESSING
        '''

        SAMPLES = get_samples(config)
        log.info(logid + 'SAMPLES: ' + str(SAMPLES))
        conditions = get_conditions(
            SAMPLES, config
        )  #[x.split(os.sep) for x in list(set([os.path.dirname(x) for x in samplecond(SAMPLES,config)]))]
        log.info(logid + 'CONDITIONS: ' + str(conditions))

        if preprocess:
            log.info(logid + 'STARTING PREPROCESSING')
            if 'QC' in preprocess and 'QC' in config:
                makeoutdir('QC')
            for condition in conditions:
                for subwork in preprocess:
                    subconf = NestedDefaultDict()
                    log.debug(logid + 'PREPROCESS: ' + str(subwork) +
                              ' CONDITION: ' + str(condition))
                    listoftools, listofconfigs = create_subworkflow(
                        config, subwork, [condition])
                    log.debug(logid + str([listoftools, listofconfigs]))
                    if listoftools is None:
                        log.warning(logid + 'No entry fits condition ' +
                                    str(condition) +
                                    ' for preprocessing step ' + str(subwork))
                        continue

                    for i in range(0, len(listoftools)):

                        toolenv, toolbin = map(str, listoftools[i])
                        subconf.update(listofconfigs[i])
                        subsamples = list(set(sampleslong(subconf)))
                        subname = toolenv + '.smk'
                        log.debug(logid + 'PREPROCESS: ' + str(
                            [toolenv, subname, condition, subsamples, subconf])
                                  )

                        smkf = os.path.abspath(
                            os.path.join('snakes', 'workflows', 'header.smk'))
                        smko = os.path.abspath(
                            os.path.join(
                                subdir, '_'.join([
                                    '_'.join(condition), 'pre_' + subwork,
                                    toolbin, 'subsnake.smk'
                                ])))
                        if os.path.exists(smko):
                            os.rename(smko, smko + '.bak')
                        with open(smko, 'a') as smkout:
                            with open(smkf, 'r') as smk:
                                smkout.write(
                                    re.sub(condapath, 'conda:  "../',
                                           smk.read()))
                            smkout.write('\n\n')

                        if subwork == 'QC':
                            subname = toolenv + '_raw.smk'

                        smkf = os.path.abspath(
                            os.path.join('snakes', 'workflows', subname))
                        with open(smko, 'a') as smkout:
                            with open(smkf, 'r') as smk:
                                smkout.write(
                                    re.sub(condapath, 'conda:  "../',
                                           smk.read()))
                            smkout.write('\n\n')

                        confo = os.path.abspath(
                            os.path.join(
                                subdir, '_'.join([
                                    '_'.join(condition), 'pre_' + subwork,
                                    toolbin, 'subconfig.json'
                                ])))
                        if os.path.exists(confo):
                            os.rename(confo, confo + '.bak')
                        with open(confo, 'a') as confout:
                            json.dump(subconf, confout)

                        jobtorun = 'snakemake -j {t} --use-conda -s {s} --configfile {c} --directory {d} --printshellcmds --show-failed-logs {rest}'.format(
                            t=threads,
                            s=os.path.abspath(
                                os.path.join(
                                    subdir, '_'.join([
                                        '_'.join(condition), 'pre_' + subwork,
                                        toolbin, 'subsnake.smk'
                                    ]))),
                            c=os.path.abspath(
                                os.path.join(
                                    subdir, '_'.join([
                                        '_'.join(condition), 'pre_' + subwork,
                                        toolbin, 'subconfig.json'
                                    ]))),
                            d=workdir,
                            rest=' '.join(argslist))
                        log.info(logid + 'RUNNING ' + str(jobtorun))
                        job = runjob(jobtorun)
                        log.debug(logid + 'JOB CODE ' + str(job))

        else:
            log.warning(
                logid +
                'No preprocessing workflows defined! Continuing with workflows!'
            )

        if subworkflows:
            log.info(logid + 'STARTING PROCESSING')
            for condition in conditions:
                smkf = os.path.abspath(
                    os.path.join('snakes', 'workflows', 'header.smk'))
                smko = os.path.abspath(
                    os.path.join(
                        subdir, '_'.join(['_'.join(condition),
                                          'subsnake.smk'])))
                if os.path.exists(smko):
                    os.rename(smko, smko + '.bak')
                with open(smko, 'a') as smkout:
                    with open(smkf, 'r') as smk:
                        smkout.write(
                            re.sub(condapath, 'conda:  "../', smk.read()))
                    smkout.write('\n\n')

                if 'QC' in subworkflows and 'QC' in config:
                    makeoutdir('QC')
                    if 'MAPPING' in subworkflows:
                        with open(smko, 'a') as smkout:
                            smkout.write(
                                'rule themall:\n\tinput: expand("DONE/{file}_mapped",file=samplecond(SAMPLES,config))\n\n'
                            )

                        smkf = os.path.abspath(
                            os.path.join('snakes', 'workflows', 'multiqc.smk'))
                        with open(smko, 'a') as smkout:
                            with open(smkf, 'r') as smk:
                                smkout.write(
                                    re.sub(condapath, 'conda:  "../',
                                           smk.read()))
                            smkout.write('\n\n')

                if 'MAPPING' in subworkflows and 'TRIMMING' not in subworkflows:
                    log.info(
                        logid +
                        'Simulating read trimming as trimming is not part of the workflow!'
                    )
                    makeoutdir('TRIMMED_FASTQ')
                    smkf = os.path.abspath(
                        os.path.join('snakes', 'workflows',
                                     'simulatetrim.smk'))
                    with open(smko, 'a') as smkout:
                        with open(smkf, 'r') as smk:
                            smkout.write(
                                re.sub(condapath, 'conda:  "../', smk.read()))
                        smkout.write('\n\n')

                subconf = NestedDefaultDict()
                for subwork in subworkflows:
                    log.info(logid + 'PREPARING ' + str(subwork))
                    listoftools, listofconfigs = create_subworkflow(
                        config, subwork, [condition])
                    for i in range(0, len(listoftools)):
                        toolenv, toolbin = map(str, listoftools[i])
                        subconf.update(listofconfigs[i])
                        subsamples = list(set(sampleslong(subconf)))
                        subname = toolenv + '.smk'
                        log.debug(logid + 'SUBWORKFLOW: ' + str([
                            subwork, toolenv, subname, condition, subsamples,
                            subconf
                        ]))

                        if subwork == 'QC' and 'TRIMMING' in subworkflows and 'MAPPING' not in subworkflows:
                            subname = toolenv + '_trim.smk'

                        if subwork == 'QC' and 'TRIMMING' not in subworkflows and 'MAPPING' not in subworkflows:
                            subname = toolenv + '_raw.smk'

                        smkf = os.path.abspath(
                            os.path.join('snakes', 'workflows', subname))
                        with open(smko, 'a') as smkout:
                            with open(smkf, 'r') as smk:
                                smkout.write(
                                    re.sub(condapath, 'conda:  "../',
                                           smk.read()))
                            smkout.write('\n\n')

                if 'MAPPING' in subworkflows:
                    smkf = os.path.abspath(
                        os.path.join('snakes', 'workflows', 'mapping.smk'))
                    with open(smko, 'a') as smkout:
                        with open(smkf, 'r') as smk:
                            smkout.write(
                                re.sub(condapath, 'conda:  "../', smk.read()))
                        smkout.write('\n\n')

                confo = os.path.abspath(
                    os.path.join(
                        subdir,
                        '_'.join(['_'.join(condition), 'subconfig.json'])))
                if os.path.exists(confo):
                    os.rename(confo, confo + '.bak')
                with open(confo, 'a') as confout:
                    json.dump(subconf, confout)

            for condition in conditions:
                log.info(logid + 'Starting workflows for condition ' +
                         str(condition))
                jobtorun = 'snakemake -j {t} -s {s} --configfile {c} --directory {d} --printshellcmds --show-failed-logs {rest}'.format(
                    t=threads,
                    s=os.path.abspath(
                        os.path.join(
                            subdir,
                            '_'.join(['_'.join(condition), 'subsnake.smk']))),
                    c=os.path.abspath(
                        os.path.join(
                            subdir,
                            '_'.join(['_'.join(condition),
                                      'subconfig.json']))),
                    d=workdir,
                    rest=' '.join(argslist))
                log.info(logid + 'RUNNING WORKFLOW ' + str(jobtorun))
                job = runjob(jobtorun)
                log.debug(logid + 'JOB CODE ' + str(job))

        else:
            log.warning(logid + 'No subworkflows defined! Nothing to do!')

        if postprocess:
            log.info(logid + 'STARTING POSTPROCESSING WITH SAMPLES ' +
                     str(SAMPLES))

            if 'PEAKS' in config and 'PEAKS' in postprocess:
                CLIP = checkclip(SAMPLES, config)
                log.info(logid + 'Running Peak finding for ' + CLIP +
                         ' protocol')

            for condition in conditions:
                subconf = NestedDefaultDict()
                for subwork in postprocess:
                    if any(subwork == x for x in ['DE', 'DEU', 'DAS']):
                        continue
                    log.debug(logid + 'POSTPROCESS: ' + str(subwork) +
                              ' CONDITION: ' + str(condition))
                    listoftools, listofconfigs = create_subworkflow(
                        config, subwork, [condition])
                    log.debug(logid + str([listoftools, listofconfigs]))
                    if listoftools is None:
                        log.warning(logid + 'No entry fits condition ' +
                                    str(condition) +
                                    ' for postprocessing step ' + str(subwork))
                        continue

                    for i in range(0, len(listoftools)):
                        toolenv, toolbin = map(str, listoftools[i])
                        subconf.update(listofconfigs[i])
                        subname = toolenv + '.smk'
                        subsamples = list(set(sampleslong(subconf)))
                        log.debug(logid + 'POSTPROCESS: ' + str(
                            [toolenv, subname, condition, subsamples, subconf])
                                  )
                        smkf = os.path.abspath(
                            os.path.join('snakes', 'workflows', 'header.smk'))
                        smko = os.path.abspath(
                            os.path.join(
                                subdir, '_'.join([
                                    '_'.join(condition), subwork, toolbin,
                                    'subsnake.smk'
                                ])))
                        if os.path.exists(smko):
                            os.rename(smko, smko + '.bak')
                        with open(smko, 'a') as smkout:
                            with open(smkf, 'r') as smk:
                                smkout.write(
                                    re.sub(condapath, 'conda:  "../',
                                           smk.read()))
                            smkout.write('\n\n')
                        smkf = os.path.abspath(
                            os.path.join('snakes', 'workflows', subname))
                        with open(smko, 'a') as smkout:
                            with open(smkf, 'r') as smk:
                                smkout.write(
                                    re.sub(condapath, 'conda:  "../',
                                           smk.read()))
                            smkout.write('\n\n')

                        confo = os.path.abspath(
                            os.path.join(
                                subdir, '_'.join([
                                    '_'.join(condition), subwork, toolbin,
                                    'subconfig.json'
                                ])))
                        if os.path.exists(confo):
                            os.rename(confo, confo + '.bak')

                        with open(confo, 'a') as confout:
                            json.dump(subconf, confout)

                        jobtorun = 'snakemake -j {t} --use-conda -s {s} --configfile {c} --directory {d} --printshellcmds --show-failed-logs {rest}'.format(
                            t=threads,
                            s=os.path.abspath(
                                os.path.join(
                                    subdir, '_'.join([
                                        '_'.join(condition), subwork, toolbin,
                                        'subsnake.smk'
                                    ]))),
                            c=os.path.abspath(
                                os.path.join(
                                    subdir, '_'.join([
                                        '_'.join(condition), subwork, toolbin,
                                        'subconfig.json'
                                    ]))),
                            d=workdir,
                            rest=' '.join(argslist))
                        log.info(logid + 'RUNNING ' + str(jobtorun))
                        job = runjob(jobtorun)
                        log.debug(logid + 'JOB CODE ' + str(job))

            #THIS SECTION IS FOR DE, DEU, DAS ANALYSIS, WE USE THE CONDITIONS TO MAKE PAIRWISE COMPARISONS
            for analysis in ['DE', 'DEU', 'DAS']:
                if analysis in config and analysis in postprocess:
                    log.info(logid + 'STARTING ' + analysis + ' Analysis...')
                    subwork = analysis
                    subconf = NestedDefaultDict()
                    log.debug(logid + 'SUBWORK: ' + str(subwork) +
                              ' CONDITION: ' + str(conditions))
                    #listoftoolscount, listofconfigscount = create_subworkflow(config, 'COUNTING', conditions) #Counting is now done on per analysis rule to increase freedom for user
                    listoftools, listofconfigs = create_subworkflow(
                        config, subwork, conditions)

                    if listoftools is None:  # or listoftoolscount is None:
                        log.error(logid + 'No entry fits condition ' +
                                  str(conditions) +
                                  ' for postprocessing step ' + str(subwork))

                    for key in config[subwork]['TOOLS']:
                        log.info(logid + '... with Tool: ' + key)
                        toolenv = key
                        toolbin = config[subwork]['TOOLS'][key]
                        #countenv, countbin = map(str,listoftoolscount[0]) #Counting per analysis rule now
                        subconf = NestedDefaultDict()
                        for i in listofconfigs:
                            i[subwork + 'ENV'] = toolenv
                            i[subwork + 'BIN'] = toolbin
                            #i['COUNTBIN'] = 'featureCounts'#This is hard coded where needed for now
                            #i['COUNTENV'] = 'countreads'#This is hard coded where needed for now
                        for i in range(len(listoftools)):
                            subconf = merge_dicts(subconf, listofconfigs[i])

                        #for x in range(0,len(listofconfigscount)): ### muss hier auch noch gefiltert werden?
                        #    subconf = merge_dicts(subconf,listofconfigscount[x])
                        subname = toolenv + '.smk' if toolenv != 'edger' else toolenv + '_' + subwork + '.smk'
                        subsamples = sampleslong(subconf)
                        log.debug(logid + 'POSTPROCESS: ' +
                                  str([toolenv, subname, subsamples, subconf]))

                        smkf = os.path.abspath(
                            os.path.join('snakes', 'workflows', 'header.smk'))
                        smko = os.path.abspath(
                            os.path.join(
                                subdir,
                                '_'.join([subwork, toolenv, 'subsnake.smk'])))
                        if os.path.exists(smko):
                            os.rename(smko, smko + '.bak')
                        with open(smko, 'a') as smkout:
                            with open(smkf, 'r') as smk:
                                smkout.write(
                                    re.sub(condapath, 'conda:  "../',
                                           smk.read()))
                            smkout.write('\n\n')
                        smkf = os.path.abspath(
                            os.path.join('snakes', 'workflows', subname))
                        with open(smko, 'a') as smkout:
                            with open(smkf, 'r') as smk:
                                smkout.write(
                                    re.sub(condapath, 'conda:  "../',
                                           smk.read()))
                            smkout.write('\n\n')

                        confo = os.path.abspath(
                            os.path.join(
                                subdir,
                                '_'.join([subwork, toolenv,
                                          'subconfig.json'])))
                        if os.path.exists(confo):
                            os.rename(confo, confo + '.bak')
                        with open(confo, 'a') as confout:
                            json.dump(subconf, confout)

                        jobtorun = 'snakemake -j {t} --use-conda -s {s} --configfile {c} --directory {d} --printshellcmds --show-failed-logs {rest}'.format(
                            t=threads,
                            s=smko,
                            c=confo,
                            d=workdir,
                            rest=' '.join(argslist))
                        log.info(logid + 'RUNNING ' + str(jobtorun))
                        job = runjob(jobtorun)
                        log.debug(logid + 'JOB CODE ' + str(job))

        else:
            log.warning(logid +
                        'No postprocessing steps defined! Nothing to do!')

        log.info('Workflows executed without error!')

    except Exception as err:
        exc_type, exc_value, exc_tb = sys.exc_info()
        tbe = tb.TracebackException(
            exc_type,
            exc_value,
            exc_tb,
        )
        log.error(''.join(tbe.format()))
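
A small standalone illustration of the condapath rewrite used throughout this example: the header regex redirects relative conda: paths one level up, so subworkflow files written into SubSnakes still resolve the shared environment YAMLs. The sample rule text is invented for the demo:

import re

condapath = re.compile(r'conda:\s+"')
snippet = 'rule map:\n    conda: "envs/mapping.yaml"\n'
print(re.sub(condapath, 'conda:  "../', snippet))
# rule map:
#     conda:  "../envs/mapping.yaml"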
Example #5
if len(sys.argv) < 2:
    level = 'DEBUG'
else:
    level = sys.argv[1]

log = setup_logger(
    name='',
    log_file='stderr',
    logformat='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
    datefmt='%m-%d %H:%M',
    level=level)
logid = 'TESTER: '

config = load_configfile(
    '/home/fall/Work/Tests/SnakemakeTest/SubSnakes/ID_unpaired_std_subconfig.json'
) if len(sys.argv) < 3 else load_configfile(sys.argv[2])
REFERENCE = config['REFERENCE']

file = r'ID/unpaired/std/GSM461177_untreat_paired_subset_r1'
dir = 'Dm6'

SAMPLES = list()
SAMPLES.append(file)

print(logid + 'SAMPLES: ' + str(SAMPLES))

print(logid + 'GENOME: ' + genome(file, config))

print(logid + 'TESTOPTIONS: ' +
      str(tool_params(file, None, config, 'MAPPING')))
Example #6
import os
import sys

from snakemake import load_configfile
from csv import DictReader

config = load_configfile('analysis.yaml')


def get_callers():
    """Get a list of SV callers enabled by a user.
    :returns: (list) of selected SV callers
    """
    callers = []
    for c in config["enable_callers"]:
        if c not in config["callers"]:
            raise AssertionError("SV caller '{}' is not supported!".format(c))
        callers.append(c)
    return callers


def get_filext(fmt):
    """Get file extension(s) given file type/format:
        ['fasta', 'fasta_idx', 'bam', 'bam_idx', 'vcf', 'bcf', 'bed']
    :param fmt: (str) input file format
    :returns: (str) file extension
    """
    if fmt not in config["file_exts"]:
        raise AssertionError("Unknown input file format '{}'.".format(fmt))
    return config["file_exts"][fmt]
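
A hedged usage sketch for the two helpers, assuming analysis.yaml carries enable_callers, callers, and file_exts keys along these lines (values invented for the demo):

# analysis.yaml (illustrative):
# enable_callers: [manta, delly]
# callers: {manta: {}, delly: {}, lumpy: {}}
# file_exts: {fasta: .fasta, bam: .bam, vcf: .vcf}

for caller in get_callers():
    print(caller, get_filext('vcf'))  # e.g. manta .vcf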
Example #7
wrap sbatch jobs based on Snakemake config file
'''
import os
import sys
from snakemake.utils import read_job_properties
from snakemake import load_configfile
import re
# import subprocess

jobscript = sys.argv[-1]
config = sys.argv[1]

job_properties = read_job_properties(jobscript)


config_properties = load_configfile(config)

rule = job_properties['rule']
job_name = '--job-name ' + rule

# jobid = job_properties['jobid']
threads = job_properties['threads']
cpus_per_task = '--cpus-per-task ' + str(threads)

ntasks = '--ntasks 1'

outdir = config_properties['OUTDIR']
logdir = os.path.join(outdir, 'log/cluster')
os.makedirs(logdir, exist_ok=True)
try:
    log = job_properties['params']['log']
except KeyError:
    # fall back to a per-rule log name when the rule defines no params.log
    # (assumption: the truncated original recovered in a similar way)
    log = os.path.join(logdir, rule + '.log')
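
The wrapper is cut off here as well; a hedged sketch of the sbatch submission it appears to build toward, assembled only from pieces the script already defines (an assumption, not the recovered original):

output = '--output ' + os.path.join(logdir, rule + '.out')
error = '--error ' + os.path.join(logdir, rule + '.err')
cmd = ' '.join(['sbatch', job_name, ntasks, cpus_per_task,
                output, error, jobscript])
os.system(cmd)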
Example #8
def run_workflow(workflow, working_dir, db_dir, seqfile, include_groups, jobs,
                 min_score, hallmark_required, hallmark_required_on_short,
                 viral_gene_required, provirus_off, max_orf_per_seq,
                 min_length, tmpdir, rm_tmpdir, verbose, profile, dryrun,
                 use_conda_off, snakemake_args):
    ''' Runs the virsorter main function to classify viral sequences

    This includes 3 steps: 1) preprocess, 2) feature extraction, and 3)
    classify. By default ("all") all steps are executed. The "classify"
    workflow runs only step 3) without the previous steps that are
    computationally heavy, which is useful for rerunning with different
    filtering options (--min-score, --hallmark-required,
    --hallmark-required-on-short, --viral-gene-required). Most snakemake
    arguments can be appended to the command; for more info see
    'snakemake --help'.
    '''

    # hard coded, need to change all "iter-0" to Tmpdir in smk
    tmpdir = 'iter-0'

    os.makedirs(working_dir, exist_ok=True)
    config_f = os.path.join(working_dir, 'config.yaml')

    if min_score > 1 or min_score < 0:
        logging.critical('--min-score needs to be between 0 and 1')
        sys.exit(1)
    if min_length < 0:
        logging.critical('--min-length needs to be >= 0')
        sys.exit(1)
    if jobs < 0:
        logging.critical('--jobs needs to be >= 0')
        sys.exit(1)

    if provirus_off:
        provirus = False
    else:
        provirus = True
        max_orf_per_seq = -1

    if workflow == 'classify':
        target_f = '{working_dir}/{tmpdir}/all-fullseq-proba.tsv'.format(
            working_dir=working_dir,
            tmpdir=tmpdir,
        )
        try:
            subprocess.run(['touch', target_f], check=True)
        except subprocess.CalledProcessError as e:
            # log the error without the full traceback, then abort
            logging.critical(e)
            sys.exit(1)

    make_config(
        db_dir=db_dir,
        seqfile=seqfile,
        include_groups=include_groups,
        threads=jobs,
        config_f=config_f,
        provirus=provirus,
        hallmark_required=hallmark_required,
        hallmark_required_on_short=hallmark_required_on_short,
        viral_gene_required=viral_gene_required,
        max_orf_per_seq=max_orf_per_seq,
        tmpdir=tmpdir,
        min_length=min_length,
        min_score=min_score,
    )
    config = load_configfile(config_f)

    if db_dir is None:
        db_dir = config['DBDIR']

    cmd = ('snakemake --snakefile {snakefile} --directory {working_dir} '
           '--jobs {jobs} '
           '--configfile {config_file} {conda_prefix} '
           '--rerun-incomplete {use_conda_off} --nolock --latency-wait 600'
           ' {profile} {dryrun} {verbose} '
           ' {target_rule} '
           ' {args} ').format(
               snakefile=get_snakefile(),
               working_dir=working_dir,
               jobs=jobs,
               config_file=config_f,
               profile='' if
               (profile is None) else '--profile {}'.format(profile),
               dryrun='--dryrun' if dryrun else '',
               use_conda_off='' if use_conda_off else '--use-conda',
               verbose='' if verbose else '--quiet',
               args=' '.join(snakemake_args),
               target_rule='-R {}'.format(workflow)
               if workflow != 'all' else workflow,
               conda_prefix='' if use_conda_off else
               '--conda-prefix {}'.format(os.path.join(db_dir, 'conda_envs')))
    logging.info('Executing: %s' % cmd)
    try:
        subprocess.run(cmd, check=True, shell=True)
    except subprocess.CalledProcessError as e:
        # log the error without the full traceback, then abort
        logging.critical(e)
        sys.exit(1)

    if rm_tmpdir:
        tmpdir_path = os.path.join(working_dir, tmpdir)
        shutil.rmtree(tmpdir_path, ignore_errors=True)
Example #9
def create_json_config(configfile, append, template, preprocess, workflows, postprocess, ics, refdir, binaries, procs, genomemap, genomes, genomeext, sequencing, annotation, optionalargs=None):
    # CLEANUP
    oldcnf = os.path.abspath(configfile)
    for oldfile in glob.glob(oldcnf):
        shutil.copy2(oldfile,oldfile+'.bak')
        log.warning(logid+'Found old config file '+oldfile+', created backup of old config at '+oldfile+'.bak')

    config = load_configfile(os.path.abspath(template))
    newconf = NestedDefaultDict()
    oldconf = NestedDefaultDict()
    icslist = list()

    todos = ','.join([x for x in [preprocess,workflows,postprocess] if x != '' ]).split(',')
    for x in todos:
        if x not in config:
            log.error(logid+'Key '+str(x)+' not found in template, please check for typos!')
            sys.exit()

    log.info(logid+'Creating config json for steps '+str(todos))

    genmap = defaultdict()
    if genomemap:
        genmap = {key: value for (key, value) in [x.split(':') for x in genomemap.split(',')]}
        log.debug(logid+'GENOMEMAP: '+str(genmap))
    else:
        if not append:
            log.error(logid+'No mapping of sample-ID to genome-ID found, please provide -m option')
            sys.exit()

    gens = defaultdict()
    if genomes:
        gens = {key: value for (key, value) in [x.split(':') for x in genomes.split(',')]}
        log.debug(logid+'GENOMES: '+str(gens))
    else:
        if not append:
            log.error(logid+'No mapping of genome to genome fasta found, please provide -g option')
            sys.exit()

    genext = defaultdict()
    if genomeext:
        genext = {key: value for (key, value) in [x.split(':') for x in genomeext.split(',')]}
        log.debug(logid+'GENOMEEXTENSION: '+str(genext))
    if ics or append:
        if append:
            oldconf = load_configfile(os.path.abspath(os.path.join(configfile)))
            iteration = -1
            icstemp = ''
            for k,v in list_all_keys_of_dict(oldconf['SAMPLES']):
                iteration+=1
                if k == 'last':
                    icslist.append(icstemp[:-1])
                    if iteration >3:
                        icstemp = icstemp.split(':')[0]+':'
                        iteration = -1
                    else:
                        icstemp=''
                        iteration = -1
                else:
                    icstemp+=k+':'
            if ics:
                for x in ics.split(','):
                    if x not in icslist:
                        icslist.append(x)
        else:
            icslist = ics.split(',')
    else:
        log.error(logid+'IdentifierConditionSetting (ics) not defined!')
        sys.exit()

    log.debug(logid+'List of IdentifierConditionSettings: '+str(icslist))

    seqlist = [s.replace(':',',') for s in sequencing.split(',')]

    if not append:
        #newconf.merge(config)
        newconf['PREPROCESSING'] = preprocess
        newconf['WORKFLOWS'] = workflows
        newconf['POSTPROCESSING'] = postprocess
        newconf['REFERENCE'] = refdir
        newconf['BINS'] = binaries
        newconf['MAXTHREADS'] = str(procs)
        newconf['GENOME'] = NestedDefaultDict()

        for k,v in gens.items():
            newconf['GENOME'][str(k)] = str(v)

        for key in ['NAME','SOURCE','SEQUENCING','SAMPLES']:
            for id,condition,setting in [x.split(':') for x in icslist]:
                if key == 'NAME':
                    if genomeext:
                        for k,v in genext.items():
                            if v is None or str(v) == 'None':
                                v = ''
                            newconf[key][id][condition][setting] = str(v)
                    else:
                        newconf[key][id][condition][setting] = config[key]['id']['condition']['setting']
                elif key == 'SOURCE':
                    if genomemap:
                        for k,v in genmap.items():
                            if v in newconf['GENOME']:
                                newconf[key][id][condition][setting] = str(v)
                    else:
                        newconf[key][id][condition][setting] = config[key]['id']['condition']['setting']
                elif key == 'SEQUENCING':
                    if len(seqlist) > 0:
                        newconf[key][id][condition][setting] = deque(seqlist).popleft()
                    else:
                        newconf[key][id][condition][setting] = config[key]['id']['condition']['setting']
                elif key == 'SAMPLES':
                    samplelist = get_samples_from_dir(id, condition, setting, newconf)
                    log.debug(logid+'SAMPLELIST: '+str(samplelist))
                    if len(samplelist) > 0:
                        newconf[key][id][condition][setting] = samplelist
                    else:
                        newconf[key][id][condition][setting] = config[key]['id']['condition']['setting']
                else:
                    newconf[key][id][condition][setting] = config[key]['id']['condition']['setting']

    else:
        #newconf.merge(oldconfig)

        if preprocess and preprocess not in newconf['PREPROCESSING']:
            newconf['PREPROCESSING'] = str.join(',',list(set(str.join(',',[oldconf['PREPROCESSING'],preprocess]).split(','))))
        if workflows and workflows not in newconf['WORKFLOWS']:
            newconf['WORKFLOWS'] = str.join(',',list(set(str.join(',',[oldconf['WORKFLOWS'],workflows]).split(','))))
        if postprocess and postprocess not in newconf['POSTPROCESSING']:
            newconf['POSTPROCESSING'] = str.join(',',list(set(str.join(',',[oldconf['POSTPROCESSING'],postprocess]).split(','))))
        if refdir and refdir != oldconf['REFERENCE']:
            newconf['REFERENCE'] = refdir
        else:
            newconf['REFERENCE'] = str(oldconf['REFERENCE'])
        if binaries and binaries != oldconf['BINS']:
            newconf['BINS'] = binaries
        else:
            newconf['BINS'] = str(oldconf['BINS'])
        if procs and procs != oldconf['MAXTHREADS']:
            newconf['MAXTHREADS'] = str(procs)
        else:
            newconf['MAXTHREADS'] = str(oldconf['MAXTHREADS'])

        log.debug(logid+'GENOMEMAP: '+str(genomemap)+'\t'+str(genmap))

        if genomes and any([x not in newconf['GENOME'] for x in list(gens.keys())]) or any([[x not in newconf['GENOME'][y] for x in gens[y]] for y in gens.keys()]):
            newconf['GENOME'] = NestedDefaultDict()
            newconf['GENOME'].merge(oldconf['GENOME'])
            for k,v in gens.items():
                newconf['GENOME'][str(k)] = str(v)
        else:
            newconf['GENOME'] = str(oldconf['GENOME'])

        log.debug(logid+'GENOMEMAPCONF: '+str(newconf['GENOME']))

        for key in ['NAME','SOURCE','SAMPLES','SEQUENCING']:
            for id,condition,setting in [x.split(':') for x in icslist]:
                if key == 'NAME' or key == 'SOURCE':
                    try:
                        checkkey=getFromDict(oldconf[key],[id,condition,setting])
                    except Exception:
                        checkkey=list()
                    if len(checkkey) > 0:
                        if key == 'NAME':
                            if genomeext:
                                for k,v in genext.items():
                                    if id in [x for x in find_key_for_value(k,genmap)]:
                                        if str(v) != oldconf[key][id][condition][setting]:
                                            newconf[key][id][condition][setting] = str(v)
                                        else:
                                            newconf[key][id][condition][setting] = oldconf[key][id][condition][setting]
                            else:
                                newconf[key][id][condition][setting] = config[key]['id']['condition']['setting']
                        elif key == 'SOURCE':
                            if genomemap:
                                for k,v in genmap.items():
                                    if v in newconf['GENOME']:
                                        if str(v) != str(oldconf[key][id][condition][setting]):
                                            newconf[key][id][condition][setting] = str(v)
                                        else:
                                            newconf[key][id][condition][setting] = oldconf[key][id][condition][setting]
                            else:
                                newconf[key][id][condition][setting] = config[key]['id']['condition']['setting']
                        else:
                            newconf[key][id][condition][setting] = oldconf[key][id][condition][setting]
                    else:
                        newconf[key][id][condition][setting] = config[key]['id']['condition']['setting']
                elif key == 'SAMPLES':
                    samplelist = get_samples_from_dir(id, condition, setting, oldconf)
                    log.debug(logid+'SAMPLELIST: '+str(samplelist))
                    if len(samplelist) > 0:
                        newconf[key][id][condition][setting] = samplelist
                    else:
                        newconf[key][id][condition][setting] = config[key]['id']['condition']['setting']
                else:
                    newconf[key][id][condition][setting] = config[key]['id']['condition']['setting']

        for do in todos:
            if do not in newconf and do in oldconf:
                newconf[do].merge(oldconf[do])

    """Now we replace the placeholders in the template config with the actual ones or update an existing config with new workflows"""

    log.debug(logid+'NEW: '+str(newconf))

    for do in todos:
        if do not in newconf:
            newconf[do].merge(config[do])

    for key in todos:
        log.debug(logid+'OLD: '+str(key)+'\t'+str(config[key]))

        for id,condition,setting in [x.split(':') for x in icslist]:
            if id not in newconf[key]:
                newconf[key][id] = NestedDefaultDict()
                log.debug(logid+'ID: '+str(newconf[key]))
            if condition not in newconf[key][id]:
                newconf[key][id][condition] = NestedDefaultDict()
                log.debug(logid+'Condition: '+str(newconf[key]))
            if setting not in newconf[key][id][condition]:
                newconf[key][id][condition][setting] = NestedDefaultDict()
                log.debug(logid+'SETTING: '+str(newconf[key]))

            if 'id' in newconf[key]:
                newconf[key][id] = newconf[key].pop('id')
                newconf[key][id][condition] = newconf[key][id].pop('condition')
                newconf[key][id][condition][setting] = newconf[key][id][condition].pop('setting')
            else:
                log.debug(logid+'TODO: '+str(key)+'\t'+str(config[key])+'\t'+str(newconf[key]))
                newconf[key][id][condition][setting].update(config[key]['id']['condition']['setting'])

    print_json(newconf,configfile,annotation)
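
For orientation, the ics values this function keys on are comma-separated IdentifierConditionSetting triplets, each colon-delimited; a tiny illustration of the split used throughout (values invented):

ics = 'WT:untreated:std,KO:treated:std'
for id, condition, setting in [x.split(':') for x in ics.split(',')]:
    print(id, condition, setting)
# WT untreated std
# KO treated std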
Example #10
import os
import sys
from collections import defaultdict
from snakemake import load_configfile

config = load_configfile('config.yaml')

# This file contains (1) functions used by the snakemake rules
# and (2) global variables accessed by these functions, e.g.
# the dictionaries that organize samples by patient

##################
# FUNCTIONS
##################


def get_samples_by_patient():
    tumors = defaultdict(dict)
    normals = defaultdict()
    with open(config["samples"], "r") as f:
        for l in f:
            if not l.startswith("PATIENT_ID"):
                info = l.strip().split()
                patient = info[0]
                bam = info[2]
                if info[1] == "tumor":
                    sample = get_sample_name(bam)
                    tumors[patient][sample] = bam
                elif info[1] == "normal":
                    normals[patient] = bam
                else:
                    sys.exit("incorrect samples.txt file")
    # assumed close of the truncated snippet: hand back the per-patient dicts
    return tumors, normals
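
A hedged usage sketch, assuming a whitespace-separated samples file with a PATIENT_ID header row and columns patient / tumor-or-normal / bam (layout inferred from the parsing above; get_sample_name is defined elsewhere in the source file):

# samples.txt (illustrative):
# PATIENT_ID  TYPE    BAM
# P01         tumor   /data/P01_T.bam
# P01         normal  /data/P01_N.bam

tumors, normals = get_samples_by_patient()
for patient, samples in tumors.items():
    print(patient, samples, normals.get(patient))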