def get_avail_accounts(parentdir=None, save=False):
    """Query slurm with the sshare command to determine available accounts.

    If called with parentdir=None, return all available accounts.
        - Meant to be called from the command line outside of the pipeline. See also sys.argv input.
    If called with parentdir='choose', allow the user to choose accounts.
        - Meant to be called from the command line outside of the pipeline. See also sys.argv input.
    If called with save=True, confirm each account with the user and save a .pkl file in parentdir.
        - save=True is only used when called from 00_start.py

    Returns a list of accounts across which to balance the queue.
    """
    if parentdir is not None and save is False:
        # if the accounts have already been chosen, just return them right away
        # keep 'save is False' so 00_start can overwrite a previous pkl and skip here
        pkl = os.path.join(parentdir, 'accounts.pkl')
        if os.path.exists(pkl):
            return pklload(pkl)

    # get a list of all available accounts
    acctout = subprocess.check_output([
        shutil.which('sshare'),
        '-U', '--user', os.environ['USER'], '--format=Account'
    ]).decode('utf-8').split('\n')
    accts = [acct.split()[0].split("_")[0] for acct in acctout if '_cpu' in acct]

    # for running outside of the pipeline:
    if parentdir is None:
        # to manually run on the command line, using all accounts (default + RAC)
        return accts
    elif parentdir == 'choose':
        # to manually run on the command line, choosing accounts
        return choose_accounts(accts)

    # save if necessary
    if save is True:
        # called from 00_start.py
        keep = choose_accounts(accts)
        pkldump(keep, os.path.join(parentdir, 'accounts.pkl'))
        # no return necessary for 00_start.py
        return

    return accts
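
# A minimal usage sketch (an assumption for illustration, not part of the
# pipeline): how get_avail_accounts() might be driven outside the pipeline,
# per the docstring above. pklload/choose_accounts are the helpers the
# function itself references.
def _example_get_avail_accounts():
    """Illustrative only."""
    all_accounts = get_avail_accounts()              # every account (default + RAC)
    chosen = get_avail_accounts(parentdir='choose')  # interactively choose accounts
    return all_accounts, chosen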
-g --cut_window_size 5 --cut_mean_quality 30 --n_base_limit 20 --length_required 75 \
-h %(html)s.html --cut_by_quality3 --thread 16 --json %(json)s.json \
%(adaptorflag)s > %(logfile)s
''' % locals()
        newtext = newtext + text

    suffix = '''# once finished, map using bwa mem
python $HOME/gatk_pipeline/02_bwa-map_view_sort_index_flagstat.py %(parentdir)s %(samp)s
''' % locals()

    text = header + newtext + suffix
    filE = op.join(shtrimDIR, '%(pool)s-%(samp)s-trim.sh' % locals())
    shfiles.append(filE)
    with open(filE, 'w') as o:
        o.write("%s" % text)
    pkldump(samp2_r1r2out, op.join(pooldir, 'samp2_r1r2out.pkl'))

    print('\tshcount =', len(shfiles))
    print('\tshdir = ', shtrimDIR)

    # sbatch the files
    for sh in shfiles:
        os.chdir(op.dirname(sh))  # want sbatch outfiles in same folder as sh file
        print('\tshfile=', sh)
        subprocess.call([shutil.which('sbatch'), sh])
        time.sleep(2)
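
# Hedged sketch (illustrative, not part of the pipeline): subprocess.call()
# above discards sbatch's stdout. If job IDs are wanted (e.g., to build
# --dependency chains), the "Submitted batch job <id>" line that sbatch prints
# can be captured instead. The helper name here is hypothetical.
def _sbatch_and_get_jobid(sh):
    """Submit `sh` with sbatch and return the SLURM job ID as a string."""
    out = subprocess.check_output([shutil.which('sbatch'), sh]).decode('utf-8')
    return out.strip().split()[-1]  # last token of "Submitted batch job 123456"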
module load bedtools/2.27.1

echo -e "\\ncreating coordfile"
bedtools bamtobed -i {sortfile} > {coordfile}
''')

    # get bwatext
    bwatext = ''
    sortfiles = []
    for r1, r2 in r1r2outs:
        sortfile, text = getbwatext(r1, r2)
        bwatext = bwatext + text
        sortfiles.append(sortfile)
    pkldump(sortfiles, op.join(pooldir, '%s_sortfiles.pkl' % samp))

    # send it off
    email_text = get_email_info(parentdir, '02')
    text = f'''#!/bin/bash
#SBATCH --time=23:59:00
#SBATCH --mem=55000M
#SBATCH --nodes=1
#SBATCH --ntasks=32
#SBATCH --cpus-per-task=1
#SBATCH --job-name={pool}-{samp}-bwa
#SBATCH --output={pool}-{samp}-bwa_%j.out
{email_text}

{bwatext}
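
# Hedged, illustrative sketch: a plausible next step is writing the completed
# sbatch text to a .sh file and submitting it, mirroring the trim step above.
# The helper and file naming are assumptions, not the pipeline's own code.
def _write_and_submit_bwa(text, shdir, pool, samp):
    """Write `text` to {pool}-{samp}-bwa.sh in `shdir` and sbatch it."""
    filE = op.join(shdir, '%s-%s-bwa.sh' % (pool, samp))
    with open(filE, 'w') as o:
        o.write(text)
    os.chdir(op.dirname(filE))  # keep sbatch .out files next to the .sh file
    subprocess.call([shutil.which('sbatch'), filE])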
def get_pars():
    choices = ['all', 'fail', 'begin', 'end', 'pipeline-finish']
    parser = argparse.ArgumentParser(
        description=mytext,
        add_help=False,
        formatter_class=argparse.RawTextHelpFormatter)
    requiredNAMED = parser.add_argument_group('required arguments')
    requiredNAMED.add_argument("-p",
                               required=True,
                               default=argparse.SUPPRESS,
                               dest="parentdir",
                               type=str,
                               help="/path/to/directory/with/fastq.gz-files/")
    parser.add_argument(
        "-e",
        required=False,
        dest="email",
        help='''the email address you would like to have notifications sent to''')
    parser.add_argument(
        "-n",
        default=None,
        nargs='+',
        required=False,
        dest="email_options",
        help='''the type(s) of email notifications you would like to receive from
the pipeline. Requires --email-address. These options are used to fill
out the #SBATCH flags. Must be one (or multiple) of %s
(default: None)''' % [x for x in choices])
    parser.add_argument(
        "-maf",
        required=False,
        dest="maf",
        help='''At the end of the pipeline, VCF files will be filtered for MAF.
If the pipeline is run on a single population/pool, the user can set
MAF to 0.0 so as to filter variants based on global allele frequency
across populations/pools at a later time. (If the number of
sample_names in a pool == 1, then default maf = 0; otherwise
default maf = 1/sum(ploidy column).)''')
    parser.add_argument(
        '--translate',
        required=False,
        action='store_true',
        dest="translate",
        help='''Boolean: true if used, false otherwise. If a stitched genome is
used for mapping, this option will look for a ref.order file in the
same directory as the ref.fasta - where ref is the basename of the
ref.fasta (without the .fasta). The pipeline will use this .order file
to translate mapped positions to unstitched positions at the end of
the pipeline while filtering. Positions in the .order file are assumed
to use 1-based indexing. Assumes the .order file has no header and is
of the format (contig name from the unstitched genome; start/stop are
positions in the stitched genome):
ref_scaffold<tab>contig_name<tab>start_pos<tab>stop_pos<tab>contig_length
(default: False)''')
    parser.add_argument(
        '--rm_repeats',
        required=False,
        action='store_true',
        dest='repeats',
        help='''Boolean: true if used, false otherwise. If repeat regions are
available, remove SNPs that fall within these regions from the final
SNP table and write them to a REPEATS table. This option will look for
a .txt file in the same directory as the ref.fasta. Assumes the
filename is of the form: ref_repeats.txt - where ref is the basename
of the ref.fasta (without the .fasta). This file should use 1-based
indexing and should be located in the same directory as the reference.
The file should have a header ('CHROM', 'start', 'stop'). The CHROM
column can contain names from the reference (if using an unstitched
reference), or names of contigs that were stitched to form the
reference. If using a stitched genome, --translate is required.
(default: False)''')
    parser.add_argument(
        '--rm_paralogs',
        required=False,
        action='store_true',
        dest='paralogs',
        help='''Boolean: true if used, false otherwise. If candidate sites have
been isolated within the reference where distinct gene copies
(paralogs) map to the same position (and thus create erroneous SNPs),
remove any SNPs that fall on these exact sites and write them to a
PARALOGS file. The pipeline assumes this file is located in the
parentdir, and ends with '_paralog_snps.txt'. This file is
tab-delimited, and must have a column called 'locus' that contains
hyphen-separated CHROM-POS sites for paralogs.
These sites should be found in the current ref.fa being used to call
SNPs (otherwise SNPs cannot be filtered by these sites).
(default: False)''')
    parser.add_argument('-h', '--help',
                        action='help',
                        default=argparse.SUPPRESS,
                        help='Show this help message and exit.\n')
    args = parser.parse_args()

    # trim path
    if args.parentdir.endswith('/'):
        args.parentdir = args.parentdir[:-1]

    # save command
    pkldump(args, op.join(args.parentdir, 'pipeline_start_command.pkl'))

    # assess arguments
    if args.email and args.email_options is None:
        print(Bcolors.FAIL + 'FAIL: --notification-types are required when specifying email' + Bcolors.ENDC)
        print(Bcolors.FAIL + 'FAIL: choices = {%s}\n' % [x for x in choices] + Bcolors.ENDC)
        exit()
    if args.email_options and args.email is None:
        print(Bcolors.FAIL + 'FAIL: specifying --notification-types requires '
              'specifying --email-address\n' + Bcolors.ENDC)
        exit()
    if args.email_options:
        for choice in args.email_options:
            if not choice.lower() in choices:
                print(Bcolors.FAIL +
                      '''FAIL: There can be multiple options, but they must be from the set:''' +
                      Bcolors.ENDC)
                print(Bcolors.FAIL + '''\t%s\n''' % choices + Bcolors.ENDC)
                exit()
    if args.email:
        if '@' not in args.email:
            print(Bcolors.FAIL + 'FAIL: email address does not have an "@" symbol in it, '
                  'please check input\n' + Bcolors.ENDC)
            exit()
        if 'all' in args.email_options:
            args.email_options = ['all']

    # save email
    epkl = {'email': args.email, 'opts': args.email_options}
    pkldump(epkl, op.join(args.parentdir, 'email_opts.pkl'))

    if args.maf:
        pkldump(args.maf, op.join(args.parentdir, 'maf.pkl'))

    if args.repeats:
        text = 'WARN: You have indicated that you want to remove repeats.\n'
        text = text + 'WARN: Make sure --translate is used if using a stitched reference.\n'
        text = text + 'WARN: Otherwise this will cause an error.\n'
        text = text + 'WARN: --rm_repeats assumes that the first column in the repeats file ...\n'
        text = text + 'WARN: ... contains the exact chromosome names found in the ref.fasta, ...\n'
        text = text + 'WARN: ... or, if used with --translate, that the first ...\n'
        text = text + 'WARN: ... column of the repeats file contains names found in the second ...\n'
        text = text + 'WARN: ... column of the ref.order file used to translate positions.'
        print(Bcolors.WARNING + text + Bcolors.ENDC)
        askforinput()

    return args
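
# Hedged usage sketch: one way the flags above might be combined. Paths, the
# script name, and the email address are placeholders; simulating sys.argv
# just makes the example runnable without a shell.
def _example_get_pars():
    """Illustrative only: parse a simulated command line."""
    import sys
    sys.argv = ['00_start-pipeline.py', '-p', '/path/to/fastq_dir',
                '-e', 'user@example.com', '-n', 'fail', 'end',
                '-maf', '0.05', '--translate']
    return get_pars()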
def parse_datatable(data, parentdir, translate, repeats, paralogs):
    """Check some assumptions of datatable.txt; create files and dirs for downstream.

    translate, repeats, and paralogs are booleans. parentdir is a path.
    """
    print(Bcolors.BOLD + '\nReading datatable, getting fastq info' + Bcolors.ENDC)

    # initiate dictionaries for downstream pipeline
    rginfo = {}     # key=samp val=rginfo
    samp2pool = {}  # key=samp val=pool
    poolref = {}    # key=pool val=ref.fa
    ploidy = {}     # key=pool val=dict(key=sample, val=sample_ploidy)
    poolsamps = {}  # key=pool val=sampnames
    f2samp = {}     # key=f val=samp
    f2pool = {}     # key=f val=pool
    adaptors = OrderedDict()  # key=samp val={'r1', 'r2'} val=adaptor
    warning = []    # samples with missing optional RG info (print a warning)
    failing = []    # samples with missing required RG info (print a failure)
    pool2paralogfile = {}  # if --rm_paralogs flagged, store file based on pool
    pool2repeatsfile = {}  # if --rm_repeats flagged, store file based on pool
    pool2translate = {}    # if --translate flagged, store file based on pool

    # make sure there are no blanks where there shouldn't be
    badcols = []
    for column in data.columns:
        if column not in ['rgid', 'rgpu', 'adaptor_1', 'adaptor_2']:
            if data[column].isnull().sum() > 0:
                badcols.append(column)
    if len(badcols) > 0:
        print(Bcolors.FAIL +
              "\tFAIL: Some rows in datatable.txt have blank entries in the following columns: " +
              Bcolors.ENDC)
        for col in badcols:
            print(Bcolors.FAIL + "\tFAIL: %s" % col + Bcolors.ENDC)
        print('exiting 00_start-pipeline.py')
        exit()

    # make sure specific words are not in a pool name
    badnames = []
    for pool in uni(data['pool_name']):
        for keyword in ['SNP', 'REPEAT', 'PARALOG']:
            if keyword in pool:
                badnames.append((pool, keyword))
    if len(badnames) > 0:
        print(Bcolors.FAIL +
              "\tFAIL: Some pool names have characters that could cause errors downstream." +
              Bcolors.ENDC)
        print(Bcolors.FAIL +
              "\tFAIL: Remove the bad characters from pool_names to continue." +
              Bcolors.ENDC)
        for pool, keyword in badnames:
            print(Bcolors.FAIL + "\tFAIL: Remove '%s' from pool_name '%s'."
                  % (keyword, pool) + Bcolors.ENDC)
        print('exiting 00_start-pipeline.py')
        exit()

    # iterate through datatable
    for row in data.index:
        # get variables
        samp = data.loc[row, 'sample_name']
        adaptors[samp] = {
            'r1': data.loc[row, 'adaptor_1'],
            'r2': data.loc[row, 'adaptor_2']
        }
        pool = data.loc[row, 'pool_name']
        pooldir = op.join(parentdir, pool)
        print('\t{}\tsamp = {}\tpool = {}'.format(row, samp, pool))
        if pool not in poolsamps:
            poolsamps[pool] = []
        if samp not in poolsamps[pool]:
            poolsamps[pool].append(samp)
        if samp in samp2pool:
            if samp2pool[samp] != pool:
                print(Bcolors.FAIL + 'FAIL: there are duplicate sample names with '
                      'different pool assignments: %s' % samp + Bcolors.ENDC)
                print('exiting')
                exit()
        samp2pool[samp] = pool

        # get ploidy info
        if pool not in ploidy:
            ploidy[pool] = {}
        if samp in ploidy[pool].keys():
            if ploidy[pool][samp] != int(data.loc[row, 'ploidy']):
                text = "FAIL: the ploidy values for sample_name '%s' are not the same" % samp
                print(Bcolors.FAIL + text + Bcolors.ENDC)
                exit()
        ploidy[pool][samp] = int(data.loc[row, 'ploidy'])

        # get ref.fasta info
        ref = data.loc[row, 'ref']
        if pool in poolref:
            # make sure each row for a pool specifies the same reference.fa
            if poolref[pool] != ref:
                text = "FAIL: Ref genome for samples in %s pool seems to have different paths in datatable" % pool
                print(Bcolors.FAIL + text + Bcolors.ENDC)
                print('exiting 00_start-pipeline.py')
                exit()
        else:
            # check assumptions about ref
            poolref[pool] = check_ref_assumptions(samp, ref)

        # handle RG info
        rginfo[samp] = {}
        # required RG info
        for col in ['rglb', 'rgpl', 'rgsm']:  # rg info columns
            if not data.loc[row, col] == data.loc[row, col]:  # if nan (NaN != NaN)
                failing.append('%s\t%s' % (samp, col))
            rginfo[samp][col] = data.loc[row, col]
        # optional RG info
        for col in ['rgid', 'rgpu']:
            if data.loc[row, col] != data.loc[row, col]:  # if nan (NaN != NaN)
                rginfo[samp][col] = None
                if samp not in warning:
                    warning.append(samp)
            else:
                rginfo[samp][col] = data.loc[row, col]

        # map between file and pool/samp
        for f in [data.loc[row, 'file_name_r1'], data.loc[row, 'file_name_r2']]:
            f2pool[f] = pool
            f2samp[op.join(pooldir, f)] = samp

    # handle --rm_paralogs, --translate, --rm_repeats
    for pool in uni(data['pool_name']):
        # handle translating stitched genome to unstitched positions
        pool2translate[pool] = handle_translate(translate, pool2translate,
                                                poolref[pool], data, pool)
        # handle removing SNPs from repeat regions
        pool2repeatsfile[pool] = handle_repeats(repeats, pool2repeatsfile,
                                                poolref[pool], data, pool)
        # handle removing paralogs
        pool2paralogfile[pool] = handle_paralogs(paralogs, pool2paralogfile,
                                                 data, pool, parentdir)

    # handle fails for rm_repeats/translate/rm_paralogs
    handle_dict_fails(pool2repeatsfile, pool2translate, pool2paralogfile,
                      repeats, translate, paralogs, data, parentdir)

    # RG info failing/warnings
    handle_rg_fails(failing, warning, parentdir, data)

    pkldump(pool2repeatsfile, op.join(parentdir, 'repeat_regions.pkl'))
    pkldump(pool2paralogfile, op.join(parentdir, 'paralog_snps.pkl'))
    pkldump(pool2translate, op.join(parentdir, 'translate_snps.pkl'))
    pkldump(rginfo, op.join(parentdir, 'rginfo.pkl'))
    pkldump(ploidy, op.join(parentdir, 'ploidy.pkl'))
    pkldump(f2samp, op.join(parentdir, 'f2samp.pkl'))
    pkldump(poolsamps, op.join(parentdir, 'poolsamps.pkl'))
    pkldump(poolref, op.join(parentdir, 'poolref.pkl'))
    pkldump(adaptors, op.join(parentdir, 'adaptors.pkl'))
    pkldump(samp2pool, op.join(parentdir, 'samp2pool.pkl'))

    return f2pool, poolref
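
# Hedged usage sketch: parse_datatable() takes the pandas DataFrame read from
# datatable.txt plus the boolean flags parsed by get_pars(). The wiring below
# is an assumption about how 00_start calls it, for illustration only.
def _example_parse_datatable(args):
    """Illustrative only: read datatable.txt, then run the assumption checks."""
    data = pd.read_csv(op.join(args.parentdir, 'datatable.txt'), sep='\t')
    f2pool, poolref = parse_datatable(data, args.parentdir,
                                      args.translate, args.repeats, args.paralogs)
    return f2pool, poolref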
def read_datatable(parentdir):
    # read in the datatable, save info for later
    datatable = op.join(parentdir, 'datatable.txt')
    if not op.exists(datatable):
        print(Bcolors.FAIL + '''FAIL: the datatable is not in the necessary path: %s
FAIL: exiting 00_start-gatk_pipeline.py''' % datatable + Bcolors.ENDC)
        sys.exit(3)
    print(Bcolors.BOLD + 'reading datatable, getting fastq info' + Bcolors.ENDC)
    data = pd.read_csv(datatable, sep='\t')

    rginfo = {}     # key=sampname val=rginfo
    samp2pool = {}  # key=samp val=pool
    poolref = {}    # key=pool val=ref.fa
    ploidy = {}     # key=samp val=ploidy
    poolsamps = {}  # key=pool val=sampnames
    f2samp = {}     # key=f val=samp
    f2pool = {}     # key=f val=pool
    adaptors = {}   # key=samp val={'r1', 'r2'} val=adaptor

    for row in data.index:
        samp = data.loc[row, 'sample_name']
        adaptors[samp] = {'r1': data.loc[row, 'adaptor_1'],
                          'r2': data.loc[row, 'adaptor_2']}
        pool = data.loc[row, 'pool_name']
        pooldir = op.join(parentdir, pool)
        print('\t{}\tsamp = {}\tpool = {}'.format(row, samp, pool))
        if pool not in poolsamps:
            poolsamps[pool] = []
        if samp not in poolsamps[pool]:
            poolsamps[pool].append(samp)
        if samp in samp2pool:
            if samp2pool[samp] != pool:
                print(Bcolors.FAIL + 'FAIL: there are duplicate sample names with '
                      'different pool assignments: %s' % samp + Bcolors.ENDC)
                print('exiting')
                exit()
        samp2pool[samp] = pool

        df = data[data['pool_name'] == pool].copy()
        if not luni(df['ploidy']) == 1:
            print(Bcolors.WARNING +
                  "The ploidy values for some elements with pool name '%s' are not the same." % pool +
                  "\n\tHere are the ploidy values: %s" % uni(df['ploidy']) +
                  Bcolors.ENDC)
            askforinput()
        if samp not in ploidy:
            ploidy[samp] = data.loc[row, 'ploidy']

        if pool in poolref:
            if not poolref[pool] == data.loc[row, 'ref']:
                print("ref genome for samples in %s pool seems to have different paths in datatable.txt" % pool)
                sys.exit(1)
        else:
            ref = data.loc[row, 'ref']
            if not op.exists(ref):
                print('ref for %s does not exist in path: %s' % (samp, ref))
                print('exiting 00_start-gatk_pipeline.py')
                exit()
            # make sure the bwa/samtools/gatk index files exist alongside the ref
            needed = []
            for suffix in ['.dict', '.amb', '.ann', '.bwt', '.fai', '.pac', '.sa']:
                refext = ref + suffix if suffix != '.dict' else ref.split('.fa')[0] + suffix
                if not op.exists(refext):
                    needed.append(refext)
            if len(needed) > 0:
                print(Bcolors.FAIL +
                      'FAIL: the following extensions of the reference are needed to continue, '
                      'please create these files' + Bcolors.ENDC)
                for n in needed:
                    print(Bcolors.FAIL + n + Bcolors.ENDC)
                print('exiting')
                exit()
            printneeded = False
            intdir = op.join(op.dirname(ref), 'intervals')
            if not op.exists(intdir):
                printneeded = True
            elif len([f for f in fs(intdir) if '.list' in f]) == 0:
                printneeded = True
            if printneeded is True:
                print(Bcolors.FAIL +
                      'FAIL: either the intervals dir does not exist or there are no interval.list files'
                      '\nFAIL: intdir should be here: %s' % intdir + Bcolors.ENDC)
                exit()
            poolref[pool] = ref

        rginfo[samp] = {}
        for col in ['rglb', 'rgpl', 'rgsm']:  # rg info columns
            rginfo[samp][col] = data.loc[row, col]
        for f in [data.loc[row, 'file_name_r1'], data.loc[row, 'file_name_r2']]:
            if "__" in f:
                print(Bcolors.BOLD + Bcolors.FAIL +
                      "FAIL: file names cannot have double underscores; replace __ with _ (single)" +
                      Bcolors.ENDC)
                exit()
            f2pool[f] = pool
            f2samp[op.join(pooldir, f)] = samp

    pkldump(rginfo, op.join(parentdir, 'rginfo.pkl'))
    pkldump(ploidy, op.join(parentdir, 'ploidy.pkl'))
    pkldump(f2samp, op.join(parentdir, 'f2samp.pkl'))
    pkldump(poolsamps, op.join(parentdir, 'poolsamps.pkl'))
    pkldump(poolref, op.join(parentdir, 'poolref.pkl'))
    pkldump(adaptors, op.join(parentdir, 'adaptors.pkl'))
    pkldump(samp2pool, op.join(parentdir, 'samp2pool.pkl'))

    return data, f2pool, poolref
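
# Hedged sketch: downstream steps can recover the dicts that read_datatable()
# pickled above via pklload (the counterpart to pkldump used elsewhere in this
# pipeline). Illustrative only.
def _example_reload_pickles(parentdir):
    """Illustrative only: reload two of the pickled lookup dicts."""
    rginfo = pklload(op.join(parentdir, 'rginfo.pkl'))    # key=samp val=RG info
    poolref = pklload(op.join(parentdir, 'poolref.pkl'))  # key=pool val=ref.fa
    return rginfo, poolref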
def get_pars():
    choices = ['all', 'fail', 'begin', 'end', 'pipeline-finish']
    parser = argparse.ArgumentParser(description=mytext,
                                     add_help=False,
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    requiredNAMED = parser.add_argument_group('required arguments')
    requiredNAMED.add_argument("-p",
                               required=True,
                               default=argparse.SUPPRESS,
                               dest="parentdir",
                               type=str,
                               help="/path/to/directory/with/fastq.gz-files/")
    parser.add_argument("-e",
                        required=False,
                        dest="email",
                        help="the email address you would like to have notifications sent to")
    parser.add_argument("-n",
                        default=None,
                        nargs='+',
                        required=False,
                        dest="email_options",
                        help='''the type(s) of email notifications you would like to receive from
the pipeline. Requires --email-address. These options are used to fill
out the #SBATCH flags. Must be one (or multiple) of %s''' % [x for x in choices])
    parser.add_argument('-h', '--help',
                        action='help',
                        default=argparse.SUPPRESS,
                        help='Show this help message and exit.\n')
    args = parser.parse_args()

    if args.parentdir.endswith('/'):
        args.parentdir = args.parentdir[:-1]

    if args.email and args.email_options is None:
        print(Bcolors.FAIL + 'FAIL: --notification-types are required when specifying email' + Bcolors.ENDC)
        print(Bcolors.FAIL + 'FAIL: choices = {%s}\n' % [x for x in choices] + Bcolors.ENDC)
        exit()
    if args.email_options and args.email is None:
        print(Bcolors.FAIL + 'FAIL: specifying --notification-types requires '
              'specifying --email-address\n' + Bcolors.ENDC)
        exit()
    if args.email_options:
        for choice in args.email_options:
            if not choice.lower() in choices:
                print(Bcolors.FAIL +
                      '''FAIL: There can be multiple options, but they must be from the set:''' +
                      Bcolors.ENDC)
                print(Bcolors.FAIL + '''\t%s\n''' % choices + Bcolors.ENDC)
                exit()
    if args.email:
        if '@' not in args.email:
            print(Bcolors.FAIL + 'FAIL: email address does not have an "@" symbol in it, '
                  'please check input\n' + Bcolors.ENDC)
            exit()
        if 'all' in args.email_options:
            args.email_options = ['all']

    # save email
    epkl = {'email': args.email, 'opts': args.email_options}
    pkldump(epkl, op.join(args.parentdir, 'email_opts.pkl'))

    return args
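
# Hedged sketch: downstream steps presumably reload the email options saved
# above to fill the #SBATCH mail flags; the exact consumer is not shown here,
# so this helper is an assumption for illustration.
def _example_load_email_opts(parentdir):
    """Illustrative only: reload the pickled email options."""
    epkl = pklload(op.join(parentdir, 'email_opts.pkl'))
    return epkl['email'], epkl['opts']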