def trim_paired(args):
    """
    %prog trim in_dir out_dir

    Quality control (Trimmomatic PE) on paired-end reads: writes one
    *.trim.slurm job file per R1/R2 pair found in in_dir.
    """
    p = OptionParser(trim_paired.__doc__)
    p.add_option('--pattern_r1', default='*_R1.fastq',
                 help='filename pattern for forward reads')
    p.add_option('--pattern_r2', default='*_R2.fastq',
                 help='filename pattern for reverse reads')
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    in_dir, out_dir, = args
    out_path = Path(out_dir)
    if not out_path.exists():
        sys.exit('output dir %s does not exist...' % out_dir)
    # fix: sort both lists so zip() pairs R1 with its matching R2 —
    # glob() order is filesystem-dependent and could silently mispair samples
    r1_fns = sorted(glob('%s/%s' % (in_dir, opts.pattern_r1)))
    r2_fns = sorted(glob('%s/%s' % (in_dir, opts.pattern_r2)))
    for r1_fn, r2_fn in zip(r1_fns, r2_fns):
        r1_path = Path(r1_fn)
        r2_path = Path(r2_fn)
        # job prefix: sample name (everything before the last '_') + '.PE'
        prf = '_'.join(r1_path.name.split('_')[0:-1]) + '.PE'
        print(prf)
        r1_fn_out1 = r1_path.name.replace('R1.fastq', 'trim.R1.fastq')
        r1_fn_out2 = r1_path.name.replace('R1.fastq', 'unpaired.R1.fastq')
        r2_fn_out1 = r2_path.name.replace('R2.fastq', 'trim.R2.fastq')
        r2_fn_out2 = r2_path.name.replace('R2.fastq', 'unpaired.R2.fastq')
        cmd = 'java -jar $TM_HOME/trimmomatic.jar PE -phred33 %s %s %s %s %s %s TRAILING:20 SLIDINGWINDOW:4:20 MINLEN:40' % (
            r1_fn, r2_fn,
            str(out_path / r1_fn_out1), str(out_path / r1_fn_out2),
            str(out_path / r2_fn_out1), str(out_path / r2_fn_out2))
        header = Slurm_header % (10, 10000, prf, prf, prf)
        header += 'ml trimmomatic\n'
        header += cmd
        with open('%s.trim.slurm' % (prf), 'w') as f:
            f.write(header)
def PredictSlurmGPU(args):
    """
    %prog model_name npyPattern("CM*.npy") job_n

    Generate GPU prediction slurm jobs covering all npy files that match
    the pattern, split into job_n groups.
    """
    parser = OptionParser(PredictSlurmGPU.__doc__)
    parser.set_slurm_opts(jn=True)
    opts, args = parser.parse_args(args)
    if len(args) == 0:
        sys.exit(not parser.print_help())
    mn, npy_pattern, jobn, = args
    if opts.prefix == 'myjob':
        print('specify job name prefix!')
        sys.exit()
    npys = glob(npy_pattern)
    print(len(npys))
    for gn, grp in cutlist(npys, int(jobn)):
        # convert the group's end coordinate to an exclusive bound
        st, ed = gn.split('-')
        gn = '%s-%s' % (st, int(ed) + 1)
        cmd = "python -m schnablelab.CNN.Predict Predict %s '%s' %s\n" % (
            mn, npy_pattern, gn)
        opt = '%s.%s' % (opts.prefix, gn)
        slurm_text = Slurm_gpu_header % (opts.time, opts.memory, opt, opt, opt)
        slurm_text += "ml anaconda\nsource activate MCY\n"
        slurm_text += cmd
        with open('%s.gpu.slurm' % opt, 'w') as fh:
            fh.write(slurm_text)
        print('%s.gpu.slurm prediction GPU job file generated!' % opt)
def CallHeightBatch(args):
    """
    %prog imagePattern("CM*.polish.png")

    Generate one height-calling slurm job per polished image file.
    """
    p = OptionParser(CallHeightBatch.__doc__)
    p.set_slurm_opts(array=False)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    pattern, = args
    all_pngs = glob(pattern)
    for i in all_pngs:
        # strip any directory part and the '.polish.png' suffix
        out_prefix = i.split('/')[-1].split('.polish.png')[0]
        jobname = out_prefix + '.Height'
        cmd = 'python -m schnablelab.CNN.CallHeight CallHeight %s %s\n' % (
            i, out_prefix)
        header = Slurm_header % (opts.time, opts.memory, jobname, jobname,
                                 jobname)
        header += "ml anaconda\nsource activate %s\n" % opts.env
        header += cmd
        # with-statement replaces the manual open/close: the slurm file is
        # closed even if write() raises
        with open('%s.CallHeight.slurm' % out_prefix, 'w') as jobfile:
            jobfile.write(header)
        print('%s.CallHeight.slurm call height job file generated!' % jobname)
def trim_single(args):
    """
    %prog trim in_dir out_dir

    Quality control (Trimmomatic SE) on single-end reads: writes one
    *.trim.slurm job file per matching fastq in in_dir.
    """
    # bug fix: the parser was built from trim_paired.__doc__, so this
    # command printed the paired-end usage text instead of its own
    p = OptionParser(trim_single.__doc__)
    p.add_option('--pattern', default='*_Unpaired.fastq',
                 help='filename pattern for all single end reads')
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    in_dir, out_dir, = args
    out_path = Path(out_dir)
    if not out_path.exists():
        sys.exit('output dir %s does not exist...' % out_dir)
    # sorted for deterministic job generation order
    fns = sorted(glob('%s/%s' % (in_dir, opts.pattern)))
    for fn in fns:
        fn_path = Path(fn)
        # job prefix: sample name (everything before the last '_') + '.SE'
        prf = '_'.join(fn_path.name.split('_')[0:-1]) + '.SE'
        print(prf)
        fn_out = fn_path.name.replace('Unpaired.fastq', 'trim.Unpaired.fastq')
        cmd = 'java -jar $TM_HOME/trimmomatic.jar SE -phred33 %s %s TRAILING:20 SLIDINGWINDOW:4:20 MINLEN:40' % (
            fn, str(out_path / fn_out))
        header = Slurm_header % (10, 10000, prf, prf, prf)
        header += 'ml trimmomatic\n'
        header += cmd
        with open('%s.trim.slurm' % (prf), 'w') as f:
            f.write(header)
def Predict(args):
    """
    %prog model_name npy_pattern('CM*.npy')

    Use a trained model to make per-pixel class predictions on the selected
    npy files and save each prediction as a false-colour PNG. The prediction
    data is a numpy array with the same number of columns as the training
    data.
    """
    from keras.models import load_model
    import scipy.misc as sm
    p = OptionParser(Predict.__doc__)
    p.add_option(
        '--range', default='all',
        help="specify the range of the testing images, hcc job range style")
    p.add_option('--opf', default='infer',
                 help="specify the prefix of the output file names")
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())
    model, npy_pattern = args
    # output prefix: derived from the model filename unless given explicitly
    if opts.opf == 'infer':
        opf = model.split('/')[-1].split('.')[0]
    else:
        opf = opts.opf
    npys = glob(npy_pattern)
    if opts.range != 'all':
        lo, hi = opts.range.split('-')
        npys = npys[int(lo):int(hi)]
    print('%s npys will be predicted this time.' % len(npys))
    my_model = load_model(model)
    # per-channel colour tables, one entry per predicted class:
    # 0: background; 1: leaf; 2: stem; 3: panicle
    red_map = {0: 255, 1: 127, 2: 253, 3: 190}
    green_map = {0: 255, 1: 201, 2: 192, 3: 174}
    blue_map = {0: 255, 1: 127, 2: 134, 3: 212}
    for npy in npys:
        print(npy)
        cube = np.load(npy)
        rows, cols, bands = cube.shape
        # flatten pixels so each row is one spectrum fed to the model
        flat = cube.reshape(rows * cols, bands)
        print('testing data shape:', flat.shape)
        labels = my_model.predict(flat).argmax(axis=1).reshape(rows, cols)
        label_df = pd.DataFrame(labels)
        rgb = np.stack([label_df.replace(red_map).values,
                        label_df.replace(green_map).values,
                        label_df.replace(blue_map).values], axis=2)
        opt = npy.split('/')[-1].split('.npy')[0] + '.prd'
        sm.imsave('%s.%s.png' % (opf, opt), rgb)
    print('Done!')
def Plot(args):
    """
    %prog dir

    Plot the training process (accuracy and loss curves) for every pickled
    history object in dir.
    You can load the dict back using pickle.load(open('*.p', 'rb'))
    """
    p = OptionParser(Plot.__doc__)
    p.add_option(
        "--pattern", default="History_*.p",
        help=
        "specify the pattern of your pickle object file, remember to add quotes [default: %default]"
    )
    p.set_slurm_opts()
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    mydir, = args
    pickles = glob('%s/%s' % (mydir, opts.pattern))
    print('total %s pickle objects.' % len(pickles))
    # hoisted: the prefix/suffix around the '*' are the same for every file
    fs, es = opts.pattern.split('*')
    # loop variable renamed from 'p' — it shadowed the OptionParser above
    for pkl_fn in pickles:
        fn = pkl_fn.split(fs)[-1].split(es)[0]
        # fix: the pickle file handle was opened without ever being closed
        with open(pkl_fn, 'rb') as fh:
            myp = pickle.load(fh)
        mpl.rcParams['figure.figsize'] = [7.5, 3.25]
        fig, axes = plt.subplots(nrows=1, ncols=2)
        # summarize history for accuracy
        ax1 = axes[0]
        ax1.plot(myp['acc'])
        ax1.plot(myp['val_acc'])
        ax1.set_title('model accuracy')
        ax1.set_ylabel('accuracy')
        ax1.set_xlabel('epoch')
        ax1.set_ylim(0, 1.01)
        ax1.legend(['train', 'validation'], loc='lower right')
        # best validation accuracy; goes into the output filename below
        max_acc = max(myp['val_acc'])
        # summarize history for loss
        ax2 = axes[1]
        ax2.plot(myp['loss'])
        ax2.plot(myp['val_loss'])
        ax2.set_title('model loss')
        ax2.set_ylabel('loss')
        ax2.set_xlabel('epoch')
        ax2.legend(['train', 'validation'], loc='upper right')
        plt.tight_layout()
        plt.savefig('%s_%s.png' % (max_acc, fn))
        plt.clf()
def submit(args):
    """
    %prog dir

    Submit part of the jobs in the dir, or all of them.
    """
    p = OptionParser(submit.__doc__)
    p.add_option(
        "--pattern", default="*.slurm",
        help=
        "specify the patter of your slurm job, remember to add quotes [default: %default]"
    )
    p.add_option(
        "--partition", default='jclarke',
        choices=('batch', 'jclarke', 'gpu', 'schnablelab'),
        help=
        "choose which partition you are going to submit [default: %default]")
    p.add_option(
        "--range", default='all',
        help=
        "how many jobs you gonna submit this time. exp: '1-10', '11-20', 'all'. 1-based coordinate"
    )
    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())
    folder, = args
    partition = '-p %s' % opts.partition
    alljobs = ['sbatch %s %s' % (partition, fn)
               for fn in glob(folder, opts.pattern)]
    print("Total %s jobs under '%s'" % (len(alljobs), folder))
    if opts.range == 'all':
        # submit everything
        for job in alljobs:
            print(job)
            call(job, shell=True)
    else:
        # submit a 1-based inclusive slice of the job list
        start, end = (int(x) for x in opts.range.split('-'))
        if end <= len(alljobs):
            batch = alljobs[start - 1:end]
            for job in batch:
                print(job)
                call(job, shell=True)
            print('%s of total %s were submitted. [%s to %s] this time.' \
                %(len(batch), len(alljobs), start, end))
        else:
            print('jobs exceed the limit')
def combineFQ(args):
    """
    %prog combineFQ pattern(with quotation) fn_out
    """
    parser = OptionParser(combineFQ.__doc__)
    opts, args = parser.parse_args(args)
    if len(args) == 0:
        sys.exit(not parser.print_help())
    fq_pattern, fn_out, = args
    # concatenate every matching fastq into one output file via the shell
    matched = glob(fq_pattern)
    cmd = 'cat %s > %s' % (' '.join(matched), fn_out)
    print(cmd)
    run(cmd, shell=True)
def Imgs2ArrsBatch(args):
    """
    %prog HyperDirPattern("CM*")

    Generate one img2arr slurm job per hyperspectral image directory
    matching the pattern.
    """
    p = OptionParser(Imgs2ArrsBatch.__doc__)
    p.set_slurm_opts()
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    pattern, = args
    # keep only directories; glob may also match plain files
    all_dirs = [i for i in glob(pattern) if os.path.isdir(i)]
    for i in all_dirs:
        cmd = 'python -m schnablelab.CNN.Predict Imgs2Arrs %s\n' % i
        jobname = i + '.img2npy'
        header = Slurm_header % (opts.time, opts.memory, jobname, jobname,
                                 jobname)
        header += cmd
        # with-statement replaces the manual open/close: the slurm file is
        # closed even if write() raises
        with open('%s.img2arr.slurm' % i, 'w') as jobfile:
            jobfile.write(header)
        print('slurm job for %s has been generated.' % i)
def PlantHullBatch(args):
    """
    %prog PlantHullBatch Pattern("*.png") job_n

    Generate PlantHull slurm jobs for all matching image files, split into
    job_n groups.
    """
    p = OptionParser(PlantHullBatch.__doc__)
    p.add_option('--mode', default='real', choices=['real', 'simu'],
                 help="real image or simulated image.")
    p.set_slurm_opts()
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    pattern, jobn, = args
    all_cmds = []
    for img in glob(pattern):
        outpre = str(Path(img).stem)  # kept for parity; not used below
        # real images get crop/segmentation flags, simulated ones a fixed
        # border and threshold
        if opts.mode == 'real':
            cmd = 'python -m schnablelab.ImgPros.Preprocess PlantHull %s --crop True --segmentation True --border 80,10,10,10\n' % (img)
        else:
            cmd = 'python -m schnablelab.ImgPros.Preprocess PlantHull %s --border 0,40,10,0 --thresh_cutoff 160\n' % (img)
        print(cmd)
        all_cmds.append(cmd)
    for gn, grp in cutlist(all_cmds, int(jobn)):
        header = Slurm_header % (opts.time, opts.memory, gn, gn, gn)
        header += "ml anaconda\nsource activate MCY\n"
        header += ''.join(grp)
        jobname = '%s.ppnum.slurm' % (gn)
        with open(jobname, 'w') as jobfile:
            jobfile.write(header)
        print('%s job file generated!' % jobname)
from schnablelab.apps.base import glob
from pathlib import Path
import sys
from subprocess import run

# Align each sample's trimmed GBS reads (paired R1/R2 plus the unpaired
# leftovers) against the sorghum v3.0.1 reference.
#
# fix: sort each glob result so the R1/R2/Unpaired triplets stay in
# lockstep. glob() order is filesystem-dependent, and zip() over unsorted
# lists could pair R1 of one sample with R2 of another — the sm1 != sm2
# guard below would then abort the whole run.
pfx_r1 = sorted(glob('*_trim.R1.fastq'))
pfx_r2 = sorted(glob('*_trim.R2.fastq'))
pfx_un = sorted(glob('*_trim.Unpaired.fastq'))
for r1, r2, un in zip(pfx_r1, pfx_r2, pfx_un):
    print(r1, r2)
    # sample name is everything before '_trim'; both mates must agree
    sm1 = r1.split('_trim')[0]
    sm2 = r2.split('_trim')[0]
    if sm1 == sm2:
        cmd = 'python -m schnablelab.SNPcalling.Preprocess align /work/schnablelab/cmiao/TimeSeriesGWAS/Genotype_GBS/Reference_Genome_4th/Sbicolor_454_v3.0.1 %s %s' % (
            r1, r2)
        run(cmd, shell=True)
    else:
        sys.exit('sm1 != sm2')
    # align the unpaired reads of the same sample on their own
    print(un)
    cmd1 = 'python -m schnablelab.SNPcalling.Preprocess align /work/schnablelab/cmiao/TimeSeriesGWAS/Genotype_GBS/Reference_Genome_4th/Sbicolor_454_v3.0.1 %s' % un
    run(cmd1, shell=True)