class Preprocess(Task):
    """Unpack the gzipped FASTQ downloads and filter their sequences.

    Only the sequence information of reads made up exclusively of
    A, C, G and T is kept.
    """

    mask = ['family', 'motif']

    # The raw download lives at <download_dir>/<accession>.fastq.gz.
    inputs = {
        'fastq': File(
            'original',
            path=lambda row: "{}/{}.fastq.gz".format(download_dir,
                                                     row['accession']),
            mask=mask,
        ),
    }

    # Both outputs are written below store_dir.
    targets = {
        'seq': File('NoN.fastq', root=store_dir, mask=mask),
        'cnt': File('NoN.fastq.cnt', root=store_dir, mask=mask),
    }

    actions = [
        (unzip_seq_filter_N, ['#barcode', '$fastq', '$seq', '$cnt']),
    ]
class GetCounts(Task):
    """Count lines, words and characters in file.

    Runs ``wc`` on the input, prepends a ``line word char file`` header and
    uses ``sed`` to strip leading spaces and turn runs of spaces into commas,
    writing the result to ``counts.csv``.
    """

    inputs = {'inp': File('text', path=['a.txt', 'b.txt'])}
    targets = {'out': File('counts.csv')}
    actions = [
        (
            # Raw string: the sed expressions contain "\+", which is not a
            # valid escape sequence in an ordinary Python string literal.
            # The command text handed to the shell is unchanged.
            r"(echo line word char file; wc {}) | sed 's/^ \+//;s/ \+/,/g' > {}",
            ["$inp", "$out"],
        ),
    ]
class CombineCoverage(Task):
    """Merge the coverage tables and plot the coverage density.

    Two actions run in order:

    1. ``combine_csvs`` is called with the ``cov`` input and the ``csv``
       target.
    2. An R one-liner reads the combined CSV and writes a ggplot2 density
       plot of column ``val``, coloured by ``factor(sample)``, to the PDF
       target.
    """

    mask = ['group', 'sample']
    inputs = {'cov': GetCoverage.targets['cov']}
    targets = {
        'csv': File('combined.csv', mask=mask),
        'pdf': File('pltcov.pdf', mask=mask, root='.'),
    }
    actions = [
        (combine_csvs, ['#cov', '#csv']),
        (
            # The R statements are separated by ';' so the script is valid
            # even though it reaches R as a single line; pdf(...) directly
            # followed by ggplot(...) without a separator is a parse error.
            "echo \"library(ggplot2); pdf('{}'); "
            "ggplot(read.csv('{}'), aes(x = val)) + "
            "geom_density(aes(color = factor(sample)))\" | R --vanilla",
            ['$pdf', '$csv'],
        ),
    ]
class Partition(Task):
    """Split the aptamer sequences into two pools.

    Sequences are partitioned into motif-containing (fg) and motif-free (bg)
    sets based on their distance from MOTIF.
    """

    mask = ['family']

    inputs = {
        'seq': Preprocess.targets['seq'],
        # Disabled input, kept for reference:
        # 'nbr': File('nbr.txt', root=store_dir)
    }

    targets = {
        'fg': File('fg.txt', mask=mask, root=store_dir),
        'bg': File('bg.txt', mask=mask, root=store_dir),
    }

    actions = [
        (
            partition_aptamers,
            # NOTE(review): 'hahah' is handed over as a literal positional
            # argument; what partition_aptamers does with it is not visible
            # here -- confirm before changing.
            [fg_type, '$seq', '#motif', 'hahah', '$fg', '$bg', '#barcode'],
        ),
    ]
class MergeAdaptivePrecisionRecall(Task):
    """Merge the files behind ``jf_roc_csv`` into one TSV.

    ``combine_csvs`` writes ``sim_precision_recall_multi_reg.tsv`` under
    ``final_results_dir``; a tab is passed as its third argument
    (presumably the output separator -- confirm against combine_csvs).
    """

    inputs = {'inp': jf_roc_csv}
    targets = {
        'out': File(
            'sim_precision_recall_multi_reg.tsv',
            root=final_results_dir,
        ),
    }
    actions = [
        (combine_csvs, ['#inp', '#out', '\t']),
    ]
class MergeConstrainedPrecisionRecall(Task):
    """Merge the files behind ``jf_const_roc_csv`` into one TSV.

    ``combine_csvs`` writes ``sim_precision_recall_single_reg.tsv`` under
    ``final_results_dir``; a tab is passed as its third argument
    (presumably the output separator -- confirm against combine_csvs).
    """

    inputs = {'inp': jf_const_roc_csv}
    targets = {
        'out': File(
            'sim_precision_recall_single_reg.tsv',
            root=final_results_dir,
        ),
    }
    actions = [
        (combine_csvs, ['#inp', '#out', '\t']),
    ]
class GetBgSeqmers(Task):
    """Extract the seqmers found in the motif-free (background) pool."""

    mask = ['family']

    inputs = {
        'bg': Partition.targets['bg'],
        'seq': Preprocess.targets['seq'],
        'cnt': Preprocess.targets['cnt'],
    }

    targets = {
        'out': File('seqmer.txt', mask=mask, root=store_dir),
    }

    actions = [
        (
            gen_bg_seqmers,
            ['$seq', '$cnt', '$bg', '$out', seqmer_len, '#barcode'],
        ),
    ]
# Parameters of the simulation database.
pdb_sim.add_param(sim_models, 'model')
pdb_sim.add_param(thetas, 'theta')
pdb_sim.add_param(sigmasqs, 'sigmasq')
pdb_sim.add_param(gammatildes, 'gammatilde')
pdb_sim.add_param(tree_sizes, 'tree')
pdb_sim.add_param(restrict_alphas)
# reg_string is parsed as CSV; lines starting with '#' are ignored.
pdb_sim.add_param(pd.read_csv(StringIO(reg_string), comment='#'))
pdb_sim.add_param(restrict_folds)

# Separate database holding only the 'regime' parameter.
pdb_regime = ParamDb("regime")
pdb_regime.add_param(sim_regs, 'regime')

# Next, define the JUDI files associated with the task: simulate
jf_reg = File(
    'regime',
    param=pdb_regime,
    root=metadata_dir,
    path=lambda x: 'regime_{}.csv'.format(x['regime']),
)

# Same file with its 'regime' parameter renamed to 'modreg'.
jf_modreg = jf_reg.copy().rename({'regime': 'modreg'})

# The 'big' tree uses the 48-species newick file, any other value the
# 9-species one.
jf_tree = File(
    'newick',
    param=pdb_tree,
    root=metadata_dir,
    path=lambda x: 'drosophila{}.newick'.format(
        48 if x['tree'] == 'big' else 9
    ),
)

# ATTN.2: We don't need to worry about the actual paths of the files.
# JUDI will create them automatically in a hierarchy determined
# by the sorted order of parameters, e.g.,
# sim_data_dir/sim_data_tsv/alpha~1/fold~1/gammatilde~0.25/model~OU1/modreg~global/sigmasq~1/theta~1000/tree~small/sim_data.tsv
class AlignFastq(Task):
    """Run ``bwa aln`` on the reads against REF and save the .sai output."""

    inputs = {
        'reads': File('orig_fastq', path=path_gen),
    }
    targets = {
        'sai': File('aln.sai'),
    }
    actions = [
        ('bwa aln {} {} > {}', [REF, '$reads', '$sai']),
    ]
class GetCoverage(Task):
    """Write a one-column coverage table from the BAM file.

    The pipeline runs ``samtools rmdup``, pipes the result through
    ``samtools mpileup``, keeps field 4 of each line and writes it to the
    target under a ``val`` header line.
    """

    mask = ['group']
    inputs = {
        'bam': CreateBam.targets['bam'],
    }
    targets = {
        'cov': File('cov.csv', mask=mask),
    }
    actions = [
        (
            '(echo val; samtools rmdup {} - | samtools mpileup - | cut -f4) > {}',
            ['$bam', '$cov'],
        ),
    ]
class CreateBam(Task):
    """Build a sorted BAM file from the .sai alignments and the reads.

    ``bwa sampe`` output is converted to BAM with ``samtools view`` and
    then sorted with ``samtools sort``.
    """

    mask = ['group']
    inputs = {
        'reads': AlignFastq.inputs['reads'],
        'sai': AlignFastq.targets['sai'],
    }
    targets = {
        'bam': File('aln.bam', mask=mask),
    }
    actions = [
        (
            'bwa sampe {} {} {} | samtools view -Sbh - | samtools sort - > {}',
            [REF, '$sai', '$reads', '$bam'],
        ),
    ]
class CountBgSeqmers(Task):
    """Sum the counts of each seqmer in the motif-free pool.

    The input is sorted so equal keys are adjacent; awk then accumulates
    column 2 per distinct value of column 1 and prints tab-separated
    ``key, sum`` lines.
    """

    mask = ['family']
    inputs = {
        'inp': GetBgSeqmers.targets['out'],
    }
    targets = {
        'out': File('seqmer.txt.cnt', mask=mask, root=store_dir),
    }
    actions = [
        (
            # Adjacent literals are concatenated into a single command.
            # Doubled braces are kept exactly as in the command template.
            'cat {} | sort | awk \''
            'BEGIN {{OFS="\t"}} '
            '($1 == last || last == "") {{sum += $2}} '
            '($1 != last && last != "") {{print last, sum; sum = $2}} '
            '{{last = $1}} '
            'END {{print last, sum}}'
            '\' > {}',
            ['$inp', '$out'],
        ),
    ]
class CombineCounts(Task):
    """Combine the count tables from GetCounts into ``result.csv``."""

    mask = ['n']
    inputs = {
        'inp': GetCounts.targets['out'],
    }
    targets = {
        'out': File('result.csv', mask=mask, root='.'),
    }
    actions = [
        (combine_csvs, ["#inp", "#out"]),
    ]
from judi import ParamDb, File

# Parameter database for the per-run files: one entry per (racer, game).
racer = ParamDb('racer')
racer.add_param(['tortoise', 'hare'], 'racer')
racer.add_param([1, 2], 'game')

# Parameter database for the combined files: one entry per game.
game = ParamDb('game')
game.add_param([1, 2], 'game')

jf_racer = File('timing.csv', param=racer)
jf_game = File('combined_timing.csv', param=game)


class simulate(Task):
    """Produce the ``timing.csv`` file for each (racer, game) setting.

    The action copies a ``<racer>_<game>.csv`` file to the target.
    """

    param = racer
    targets = {'out': jf_racer}
    actions = [('cp {}_{}.csv {}', ['#racer', '#game', '$out'])]


class combine(Task):
    """Combine the per-racer timing files into one file per game.

    Reads the ``timing.csv`` files produced by ``simulate`` and writes
    ``combined_timing.csv``. Inputs and targets were previously swapped,
    which made this task consume a file nothing produces and write to the
    same target as ``simulate``.
    """

    param = game
    inputs = {'inp': jf_racer}
    targets = {'out': jf_game}
    actions = [(combine_csvs, ['#inp', '#out'])]