def runTomTom(infile, outfile):
    '''compare ab-initio motifs against tomtom.

    Runs ``tomtom`` on the motifs in *infile* against the databases
    listed in the ``tomtom_databases`` parameter. The complete tomtom
    output directory is moved to ``<exportdir>/tomtom/<outfile>`` and
    its ``tomtom.txt`` is copied to *outfile*.

    If *infile* is empty, no computation is performed and an empty
    *outfile* is created instead.
    '''
    # Check for empty input first so that no temporary directory is
    # created (and left behind) when there is nothing to do.
    if iotools.is_empty(infile):
        E.warn("input is empty - no computation performed")
        iotools.touch_file(outfile)
        return

    # NOTE: tmpdir, databases, infile and outfile are referenced by name
    # through the %(...)s placeholders in the statement below
    # (presumably filled in by P.run) - do not rename them.
    tmpdir = P.get_temp_dir(".")
    databases = " ".join(P.as_list(P.get_params()["tomtom_databases"]))
    target_path = os.path.join(
        os.path.abspath(P.get_params()["exportdir"]), "tomtom", outfile)

    statement = ''' tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s %(databases)s > %(outfile)s.log '''
    P.run(statement)

    # copy over results; an existing parent directory is fine, any
    # other failure to create it is reported.
    os.makedirs(os.path.dirname(target_path), exist_ok=True)

    # replace results of a previous run
    if os.path.exists(target_path):
        shutil.rmtree(target_path)

    shutil.move(tmpdir, target_path)
    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)
def create_files(outfile):
    '''write normally distributed random numbers to *outfile*.

    ``num_samples`` values are drawn with ``numpy.random.normal`` using
    the parameters ``mu`` and ``sigma`` and written one value per line.
    '''
    E.debug("creating output file {}".format(outfile))
    with open(outfile, "w") as outf:
        params = P.get_params()
        samples = numpy.random.normal(
            params["mu"], params["sigma"], params["num_samples"])
        body = "\n".join(str(value) for value in samples)
        outf.write(body + "\n")
def runGLAM2(infile, outfile, dbhandle):
    '''run glam2 on all intervals and motifs.

    Sequences for the track derived from *infile* (its name without
    the ``.fasta`` suffix) are exported with
    ``motifs.writeSequencesForIntervals`` using the ``meme_halfwidth``,
    ``meme_max_size`` and ``meme_proportion`` parameters, i.e. the same
    sequence selection settings that are configured for MEME.

    glam2 is run in a temporary directory with ``-z`` set to a tenth of
    the number of exported sequences. The complete output directory is
    moved to ``<exportdir>/glam2/<outfile>`` and its ``glam2.txt`` is
    copied to *outfile*.
    '''
    # NOTE(review): not referenced below; presumably picked up by P.run
    # from the local scope - kept for that reason.
    to_cluster = True

    target_path = os.path.join(
        os.path.abspath(P.get_params()["exportdir"]), "glam2", outfile)
    track = infile[:-len(".fasta")]

    # NOTE: tmpdir, tmpfasta, min_sequences and outfile are referenced
    # by name through the placeholders in the statement below.
    tmpdir = tempfile.mkdtemp()
    tmpfasta = os.path.join(tmpdir, "in.fa")

    nseq = motifs.writeSequencesForIntervals(
        track,
        tmpfasta,
        dbhandle,
        full=False,
        halfwidth=int(P.get_params()["meme_halfwidth"]),
        maxsize=int(P.get_params()["meme_max_size"]),
        proportion=P.get_params()["meme_proportion"])

    min_sequences = int(nseq / 10.0)

    statement = ''' %(execglam2)s -2 -O %(tmpdir)s %(glam2_options)s -z %(min_sequences)i n %(tmpfasta)s > %(outfile)s.log '''
    P.run(statement)

    # copy over results; only an already existing directory is
    # tolerated, other errors (e.g. permissions) are no longer hidden.
    os.makedirs(os.path.dirname(target_path), exist_ok=True)

    # replace results of a previous run
    if os.path.exists(target_path):
        shutil.rmtree(target_path)

    shutil.move(tmpdir, target_path)
    shutil.copyfile(os.path.join(target_path, "glam2.txt"), outfile)
def reconcileReads(infile, outfile):
    '''reconcile the two files of a paired-end read set.

    Nothing is done unless the ``reconcile`` parameter equals 1. In
    that case ``cgat fastqs2fastqs --method=reconcile`` is run on
    *infile* and its mate (``.fastq.1.gz`` replaced by ``.fastq.2.gz``).
    '''
    if P.get_params()["reconcile"] != 1:
        return

    # NOTE: in1, in2 and outfile are referenced by name through the
    # placeholders in the statement below.
    in1 = infile
    in2 = infile.replace(".fastq.1.gz", ".fastq.2.gz")
    # the suffix is re-added through the output filename pattern
    outfile = outfile.replace(".fastq.1.gz", "")

    statement = """cgat fastqs2fastqs --method=reconcile --output-filename-pattern=%(outfile)s.fastq.%%s.gz %(in1)s %(in2)s"""

    P.run(statement,
          job_threads=P.get_params()["threads"],
          job_memory="8G")
def main(argv=None):
    '''entry point of the testing pipeline.

    Stores the options to be handed to the tested workflows in the
    ``workflow_options`` parameter (``--local`` if it was given on the
    command line, plus ``-p <cluster.num_jobs>``), sets
    ``test_scriptsdir`` and hands over to ``P.main``.

    *argv* is the list of command line arguments. If it is None,
    ``sys.argv`` is inspected for ``--local``; *argv* itself is passed
    on to ``P.main`` unchanged.
    '''
    import sys

    # ``"--local" in None`` raises a TypeError, so fall back to the
    # process arguments for the check when no argv was supplied.
    args = sys.argv if argv is None else argv

    workflow_options = []
    if "--local" in args:
        workflow_options.append("--local")
    workflow_options.append("-p {}".format(
        P.get_params()["cluster"]["num_jobs"]))

    # Assign (the original compared with ``==`` and discarded the
    # result) and separate the options by a blank so that they remain
    # distinct command line arguments.
    P.get_params()["workflow_options"] = " ".join(workflow_options)

    # manually set location of test scripts - this needs to be better organized
    # 1. make scripts live alongside pipeline_testing.py
    # 2. make scripts available via cgatflow CLI
    # 3. include scripts in pipeline_testing
    P.get_params()["test_scriptsdir"] = os.path.join(
        os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
        "scripts")

    P.main(argv)
def connect():
    '''open a connection to the sqlite database of the pipeline.

    The database is taken from the ``database`` parameter. Returns the
    ``sqlite3`` connection object.
    '''
    database = P.get_params()['database']
    return sqlite3.connect(database)
def runBioProspector(infiles, outfile, dbhandle):
    '''run bioprospector for motif discovery.

    The track name is derived from *outfile* (its name without the
    ``.bioprospector`` suffix). Sequences of the top
    ``bioprospector_proportion`` intervals of the track are exported
    (dust-masked) into a temporary fasta file and BioProspector is run
    on them. If no sequences are exported, an empty *outfile* is
    created instead.

    *infiles* is not used. The temporary fasta file is removed even if
    sequence export or BioProspector fail.
    '''
    # bioprospector currently not working on the nodes
    # NOTE(review): not referenced below; presumably picked up by P.run
    # from the local scope - kept for that reason.
    to_cluster = False

    # only use new nodes, as /bin/csh is not installed
    # on the old ones.
    # job_options = "-l mem_free=8000M"

    # NOTE: tmpfasta and outfile are referenced by name through the
    # placeholders in the statement below.
    tmpfasta = P.get_temp_filename(".")
    track = outfile[:-len(".bioprospector")]

    try:
        nseq = writeSequencesForIntervals(
            track,
            tmpfasta,
            dbhandle,
            full=True,
            masker="dust",
            proportion=P.get_params()["bioprospector_proportion"])

        if nseq == 0:
            E.warn("%s: no sequences - bioprospector skipped" % track)
            iotools.touch_file(outfile)
        else:
            statement = ''' BioProspector -i %(tmpfasta)s %(bioprospector_options)s -o %(outfile)s > %(outfile)s.log '''
            P.run(statement)
    finally:
        # do not leave the temporary file behind on failure
        if os.path.exists(tmpfasta):
            os.unlink(tmpfasta)
def loadBioProspector(infile, outfile):
    '''load results from bioprospector.

    Parses the BioProspector output in *infile*, builds a sequence logo
    per motif in ``<exportdir>/bioprospector`` and loads one row per
    match (id, motif number, start, end, strand, arrangement) via
    ``P.load``.

    The arrangement is ``ER`` for strands ``+-``/``-+``, ``DR`` for
    ``++``/``--`` and ``SM`` otherwise, followed by the distance
    between the two blocks (0 for ``SM``).
    '''
    target_path = os.path.join(
        os.path.abspath(P.get_params()["exportdir"]), "bioprospector")
    # an existing directory is fine, other errors are reported
    os.makedirs(target_path, exist_ok=True)

    track = infile[:-len(".bioprospector")]

    tmpfile = P.get_temp_file()
    try:
        tmpfile.write("id\tmotif\tstart\tend\tstrand\tarrangement\n")

        # Keep the input open while iterating over the results, as it
        # is not visible here whether the parser reads lazily.
        with iotools.open_file(infile, "r") as inf:
            results = Bioprospector.parse(inf)

            for x, motif in enumerate(results):
                outname = os.path.join(
                    target_path, "%s_%02i.png" % (track, x))
                Bioprospector.build_logo(
                    [y.sequence for y in motif.matches], outname)

                for match in motif.matches:
                    distance = abs(match.start + match.width1 -
                                   (match.end - match.width2))

                    if match.strand in ("+-", "-+"):
                        arrangement = "ER"
                    elif match.strand in ("++", "--"):
                        arrangement = "DR"
                    else:
                        arrangement = "SM"
                        distance = 0

                    arrangement += "%i" % distance
                    strand = match.strand[0]

                    # keep only the part after the last underscore
                    match_id = re.sub(".*_", "", match.id)
                    tmpfile.write("%s\t%i\t%i\t%i\t%s\t%s\n" %
                                  (match_id, x, match.start, match.end,
                                   strand, arrangement))

        tmpfile.close()

        P.load(tmpfile.name,
               outfile,
               options="--add-index=id "
               "--add-index=motif "
               "--add-index=id,motif "
               "--allow-empty-file "
               "--map=base_qualities:text")
    finally:
        # closing twice is harmless; make sure the temporary file is
        # removed even if parsing or loading failed.
        tmpfile.close()
        os.unlink(tmpfile.name)
def runFastQC(infiles, outfile):
    '''run FastQC on each input file.

    A ``mapping.FastQC`` runner is configured from the parameters
    ``readqc_no_group`` and ``qual_format``, writing into the directory
    of *outfile*. The contaminants file (``contaminants_path``) is only
    passed on if ``use_custom_contaminants`` is set.

    If the ``reconcile`` parameter equals 1, the reconciled instead of
    the processed reads are used as input.
    '''
    params = P.get_params()

    # only pass the contaminants file list if requested by user
    use_custom = params['use_custom_contaminants']

    fastqc_kwargs = {
        "nogroup": params["readqc_no_group"],
        "outdir": os.path.dirname(outfile),
    }
    if use_custom:
        fastqc_kwargs["contaminants"] = params['contaminants_path']
    fastqc_kwargs["qual_format"] = params['qual_format']

    m = mapping.FastQC(**fastqc_kwargs)

    if params["reconcile"] == 1:
        infiles = infiles.replace("processed.dir/trimmed",
                                  "reconciled.dir/trimmed")

    statement = m.build((infiles,), outfile)
    P.run(statement)
def makeAdaptorFasta(infile, outfile):
    '''build a fasta file of contaminant adaptor sequences for a sample.

    Delegates to ``preprocess.makeAdaptorFasta``. The track is the
    first group of ``REGEX_TRACK`` matched against *infile*; the
    contaminants file is taken from the ``contaminants_path``
    parameter.
    '''
    track_match = re.match(REGEX_TRACK, infile)
    track = track_match.groups()[0]
    dbh = connect()
    contaminants_file = P.get_params()['contaminants_path']

    preprocess.makeAdaptorFasta(infile=infile,
                                outfile=outfile,
                                track=track,
                                dbh=dbh,
                                contaminants_file=contaminants_file)
def runMEME(track, outfile, dbhandle):
    '''run MEME to find motifs.

    Sequences for *track* are exported with
    ``writeSequencesForIntervals`` into a temporary directory using the
    parameters ``motifs_masker``, ``meme_halfwidth``,
    ``meme_max_size``, ``meme_proportion`` and ``meme_min_sequences``,
    i.e. only a proportion of the intervals and only a window around
    each peak are used in order to increase the signal/noise ratio.

    If no sequences are exported, MEME is skipped, the temporary
    directory is removed and an empty *outfile* is created. Otherwise
    results are collected with ``collectMEMEResults``.

    This method is deprecated - use runMEMEOnSequences instead.
    '''
    import shutil

    # job_options = "-l mem_free=8000M"

    target_path = os.path.join(
        os.path.abspath(P.get_params()["exportdir"]), "meme", outfile)

    # NOTE: tmpdir, tmpfasta and outfile are referenced by name through
    # the placeholders in the statement below.
    tmpdir = P.get_temp_dir(".")
    tmpfasta = os.path.join(tmpdir, "in.fa")

    nseq = writeSequencesForIntervals(
        track,
        tmpfasta,
        dbhandle,
        full=False,
        masker=P.as_list(P.get_params()['motifs_masker']),
        halfwidth=int(P.get_params()["meme_halfwidth"]),
        maxsize=int(P.get_params()["meme_max_size"]),
        proportion=P.get_params()["meme_proportion"],
        min_sequences=P.get_params()["meme_min_sequences"])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        # nothing will be collected from the temporary directory, so
        # remove it instead of leaving it behind
        shutil.rmtree(tmpdir)
        iotools.touch_file(outfile)
    else:
        statement = ''' meme %(tmpfasta)s -dna -revcomp -mod %(meme_model)s -nmotifs %(meme_nmotifs)s -oc %(tmpdir)s -maxsize %(meme_max_size)s %(meme_options)s > %(outfile)s.log '''
        P.run(statement)

        collectMEMEResults(tmpdir, target_path, outfile)
def runFastqScreen(infiles, outfile):
    '''run FastqScreen on input files.

    The number of threads is parsed from the ``--threads N`` argument
    in the ``fastq_screen_options`` parameter, which has to be present
    exactly once (ValueError otherwise).

    A configuration file listing one ``DATABASE`` entry per
    ``fastq_screen_database_<name>`` parameter is written into a
    temporary directory, which is removed again after the run -
    whether it succeeded or not. *outfile* is touched on success.
    '''
    # configure job_threads with fastq_screen_options from P.get_params()
    thread_args = re.findall(r'--threads (\d+)',
                             P.get_params()['fastq_screen_options'])
    if len(thread_args) != 1:
        raise ValueError("Wrong number of threads for fastq_screen")

    # NOTE(review): not passed on explicitly; presumably picked up by
    # P.run from the local scope - keep the name.
    job_threads = int(thread_args[0])

    # parameters named <prefix>_<name> define the databases to screen
    database_prefix = "fastq_screen_database"

    tempdir = P.get_temp_dir(".")
    try:
        conf_fn = os.path.join(tempdir, "fastq_screen.conf")
        with iotools.open_file(conf_fn, "w") as f:
            for key, value in P.get_params().items():
                if key.startswith(database_prefix):
                    # strip the prefix and the separator following it
                    name = key[len(database_prefix) + 1:]
                    f.write("DATABASE\t%s\t%s\n" % (name, value))

        m = mapping.FastqScreen(config_filename=conf_fn)
        statement = m.build((infiles,), outfile)
        P.run(statement, job_memory="8G")
    finally:
        # do not leave the temporary directory behind on failure
        shutil.rmtree(tempdir)

    iotools.touch_file(outfile)
def exportSequencesFromBedFile(infile, outfile, masker=None, mode="intervals"):
    '''export sequences for intervals in :term:`bed`-formatted *infile*
    to :term:`fasta` formatted *outfile*

    Sequences are read from the genome given by the parameters
    ``genome_dir`` and ``genome`` and passed through ``maskSequences``
    with *masker* before being written.

    *mode* selects the regions that are exported:

    * ``intervals``: the intervals themselves
    * ``leftright``: for each interval, the two flanking regions of the
      same size to the left (suffix ``_l``) and to the right (suffix
      ``_r``), truncated at the contig boundaries.

    For any other *mode*, no sequences are exported.
    '''
    track = P.snip(infile, ".bed.gz")

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(P.get_params()["genome_dir"],
                     P.get_params()["genome"]))

    ids, seqs = [], []
    with iotools.open_file(infile) as inf:
        for bed in Bed.setName(Bed.iterator(inf)):
            lcontig = fasta.getLength(bed.contig)

            if mode == "intervals":
                seqs.append(
                    fasta.getSequence(bed.contig, "+", bed.start, bed.end))
                ids.append("%s_%s %s:%i..%i" %
                           (track, bed.name, bed.contig, bed.start, bed.end))

            elif mode == "leftright":
                width = bed.end - bed.start

                # left flank: ends where the interval starts
                start, end = max(0, bed.start - width), bed.end - width
                ids.append("%s_%s_l %s:%i..%i" %
                           (track, bed.name, bed.contig, start, end))
                seqs.append(fasta.getSequence(bed.contig, "+", start, end))

                # right flank: starts where the interval ends
                start, end = bed.start + width, min(lcontig, bed.end + width)
                ids.append("%s_%s_r %s:%i..%i" %
                           (track, bed.name, bed.contig, start, end))
                seqs.append(fasta.getSequence(bed.contig, "+", start, end))

    masked = maskSequences(seqs, masker)

    # Open the output only once everything has been collected, so that
    # a failure above does not leave an empty output file behind.
    # As before, the last record is not terminated by a newline.
    with iotools.open_file(outfile, "w") as outs:
        outs.write("\n".join(
            ">%s\n%s" % (x, y) for x, y in zip(ids, masked)))
def loadFastQC(infile, outfile):
    '''load FASTQC stats into database.

    If *infile* contains more than a single line, it is loaded with
    ``P.load``. Otherwise a placeholder table containing a single
    ``NA`` row is created directly with the ``sqlite3`` command line
    client. The table name is *infile* without its ``.tsv.gz`` suffix
    and the database file is the basename of the ``database.url``
    parameter.
    '''
    # A check to make sure the file isn't empty: it needs more than
    # one line. Reading stops as soon as a second line has been seen.
    # (The original ``n =+ i`` assigned the last line index, which
    # yields the same decision but reads like a mistyped ``+=``.)
    n_lines = 0
    with iotools.open_file(infile) as f:
        for n_lines, _ in enumerate(f, start=1):
            if n_lines > 1:
                break

    if n_lines > 1:
        P.load(infile, outfile, options="--add-index=track")
    else:
        # NOTE: table_name and database_name are referenced by name
        # through the placeholders in the statement below.
        # NOTE(review): outfile is not created in this branch - confirm
        # that this is intended.
        table_name = infile.replace(".tsv.gz", "")
        database_sql = P.get_params()["database"]["url"]
        database_name = os.path.basename(database_sql)

        statement = """sqlite3 %(database_name)s 'DROP TABLE IF EXISTS %(table_name)s; CREATE TABLE %(table_name)s ("track" text PRIMARY KEY, "Sequence" text, "Count" integer, "Percentage" integer, "Possible Source" text);' 'INSERT INTO %(table_name)s VALUES ("NA", "NA", 0, 0, "NA");'"""
        P.run(statement)
def runMEMEOnSequences(infile, outfile):
    '''run MEME on the sequences in *infile* to find motifs.

    If *infile* contains no sequences, MEME is skipped and an empty
    *outfile* is created. Otherwise MEME is run in a temporary
    directory and the results are collected with
    ``collectMEMEResults`` into ``<exportdir>/meme/<outfile>``.
    '''
    # job_options = "-l mem_free=8000M"

    nseqs = int(FastaIterator.count(infile))
    if not nseqs:
        E.warn("%s: no sequences - meme skipped" % outfile)
        iotools.touch_file(outfile)
        return

    export_root = os.path.abspath(P.get_params()["exportdir"])
    target_path = os.path.join(export_root, "meme", outfile)

    # NOTE: infile, tmpdir and outfile are referenced by name through
    # the placeholders in the statement below.
    tmpdir = P.get_temp_dir(".")

    statement = ''' meme %(infile)s -dna -revcomp -mod %(meme_model)s -nmotifs %(meme_nmotifs)s -oc %(tmpdir)s -maxsize %(motifs_max_size)s %(meme_options)s > %(outfile)s.log '''
    P.run(statement)

    collectMEMEResults(tmpdir, target_path, outfile)
def main(argv):
    '''build the benchmark workflow and run it.

    The workflow is assembled from the ``setup`` section of the
    configuration (``benchmark.yml``) in the following order:

    1. tools (and, optionally, externally computed data),
    2. optional collation of tool outputs (``collate``),
    3. optional splitting of outputs (``split``),
    4. metrics for the tool, collate and split tasks,
    5. optional aggregation of metrics (``aggregate``),
    6. upload, export and the final ``all`` task,
    7. optional collation given in a top-level ``collation`` section.

    Raises a KeyError if metrics are required but no suitable metrics
    section is configured or if an unknown export section is given,
    and a ValueError if no metric tasks result for a set of runners.
    '''
    options = P.initialize(argv, config_file="benchmark.yml")
    # compatibility with cgatcore < 0.6.3
    if isinstance(options, tuple):
        options = options[0]

    # not sure what this does
    # if not options.config_file:
    #     P.get_parameters(options.config_file)
    # else:
    #     sys.exit(P.main(options, args))

    params = P.get_params()

    with arvados_enabled(always_mount=options.always_mount):
        mountpoint = params.get("mount_point", None)
        if mountpoint:
            redirect_defaults2mountpoint(mountpoint)

        # A selection of command line arguments are added to PARAMS
        # as 'extras' not implemented in ruffus 2.6.3
        kwargs = collections.defaultdict(dict)
        if options.only_info:
            kwargs["extras"].update({'only_info': True})
            P.PARAMS["only_info"] = True
        if options.is_test:
            kwargs["extras"].update({'is_test': True})
            P.PARAMS["is_test"] = True

        E.debug("construction of workflow started")
        pipeline = ruffus.Pipeline('benchmark')

        # Tool execution
        suffix, tool_runners = add_tools_to_pipeline(pipeline,
                                                     map_tool_to_runner,
                                                     config=P.PARAMS,
                                                     **kwargs)

        E.debug("added {} tools to workflow".format(len(tool_runners)))

        # Optionally, add externally computed files as
        # pseudo-tools:
        if "external" in P.PARAMS["setup"]:
            external_runners = add_external_data_to_pipeline(
                pipeline,
                config=P.PARAMS,
                **kwargs)
            tool_runners.extend(external_runners)

        # Optionally, combine tool runs into aggregate
        # outputs. The type of the output is preserved
        # (VCF -> VCF, etc.)
        # For example, call individual members in a trio
        # and then build a combined VCF to analyse mendelian
        # inconsistencies.
        if "collate" in P.PARAMS["setup"]:
            collate_runners = add_collations_to_pipeline(
                pipeline,
                map_collate_to_runner,
                P.PARAMS["setup"]["collate"],
                tasks=tool_runners,
                config=P.PARAMS)
            if P.PARAMS["setup"].get("only_collate", False):
                tool_runners = []
            if P.PARAMS["setup"].get("no_collate_metrics", False):
                collate_runners = []
            E.debug("added {} collators to workflow".format(
                len(collate_runners)))
        else:
            collate_runners = []

        # Optionally, split up the output before applying
        # additional analyses. The type of the output is preserved
        # (VCF -> VCF, etc).
        # For example, identify false positives, false negatives
        # and true positives and collect metrics individually.
        if "split" in P.PARAMS["setup"]:
            split_runners = add_splits_to_pipeline(
                pipeline,
                map_split_to_runner,
                tool_runners,
                P.PARAMS["setup"]["split"],
                tasks=tool_runners,
                config=P.PARAMS)
            if P.PARAMS["setup"].get("only_split", False):
                tool_runners = []
            E.debug("added {} splitters to workflow".format(
                len(split_runners)))
        else:
            split_runners = []

        metric_runners = []
        for prefix, r in zip(
                ["tool", "collate", "split"],
                [tool_runners, collate_runners, split_runners]):
            if not r:
                continue

            # collate and split tasks may define their own metrics,
            # otherwise the general metrics section applies.
            metrics = None
            if prefix == "collate" and \
               "collate_metrics" in P.PARAMS["setup"]:
                metrics = P.PARAMS["setup"]["collate_metrics"]
            elif prefix == "split" and "split_metrics" in P.PARAMS["setup"]:
                metrics = P.PARAMS["setup"]["split_metrics"]
            elif "metrics" in P.PARAMS["setup"]:
                metrics = P.PARAMS["setup"]["metrics"]
            else:
                raise KeyError(
                    "configuration file requires a 'setup:metrics' section")

            # Metric execution
            mm = add_metrics_to_pipeline(pipeline,
                                         metrics,
                                         map_metric_to_runner,
                                         r,
                                         suffix=suffix,
                                         prefix=prefix + "_",
                                         config=P.PARAMS,
                                         **kwargs)

            if len(mm) == 0:
                raise ValueError(
                    "workflow construction error: "
                    "no metric tasks result for metrics {}".format(metrics))

            metric_runners.extend(mm)
            E.debug("added {} {}_metrics to workflow".format(
                len(mm), prefix))

        # add plot task
        if "aggregate" in P.PARAMS["setup"]:
            aggregate_metrics = add_collations_to_pipeline(
                pipeline,
                map_collate_to_runner,
                P.PARAMS["setup"]["aggregate"],
                metric_runners,
                config=P.PARAMS)

            E.debug("added metric aggregation to workflow")
        else:
            aggregate_metrics = []

        add_upload_to_pipeline(pipeline,
                               metric_runners + aggregate_metrics,
                               P.PARAMS)
        # the message has no placeholder, so no formatting is applied
        E.debug("added upload to workflow")

        # add export task
        export = P.PARAMS["setup"].get("export",
                                       ["tools", "collate", "split"])
        map_export2runner = {
            "collate": collate_runners,
            "tools": tool_runners,
            "split": split_runners
        }

        export_runners = []
        for e in export:
            try:
                export_runners.extend(map_export2runner[e])
            except KeyError:
                raise KeyError("unknown export section: {}".format(e))

        add_export_to_pipeline(pipeline,
                               export_runners,
                               suffix=suffix,
                               config=P.PARAMS)

        E.debug("added export to workflow")

        add_all_task_to_pipeline(pipeline,
                                 metric_runners + aggregate_metrics)

        # Collate output files to facilitate analysis
        if "collation" in P.PARAMS:
            # the returned runners are not needed any further
            add_collations_to_pipeline(pipeline,
                                       map_collate_to_runner,
                                       P.PARAMS["collation"],
                                       config=P.PARAMS)

        E.debug("construction of workflow completed")

        E.debug("starting workflow")
        P.run_workflow(options, pipeline=pipeline)
def writeSequencesForIntervals(track,
                               filename,
                               dbhandle,
                               full=False,
                               halfwidth=None,
                               maxsize=None,
                               proportion=None,
                               masker=None,
                               offset=0,
                               shuffled=False,
                               num_sequences=None,
                               min_sequences=None,
                               order="peakval",
                               shift=None):
    '''build a sequence set for motif discovery.

    Intervals are taken from the table <track>_intervals in the
    database *dbhandle* and saved to *filename* in :term:`fasta`
    format. Returns the number of sequences written.

    If *halfwidth* is set, only the region of *halfwidth* bases on
    either side of the peak center is output, otherwise the whole
    interval. *full* is accepted for compatibility, but is not
    consulted by this function.

    If *maxsize* is set, the output is truncated once *maxsize*
    characters have been collected in order to avoid jobs that take
    too long.

    If *proportion* is set, only the top *proportion* of intervals
    (plus one) are output, but at least *min_sequences* if that is
    set. Otherwise, if *num_sequences* is set, the first
    *num_sequences* intervals are used.

    *offset* is added to the start and end of each region. Regions
    are truncated at the contig boundaries and regions that are empty
    afterwards are ignored.

    If *shuffled* is set, each sequence is shuffled. Shuffling is
    done on the unmasked sequence and masking is applied afterwards.

    *masker* is a masker name or a list of masker names that are
    applied in turn with ``maskSequences``, for example:

    * dust, dustmasker: apply dustmasker
    * softmask: mask softmasked genomic regions

    The values ``unmasked``, ``none`` and None are ignored.

    *order* is the order by which peaks are sorted. Possible
    values are 'peakval' (peak value, descending order) and 'score'
    or 'max' (peak score, descending order). Any other value raises
    a ValueError.

    If *shift* is ``leftright``, each interval is replaced by two
    intervals of the same size to its left and right
    (``<interval_id>_left`` and ``<interval_id>_right``). They keep
    the peak center of the original interval.
    '''
    # A default of None avoids a shared mutable default argument. A
    # single masker name is wrapped into a list - iterating over the
    # string would apply one "masker" per character.
    if masker is None:
        masker = []
    maskers = [masker] if isinstance(masker, str) else list(masker)

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(P.get_params()["genome_dir"],
                     P.get_params()["genome"]))

    if order == "peakval":
        orderby = " ORDER BY peakval DESC"
    elif order in ("max", "score"):
        # 'max' is the historic name, 'score' the documented one
        orderby = " ORDER BY score DESC"
    else:
        raise ValueError(
            "Unknown value passed as order parameter, check your ini file")

    tablename = "%s_intervals" % P.tablequote(track)
    statement = '''SELECT contig, start, end, interval_id, peakcenter FROM %(tablename)s ''' % {"tablename": tablename} + orderby

    cc = dbhandle.execute(statement)
    data = cc.fetchall()
    cc.close()

    if proportion:
        cutoff = int(len(data) * proportion) + 1
        if min_sequences:
            cutoff = max(cutoff, min_sequences)
    elif num_sequences:
        cutoff = num_sequences
    else:
        cutoff = len(data)

    L.info(
        "writeSequencesForIntervals %s: using at most %i sequences for pattern finding"
        % (track, cutoff))
    data = data[:cutoff]

    L.info("writeSequencesForIntervals %s: masker=%s" % (track, str(masker)))

    # modify the ranges
    if shift:
        if shift == "leftright":
            new_data = [(contig, start - (end - start), start,
                         str(interval_id) + "_left", peakcenter)
                        for contig, start, end, interval_id, peakcenter
                        in data]
            new_data.extend([
                (contig, end, end + (end - start),
                 str(interval_id) + "_right", peakcenter)
                for contig, start, end, interval_id, peakcenter in data
            ])
            data = new_data

    if halfwidth:
        # center around peakcenter, add halfwidth on either side
        data = [(contig, peakcenter - halfwidth, peakcenter + halfwidth,
                 interval_id)
                for contig, start, end, interval_id, peakcenter in data]
    else:
        # remove peakcenter
        data = [(contig, start, end, interval_id)
                for contig, start, end, interval_id, peakcenter in data]

    # get the sequences - cut at number of nucleotides
    sequences = []
    current_size = 0
    new_data = []
    for contig, start, end, interval_id in data:
        lcontig = fasta.getLength(contig)
        start, end = max(0, start + offset), min(end + offset, lcontig)
        if start >= end:
            # report the interval id (the original formatted the
            # builtin function ``id`` here)
            L.info(
                "writeSequencesForIntervals %s: sequence %s is empty: start=%i, end=%i, offset=%i - ignored"
                % (track, interval_id, start, end, offset))
            continue
        seq = fasta.getSequence(contig, "+", start, end)
        sequences.append(seq)
        new_data.append((start, end, interval_id, contig))
        current_size += len(seq)
        if maxsize and current_size >= maxsize:
            # the sequence reaching the limit is still part of the
            # output, hence count the collected sequences
            L.info(
                "writeSequencesForIntervals %s: maximum size (%i) reached - only %i sequences output (%i ignored)"
                % (track, maxsize, len(sequences),
                   len(data) - len(sequences)))
            break

    data = new_data

    if shuffled:
        # note that shuffling is done on the unmasked sequences
        # Otherwise N's would be interspersed with real sequence
        # messing up motif finding unfairly. Instead, masking is
        # done on the shuffled sequence.
        sequences = [list(x) for x in sequences]
        for sequence in sequences:
            random.shuffle(sequence)
        # NOTE(review): *masker* is passed on as given (possibly a
        # list) and the maskers are applied again below - confirm that
        # maskSequences and the callers expect this.
        sequences = maskSequences(["".join(x) for x in sequences], masker)

    for current_masker in maskers:
        if current_masker not in ("unmasked", "none", None):
            sequences = maskSequences(sequences, current_masker)

    c = E.Counter()
    with iotools.open_file(filename, "w") as outs:
        for sequence, d in zip(sequences, data):
            c.input += 1
            if len(sequence) == 0:
                c.empty += 1
                continue
            start, end, interval_id, contig = d
            header = "%s_%s %s:%i-%i" % (track, str(interval_id),
                                         contig, start, end)
            outs.write(">%s\n%s\n" % (header, sequence))
            c.output += 1

    E.info("%s" % c)

    return c.output
def processReads(infile, outfiles):
    '''process reads from .fastq and other sequence files.

    A ``preprocess.MasterProcessor`` is set up and one processing step
    is added per tool listed in the ``preprocessors`` parameter, in the
    given order. An unknown tool raises a NotImplementedError.

    For trimmomatic, an ``ILLUMINACLIP`` step is put in front of the
    configured options if ``auto_remove`` is set (using
    ``contaminants.fasta``) or else if ``trimmomatic_adapter`` is set.
    For cutadapt, ``contaminants.fasta`` is added as adapter file if
    ``auto_remove`` is set.

    *outfiles* is not used; output is built with the prefix
    ``processed.dir/trimmed-`` and the track derived from *infile*.
    '''
    pipeline_params = P.get_params()

    trimmomatic_options = pipeline_params["trimmomatic_options"]

    # decide where adapter sequences for ILLUMINACLIP come from
    if pipeline_params["auto_remove"]:
        adapter_source = "contaminants.fasta"
    elif pipeline_params["trimmomatic_adapter"]:
        adapter_source = pipeline_params["trimmomatic_adapter"]
    else:
        adapter_source = None

    if adapter_source is not None:
        clip_fields = (
            adapter_source,
            pipeline_params["trimmomatic_mismatches"],
            pipeline_params["trimmomatic_p_thresh"],
            pipeline_params["trimmomatic_c_thresh"],
            pipeline_params["trimmomatic_min_adapter_len"],
            pipeline_params["trimmomatic_keep_both_reads"])
        trimmomatic_options = \
            " ILLUMINACLIP:%s:%s:%s:%s:%s:%s " % clip_fields + \
            trimmomatic_options

    # NOTE(review): job_threads and job_memory are not passed on
    # explicitly; presumably picked up by P.run from the local scope.
    job_threads = pipeline_params["threads"]
    job_memory = "12G"

    track = re.match(REGEX_TRACK, infile).groups()[0]

    m = preprocess.MasterProcessor(
        save=pipeline_params["save"],
        summarize=pipeline_params["summarize"],
        threads=pipeline_params["threads"],
        qual_format=pipeline_params['qual_format'])

    # tools that only take their "<tool>_options" and a thread count
    simple_tools = {
        "fastx_trimmer": "FastxTrimmer",
        "sickle": "Sickle",
        "trimgalore": "Trimgalore",
        "flash": "Flash",
        "pandaseq": "Pandaseq",
    }

    for tool in P.as_list(pipeline_params["preprocessors"]):
        if tool in simple_tools:
            processor_class = getattr(preprocess, simple_tools[tool])
            m.add(processor_class(
                pipeline_params[tool + "_options"],
                threads=pipeline_params["threads"]))
        elif tool == "trimmomatic":
            m.add(preprocess.Trimmomatic(
                trimmomatic_options,
                threads=pipeline_params["threads"]))
        elif tool == "reversecomplement":
            m.add(preprocess.ReverseComplement(
                pipeline_params["reversecomplement_options"]))
        elif tool == "cutadapt":
            cutadapt_options = pipeline_params["cutadapt_options"]
            if pipeline_params["auto_remove"]:
                cutadapt_options += " -a file:contaminants.fasta "
            m.add(preprocess.Cutadapt(
                cutadapt_options,
                threads=pipeline_params["threads"],
                untrimmed=pipeline_params['cutadapt_reroute_untrimmed'],
                process_paired=pipeline_params["cutadapt_process_paired"]))
        else:
            raise NotImplementedError("tool '%s' not implemented" % tool)

    statement = m.build((infile,), "processed.dir/trimmed-", track)
    P.run(statement)
def loadMAST(infile, outfile):
    '''parse mast file and load into database.

    *infile* consists of several chunks, each introduced by a line of
    the form ``:: motif = <motif> - <part> ::``. Chunks are expected in
    pairs: a ``foreground`` chunk followed by the ``background`` chunk
    of the same motif.

    All motif runs are added to the same table. For each foreground
    match, columns for the control data (left ``l`` and right ``r``
    background matches) are added as well. If there is no control
    match for an interval, the ``mast_evalue`` parameter is used as its
    evalue.
    '''
    header = (MAST.Match().header + "\tmotif\tcontig"
              "\tl_evalue\tl_pvalue\tl_nmatches\tl_length\tl_start\tl_end"
              "\tr_evalue\tr_pvalue\tr_nmatches\tr_length\tr_start\tr_end"
              "\tmin_evalue\tmin_pvalue\tmax_nmatches" + "\n")

    with iotools.open_file(infile) as inf:
        lines = inf.readlines()

    # line numbers at which chunks start, terminated by the file length
    chunks = [x for x, line in enumerate(lines) if line.startswith("::")]
    chunks.append(len(lines))

    def readChunk(lines, chunk):
        '''parse chunk number *chunk*; returns (motif, part, mast).'''
        # validate the chunk header before creating a temporary file
        try:
            motif, part = re.match(r":: motif = (\S+) - (\S+) ::",
                                   lines[chunks[chunk]]).groups()
        except AttributeError:
            raise ValueError("parsing error in line '%s'" %
                             lines[chunks[chunk]])

        E.info("reading %s - %s" % (motif, part))

        # use real file, as MAST parser can not deal with a
        # list of lines
        tmpfile2 = P.get_temp_file(".")
        try:
            tmpfile2.write(
                "".join(lines[chunks[chunk] + 1:chunks[chunk + 1]]))
            tmpfile2.close()
            mast = MAST.parse(iotools.open_file(tmpfile2.name, "r"))
        finally:
            tmpfile2.close()
            os.unlink(tmpfile2.name)

        return motif, part, mast

    def splitId(s, mode):
        '''split a match id *s* into its parts.

        A background (``bg``) id has three parts: track _ id _ pos,
        a foreground (``fg``) id has two: track _ id.

        track might contain '_'.
        '''
        d = s.split("_")
        if mode == "bg":
            return "_".join(d[:-2]), d[-2], d[-1]
        elif mode == "fg":
            return "_".join(d[:-1]), d[-1]

    tmpfile = P.get_temp_file(".")
    try:
        tmpfile.write(header)

        for chunk in range(0, len(chunks) - 1, 2):

            motif_fg, part, mast_fg = readChunk(lines, chunk)
            assert part == "foreground"
            motif_bg, part, mast_bg = readChunk(lines, chunk + 1)
            assert part == "background"
            assert motif_fg == motif_bg

            # index control data
            controls = collections.defaultdict(dict)
            for match in mast_bg.matches:
                track, interval_id, pos = splitId(match.id, "bg")
                controls[interval_id][pos] = (
                    match.evalue, match.pvalue, match.nmotifs,
                    match.length, match.start, match.end)

            for match in mast_fg.matches:
                # remove track and pos
                track, match.id = splitId(match.id, "fg")
                # move to genomic coordinates
                contig, start, end = re.match(
                    r"(\S+):(\d+)..(\d+)", match.description).groups()
                if match.nmotifs > 0:
                    start, end = int(start), int(end)
                    match.start += start
                    match.end += start
                    match.positions = [x + start for x in match.positions]

                interval_id = match.id
                if interval_id not in controls:
                    E.warn("no controls for %s - increase MAST evalue" %
                           interval_id)

                # substitute defaults for missing control matches
                if "l" not in controls[interval_id]:
                    controls[interval_id]["l"] = (
                        float(P.get_params()["mast_evalue"]), 1, 0, 0, 0, 0)
                if "r" not in controls[interval_id]:
                    controls[interval_id]["r"] = (
                        float(P.get_params()["mast_evalue"]), 1, 0, 0, 0, 0)

                left = controls[interval_id]["l"]
                right = controls[interval_id]["r"]
                min_evalue = min(left[0], right[0])
                min_pvalue = min(left[1], right[1])
                max_nmatches = max(left[2], right[2])

                tmpfile.write(
                    str(match) + "\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (
                        motif_fg,
                        contig,
                        "\t".join(map(str, left)),
                        "\t".join(map(str, right)),
                        str(min_evalue),
                        str(min_pvalue),
                        str(max_nmatches),
                    ) + "\n")

        tmpfile.close()

        P.load(tmpfile.name,
               outfile,
               options="--add-index=id "
               "--add-index=motif "
               "--add-index=id,motif "
               "--allow-empty-file "
               "--map=base_qualities:text")
    finally:
        # closing twice is harmless; make sure the temporary file is
        # removed even if parsing or loading failed.
        tmpfile.close()
        os.unlink(tmpfile.name)
# unprocessed files REGEX_TRACK_BOTH = r"(processed.dir/)*([^/]+)\.(fastq.1.gz|fastq.gz|sra|csfasta.gz|remote)" SEQUENCEFILES_REGEX = r"([^/]+).(?P<suffix>fastq.1.gz|fastq.gz|sra|csfasta.gz|remote)" def connect(): ''' Setup a connection to an sqlite database ''' dbh = sqlite3.connect(P.get_params()['database']) return dbh @transform(P.get_params()["input_globs"].get("default", INPUT_FORMATS), regex("(.*)"), r"\1") def unprocessReads(infiles, outfiles): """dummy task - no processing of reads.""" # if preprocess tools are specified, preprocessing is done on output that has # already been generated in the first run if P.get_params().get("preprocessors", None): if P.get_params()["auto_remove"]: # check if FastQC has been run for x in iotools.flatten([glob.glob(y) for y in P.get_params()["input_globs"].get("default", INPUT_FORMATS)]): f = "fastqc.dir/" + re.match(REGEX_TRACK, x).group(1) + ".fastqc" if not os.path.exists(f): raise ValueError(
def run_report(clean=True,
               with_pipeline_status=True,
               pipeline_status_format="svg"):
    '''run cgatreport.

    This will also run ruffus to create an image of the pipeline
    status (in *pipeline_status_format*, svg by default) unless
    *with_pipeline_status* is set to False. The image will be saved
    into the export directory.

    If *clean* is set, the directories ``report``, ``_cache`` and
    ``_static`` are removed before the report is built.

    The report engine is taken from the ``report_engine`` parameter
    and has to be ``cgatreport`` (default) or ``sphinxreport``.
    '''
    params = P.get_params()

    if with_pipeline_status:
        targetdir = params["exportdir"]
        # also works if the directory exists or is nested
        os.makedirs(targetdir, exist_ok=True)

        ruffus.pipeline_printout_graph(
            os.path.join(targetdir, "pipeline.%s" % pipeline_status_format),
            pipeline_status_format,
            ["full"],
            checksum_level=params["ruffus_checksums_level"])

    dirname, basename = os.path.split(P.get_caller().__file__)

    report_engine = params.get("report_engine", "cgatreport")
    assert report_engine in ('sphinxreport', 'cgatreport')

    # NOTE: clean, docdir, themedir, syspath, xvfb_command,
    # report_engine, conf_dir and erase_return are referenced by name
    # through the placeholders in the statement below.
    docdir = os.path.join(dirname, "pipeline_docs",
                          iotools.snip(basename, ".py"))
    themedir = os.path.join(dirname, "pipeline_docs", "themes")

    # use a fake X display in order to avoid windows popping up
    # from R plots.
    xvfb_command = iotools.which("xvfb-run")

    # permit multiple servers using -d option
    if xvfb_command:
        xvfb_command += " -d "
    else:
        xvfb_command = ""

    if os.path.exists("conf.py"):
        conf_dir = os.path.abspath(".")
    else:
        conf_dir = os.path.join(os.path.dirname(__file__), "configuration")

    # in the current version, xvfb always returns with an error, thus
    # ignore these. (A previous distinction on whether DISPLAY is set
    # was overridden by this and has been removed.)
    erase_return = "|| true"

    if clean:
        clean = "rm -rf report _cache _static;"
    else:
        clean = ""

    # with sphinx >1.3.1 the PYTHONPATH needs to be set explicitly as
    # the virtual environment seems to be stripped. It is thus set to
    # the contents of the current sys.path
    syspath = ":".join(sys.path)

    statement = ''' %(clean)s (export SPHINX_DOCSDIR=%(docdir)s; export SPHINX_THEMEDIR=%(themedir)s; export PYTHONPATH=%(syspath)s; %(xvfb_command)s %(report_engine)s-build --num-jobs=%(report_threads)s sphinx-build -b html -d %(report_doctrees)s -c %(conf_dir)s -j %(report_threads)s %(docdir)s %(report_html)s >& report.log %(erase_return)s ) '''
    P.run(statement)

    E.info(
        'the report is available at %s' %
        os.path.abspath(os.path.join(params['report_html'],
                                     "contents.html")))