def summarizeFastQC(infiles, outfiles):
    """Collect FastQC results for all inputs and write one table per key.

    For every input the ``.fastqc`` suffix is removed and all files
    matching ``<track>*_fastqc/fastqc_data.txt`` are collected. The
    combined list is passed to ``PipelineReadqc.read_fastqc``; each entry
    of the returned mapping is written as a tab-separated table. The
    output name is ``outfiles[0]`` with "basic_statistics" replaced by
    the entry's key.
    """
    fastqc_data = []
    for infile in infiles:
        pattern = os.path.join(
            P.snip(infile, ".fastqc") + "*_fastqc", "fastqc_data.txt")
        fastqc_data += glob.glob(pattern)

    tables = PipelineReadqc.read_fastqc(fastqc_data)
    for section, table in tables.items():
        target = re.sub("basic_statistics", section, outfiles[0])
        E.info("writing to {}".format(target))
        with IOTools.open_file(target, "w") as handle:
            table.to_csv(handle, sep="\t", index=True)
def removeObservationsPerc(self, percentile_rowsums=10):
    """Drop the rows with the lowest totals from ``self.table``.

    Rows (e.g. genes) are compared by their sum across columns. Rows
    whose sum is below the ``percentile_rowsums``-th percentile of all
    row sums are removed and ``self.table`` is replaced by the result.
    """
    row_totals = self.table.sum(1)
    cutoff = row_totals.quantile(float(percentile_rowsums) / 100.0)
    take = row_totals >= cutoff

    n_keep = sum(take)
    E.info("percentile filtering at level %f: keep=%i, discard=%i" %
           (percentile_rowsums, n_keep, len(take) - n_keep))

    self.table = self.table[take]
def buildMisprimingLib(infiles, outfile):
    """Build a fasta file of sequences to check for mispriming.

    Parameters
    ----------
    infiles : tuple
        ``(fasta, identifiers)``: the fasta file to filter and a file of
        identifiers that is parsed with ``readIdentifiers``.
    outfile : str
        Filename of the fasta file to write.
    """
    fasta, identifiers = infiles

    E.info("reading ids for sequences to keep")
    ids = readIdentifiers(identifiers)

    E.info("collecting sequences")
    # The fasta file is opened exactly once and both handles are closed
    # even if iteration fails (the previous version opened the input
    # twice and never closed either input handle).
    #
    # NOTE(review): the log message says "ids for sequences to keep", but
    # only records whose title is NOT in ``ids`` are written. The original
    # condition is preserved - confirm which of the two is intended.
    with IOTools.open_file(fasta) as inf, \
            IOTools.open_file(outfile, "w") as outf:
        for f in FastaIterator.iterate(inf):
            if f.title not in ids:
                outf.write(">%s\n%s\n" % (f.title, f.sequence))
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Input lines are copied to stdout. Lines starting with ``#`` are always
    copied. The first other line is remembered as the header; every later
    line identical to it is dropped.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])
    parser.set_defaults()

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    # no arguments or a single "-" means: read from stdin
    read_stdin = len(args) == 0 or (len(args) == 1 and args[0] == "-")
    infile = options.stdin if read_stdin else fileinput.FileInput(args)

    ninput, nskipped, noutput = 0, 0, 0
    header = False
    for line in infile:
        ninput += 1
        if not line.startswith("#"):
            if not header:
                header = line
            elif line == header:
                # repeated header: drop it
                nskipped += 1
                continue
        options.stdout.write(line)
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.stop()
def run(self, *args, **PARAMS):
    """Run the reference matching tool and its post-processing step.

    ``self.buildStatement(**PARAMS)`` supplies the command statement and
    the job options. The statement is submitted with ``P.run``. If
    ``self.postProcess(**PARAMS)`` returns a non-empty statement, it is
    submitted with the same job options.
    """
    # Custom command to run reference matching tool.
    statement, run_options = self.buildStatement(**PARAMS)

    # Logging: only the input files that are actually set are reported.
    inputs = (self.fastn1, self.fastn2, self.fastn3)
    runfiles = '\t'.join([os.path.basename(x) for x in inputs if x])
    E.info("Running sortMeRNA for files: {}".format(runfiles))

    P.run(statement, job_options=run_options)

    # Post process results into generic output for downstream tasks.
    statement = self.postProcess(**PARAMS)
    if statement:
        # Pass the options by keyword, exactly as for the first call;
        # previously they were passed positionally.
        P.run(statement, job_options=run_options)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Input lines are copied to stdout. Lines starting with ``#`` are always
    copied. The first other line is remembered as the header; every later
    line identical to it is dropped.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)
    parser.add_argument("--version", action='version', version="1.0")
    parser.set_defaults()

    # add common options (-h/--help, ...) and parse command line
    (args, unknown) = E.start(parser, argv=argv, unknowns=True)

    if len(unknown) == 0 or (len(unknown) == 1 and unknown[0] == "-"):
        infile = args.stdin
    else:
        # The filenames are the unparsed arguments in ``unknown``.
        # ``args`` is the parsed namespace and must not be handed to
        # FileInput (that was the previous behaviour).
        infile = fileinput.FileInput(unknown)

    ninput, nskipped, noutput = 0, 0, 0
    header = False

    for line in infile:
        ninput += 1
        if line.startswith("#"):
            pass
        elif not header:
            header = line
        elif line == header:
            nskipped += 1
            continue

        args.stdout.write(line)
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.stop()
def estimateExpression(infiles, outfile):
    '''estimate expression levels.

    Loads the R package "affy", reads *infiles* with ``ReadAffy``, applies
    ``rma`` and draws a boxplot of the raw and of the normalized data. The
    assay data of the normalized set is printed. *outfile* is not used by
    this function.
    '''
    R.library("affy")

    E.info("reading data")
    raw_data = R.ReadAffy(infiles)

    E.info("RMA normalization")
    eset = R.rma(raw_data)

    # one boxplot before and one after normalization
    for dataset in (raw_data, eset):
        R.boxplot(dataset)

    print(R.as_list(R.assayData(eset)))
def segmentFixedWidthWindows(infile, window_size, window_shift):
    """return a list of fixed contig sizes.

    Each sequence read from *infile* is cut into windows of
    *window_size* bases. Windows in which more than half of
    *window_size* is neither G, C, A nor T are skipped.

    Returns a list of tuples ``(contig, start, end, gc_fraction)`` where
    ``contig`` is the sequence title up to the first whitespace and
    ``end`` is always ``start + window_size``.
    """
    ninput, nskipped, noutput = 0, 0, 0

    iterator = FastaIterator.FastaIterator(infile)

    # NOTE(review): the *window_shift* argument is overridden here, so
    # windows never overlap regardless of what the caller passes. This is
    # kept as is - confirm whether it is intentional.
    window_shift = window_size

    # at most 50% can be gap
    gap_cutoff = int(window_size // 2)
    segments = []

    while True:
        try:
            cur_record = next(iterator)
        except StopIteration:
            break

        if cur_record is None:
            break

        # count only records that were actually read (the counter used to
        # be incremented before detecting the end of input)
        ninput += 1

        # raw string: "\s" in a plain literal is an invalid escape sequence
        contig = re.sub(r"\s.*", "", cur_record.title)
        seq = cur_record.sequence
        size = len(seq)

        for x in range(0, size, window_shift):
            s = seq[x:x + window_size].upper()
            gc = s.count("G") + s.count("C")
            at = s.count("A") + s.count("T")

            # skip segments containing mostly gaps; a short last window
            # is measured against the full window size
            if window_size - (gc + at) > gap_cutoff:
                nskipped += 1
                continue

            segments.append(
                (contig, x, x + window_size, float(gc) / (gc + at)))
            noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped_windows=%i" %
           (ninput, noutput, nskipped))

    return segments
def make_report():
    '''
    Generates html and pdf versions of restructuredText files
    using sphinx-quickstart pre-configured files (conf.py and Makefile).
    Pre-configured files need to be in a pre-existing report directory.
    Existing reports are overwritten.

    The function reads ``report_dir``, which is not defined in this
    function. If that directory exists and is empty, the templates are
    copied and the build is run with ``P.run``. In every other case the
    program exits with an explanatory message.
    '''
    report_path = os.path.abspath(
        os.path.join(os.path.dirname(__file__), 'pipeline_report'))
    print('Copying report templates from: {}'.format(report_path))

    if not (os.path.exists(report_dir) and os.path.isdir(report_dir)):
        sys.exit((
            ' The directory "pipeline_report" does not exist. '
            'Are the paths correct? '
            'Template files were tried to be copied from: {} '
            'You can also manually copy files and run "make html" or '
            '"make latexpdf". ').format(report_path))

    if os.listdir(report_dir):
        # Both placeholders refer to the report directory. The message
        # previously had two anonymous "{}" fields but only one argument,
        # which raised IndexError instead of printing the message.
        sys.exit((
            ' {0} exists, not overwriting. '
            'You can manually run: '
            'cd {0} ; '
            'ln -s ../pipeline.yml . ; '
            'make html ; '
            'ln -sf _build/html/report_XXXX.html . ; '
            'make latexpdf ; '
            'ln -sf _build/latex/XXXX.pdf . '
            'Or delete the folder and re-run make_report ').format(report_dir))

    # %(report_path)s is left in the statement for P.run; only the
    # directory to change into is filled in here.
    statement = (
        'cp %(report_path)s/* pipeline_report ; '
        'cd {} ; '
        'ln -s ../pipeline.yml . ; '
        'make html ; '
        'ln -sf _build/html/report_pipeline_pq_example.html . ; '
        'make latexpdf ; '
        'ln -sf _build/latex/pq_example.pdf . ').format(report_dir)

    E.info('''Building pdf and html versions of your rst files in
           {}.'''.format(report_dir))
    P.run(statement)
def renameChromosomes(iterator, chr_map):
    """Yield entries of *iterator* with their contig renamed via *chr_map*.

    Entries whose ``contig`` is not a key of *chr_map* are dropped. The
    entries are modified in place. A summary of the counts is logged once
    the input is exhausted.
    """
    ninput = noutput = nskipped = 0

    for bed in iterator:
        ninput += 1

        if bed.contig not in chr_map:
            nskipped += 1
            continue

        bed.contig = chr_map[bed.contig]
        noutput += 1
        yield bed

    E.info("ninput = %i, noutput=%i, nskipped=%i" %
           (ninput, noutput, nskipped))
def runCPC(infile, outfile):
    '''
    run coding potential calculations on lncRNA geneset

    The ``cpc.sh`` wrapper is called with *infile*, *outfile*, the working
    directory "cpc" and an evidence file whose name is *outfile* with the
    ".result" suffix replaced by ".evidence".
    '''
    # farm.py is called from within cpc.sh
    assert iotools.which("farm.py"), \
        "farm.py needs to be in $PATH for cpc to run"

    # Default cpc parameters don't work with later versions of blast
    E.info("Running cpc with blast version:%s" % iotools.which("blastx"))

    result_evidence = P.snip(outfile, ".result") + ".evidence"
    working_dir = "cpc"
    statement = ("%(pipeline_scriptsdir)s/cpc.sh"
                 " %(infile)s"
                 " %(outfile)s"
                 " %(working_dir)s"
                 " %(result_evidence)s")
    # The statement is handed to P.run explicitly; it used to be built
    # and then left out of the call.
    P.run(statement)
def make_mapped_matrix(map_dict, input_frame):
    '''
    return a matrix with integer labels from mapping

    The result is an ``int32`` array with one row per row of
    *input_frame* and one column per column of *input_frame*. Each cell
    holds ``map_dict[value]`` for the corresponding frame value.
    '''
    nindex = len(input_frame.index.tolist())
    ncols = len(input_frame.columns)
    integer_matrix = np.ndarray((nindex, ncols), dtype=np.int32)

    E.info("mapping cluster labels")
    for idx in range(nindex):
        row = input_frame.iloc[idx]
        for col in range(ncols):
            # NOTE(review): the row is indexed with ``col + 1``, which
            # presumably relies on the columns being labelled 1..ncols -
            # verify against the callers.
            integer_matrix[idx][col] = map_dict[row[col + 1]]

    return integer_matrix
def read_and_randomize_rows(infile, args):
    """read table from stdin and randomize rows, keeping header.

    If ``args.has_headers`` is set, the first line of *infile* is copied
    to ``args.stdout`` unchanged. All remaining lines are shuffled and
    written in one go. The counts are logged.
    """
    c = E.Counter()

    if args.has_headers:
        c.header += 1
        args.stdout.write(infile.readline())

    lines = infile.readlines()
    c.lines_input = len(lines)

    random.shuffle(lines)
    args.stdout.write("".join(lines))
    c.lines_output = len(lines)

    E.info(c)
def main(argv=None):
    """Split a fastq file into kept and dropped reads.

    Reads whose identifier (up to the first whitespace) is listed in the
    file given by ``--to-drop-single`` are written to ``--fastq-drop1``;
    all other reads of ``--fastq1`` are written to ``--fastq-out1``. The
    number and percentage of dropped reads is logged.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--fastq1", dest="fastq1")
    parser.add_option("--to-drop-single", dest='to_remove_singletons')
    parser.add_option("--fastq-out1", dest="fq_out1")
    parser.add_option("--fastq-drop1", dest="fq_dropped1")

    # hand the resolved argv on, so that an explicit *argv* is honoured
    (options, args) = E.start(parser, argv=argv)

    with IOTools.open_file(options.to_remove_singletons) as inf:
        reads_to_remove = {line.strip() for line in inf}

    reads = 0
    dropped_reads = 0
    # The input is taken from ``options.fastq1``; the bare name ``fastq1``
    # used previously is not defined in this function. All three files
    # are closed even if an error occurs.
    with IOTools.open_file(options.fastq1) as fastq_in, \
            IOTools.open_file(options.fq_out1, 'w') as fastq_out, \
            IOTools.open_file(options.fq_dropped1, 'w') as fastq_host:
        for read in Fastq.iterate(fastq_in):
            reads += 1
            record = "@%s\n%s\n+\n%s\n" % (read.identifier,
                                           read.seq,
                                           read.quals)
            if read.identifier.split()[0] in reads_to_remove:
                fastq_host.write(record)
                dropped_reads += 1
            else:
                fastq_out.write(record)

    # an empty input has nothing dropped
    if reads:
        percent_dropped = dropped_reads / float(reads) * 100
    else:
        percent_dropped = 0.0

    E.info('Dropped %i of %i reads (%f percent)'
           % (dropped_reads, reads, percent_dropped))
def renameChromosomes(gffs, chr_map):
    """Yield entries of *gffs* with their contig renamed via *chr_map*.

    Entries whose ``contig`` is not a key of *chr_map* are dropped. The
    entries are modified in place. A summary of the counts is logged once
    the input is exhausted.
    """
    ninput = noutput = nskipped = 0

    for gff in gffs:
        ninput += 1

        if gff.contig not in chr_map:
            nskipped += 1
            continue

        gff.contig = chr_map[gff.contig]
        noutput += 1
        yield gff

    E.info("ninput = %i, noutput=%i, nskipped=%i" %
           (ninput, noutput, nskipped))
def shiftIntervals(iterator, contigs, offset):
    """shift intervals by a certain offset and ensure size is maintained
    even if a contig end is reached.

    contigs is a dictionary of contig sizes. Intervals on contigs missing
    from *contigs*, with a negative coordinate, or ending beyond the
    contig are skipped. Intervals are modified in place and yielded; the
    counts are logged once the input is exhausted.
    """
    ninput, noutput = 0, 0
    nskipped_contig, nskipped_range = 0, 0

    for bed in iterator:
        ninput += 1

        if bed.contig not in contigs:
            nskipped_contig += 1
            continue
        contig_size = contigs[bed.contig]

        # intervals off either end of the contig are skipped; the end
        # test is strict because bed intervals are half-open
        if bed.start < 0 or bed.end < 0 or bed.end > contig_size:
            nskipped_range += 1
            continue

        noutput += 1

        # shift, then push the interval back inside the contig while
        # keeping its length
        length = bed.end - bed.start
        newstart, newend = bed.start + offset, bed.end + offset
        if newstart < 0:
            newstart, newend = 0, length
        if newend > contig_size:
            newstart, newend = contig_size - length, contig_size

        bed.start, bed.end = newstart, newend
        yield bed

    E.info("ninput=%i, noutput=%i, nskipped_contig=%i, nskipped_range=%i" %
           (ninput, noutput, nskipped_contig, nskipped_range))
def mergeBAMFiles(infiles, outfile):
    '''merge BAM files from the same experiment using user-defined regex

    For the mapping stages it is beneficial to perform mapping
    separately for each sequence read infile(s) per sample so that
    the consistency can be checked. However, for downstream tasks,
    the merged :term:`bam` alignment files are required.

    A single input file is not merged; *outfile* and its ``.bai`` index
    are created as soft links to it instead.

    Parameters
    ----------
    infiles : list
       list of :term:`bam` format alignment files
    outfile : str
       Output filename in :term:`bam` format

    Raises
    ------
    ValueError
       If ``merge_pattern_output`` is missing from, or empty in, PARAMS.
    '''
    if "merge_pattern_output" not in PARAMS or \
       not PARAMS["merge_pattern_output"]:
        raise ValueError(
            "no output pattern 'merge_pattern_output' specified")

    if len(infiles) == 1:
        # Test the link itself. The previous check joined the input path
        # with *outfile*, which never names an existing file, so a re-run
        # tried to create the link again.
        if os.path.lexists(outfile):
            E.info(
                "%(outfile)s: only one file for merging - softlink "
                "already exists" % locals())
            return

        E.info(
            "%(outfile)s: only one file for merging - creating "
            "softlink" % locals())
        # the link targets are relative names of the input file
        os.symlink(os.path.basename(infiles[0]), outfile)
        os.symlink(os.path.basename(infiles[0]) + ".bai",
                   outfile + ".bai")
        return

    infiles = " ".join(infiles)
    tmp_bam = P.get_temp_filename(".")

    statement = (
        ' samtools merge %(tmp_bam)s %(infiles)s >& %(outfile)s_merge.log '
        '&& samtools sort %(tmp_bam)s -o %(outfile)s '
        '&& samtools index %(outfile)s ')
    job_memory = '20G'
    P.run(statement)
def runIDROnPooledPseudoreplicates(infiles, outfile):
    """
    Run IDR analysis on pooled pseudoreplicates for each EXPERIMENT

    The first two entries of *infiles* are compared. The statement is
    built by ``IDR.getIDRStatement`` from the overlap ratio and ranking
    measure in PARAMS and the contig table in the annotations directory.
    """
    job_memory = "5G"

    # location of the contig table
    chr_table = os.path.join(PARAMS["annotations_dir"],
                             PARAMS["annotations_interface_contigs"])

    E.info("applyIDR: processing %s and %s" % (infiles[0], infiles[1]))

    statement = IDR.getIDRStatement(
        infiles[0],
        infiles[1],
        outfile,
        PARAMS["idr_options_overlap_ratio"],
        PARAMS["idr_options_ranking_measure"],
        chr_table)

    P.run()
def count(self, filename1, filename2):
    """count overlap between two bed files.

    Each file is counted against an index built from the other one. The
    totals, the overlapping counts and the derived unique counts are
    stored in the ``mExons*`` and ``mBases*`` attributes, suffixed ``1``
    for *filename1* and ``2`` for *filename2*.
    """
    E.info("counting started for %s versus %s" % (filename1, filename2))

    # first file against the second
    idx2 = self.buildIndex(filename2)
    forward = self._count(filename1, idx2)
    (self.mExons1, self.mExonsOverlapping1,
     self.mBases1, self.mBasesOverlapping1) = forward
    self.mExonsUnique1 = self.mExons1 - self.mExonsOverlapping1
    self.mBasesUnique1 = self.mBases1 - self.mBasesOverlapping1

    # second file against the first
    idx1 = self.buildIndex(filename1)
    backward = self._count(filename2, idx1)
    (self.mExons2, self.mExonsOverlapping2,
     self.mBases2, self.mBasesOverlapping2) = backward
    self.mExonsUnique2 = self.mExons2 - self.mExonsOverlapping2
    self.mBasesUnique2 = self.mBases2 - self.mBasesOverlapping2
def count(self, filename, track):
    """count overlap between two gtf files.

    *filename* is counted against the stored index ``self.mIndices[track]``
    (attributes suffixed ``1``). Then an index is built from *filename*
    and the stored index is counted against it (attributes suffixed ``2``).
    """
    E.info("counting started for %s versus %s" % (filename, track))

    # file against the stored index of the track
    forward = self._count(filename, self.mIndices[track])
    (self.mExons1, self.mExonsOverlapping1,
     self.mBases1, self.mBasesOverlapping1) = forward
    self.mExonsUnique1 = self.mExons1 - self.mExonsOverlapping1
    self.mBasesUnique1 = self.mBases1 - self.mBasesOverlapping1

    # count index against index
    idx = self.buildIndex(filename)
    backward = self._countIndices(self.mIndices[track], idx)
    (self.mExons2, self.mExonsOverlapping2,
     self.mBases2, self.mBasesOverlapping2) = backward
    self.mExonsUnique2 = self.mExons2 - self.mExonsOverlapping2
    self.mBasesUnique2 = self.mBases2 - self.mBasesOverlapping2
def getTables(dbname):
    '''
    Retrieves the names of all tables in the database.
    Groups tables into dictionaries by annotation

    "ensemblg2" is removed from each table name and the remainder is
    split at "$". The first part is the key of the returned dictionary;
    the second part is appended to the list stored under that key.
    '''
    dbh = sqlite3.connect(dbname)
    c = dbh.cursor()
    c.execute("SELECT name FROM sqlite_master WHERE type='table'")
    tables = c.fetchall()
    c.close()
    dbh.close()

    D = {}
    for (name,) in tables:
        parts = name.replace("ensemblg2", "").split("$")
        E.info(parts)
        D.setdefault(parts[0], []).append(parts[1])
    return D
def __init__(self, filename, *args, **kwargs):
    """Read and index the intervals in *filename*.

    The intervals are indexed per track with ``Bed.readAndIndex``. For
    every track two headers are registered, ``<track>_nover`` and
    ``<track>_bases``. Remaining arguments are passed on to ``Counter``.
    """
    assert filename is not None,\
        "please supply filename for CounterOverlap"

    Counter.__init__(self, *args, **kwargs)

    self.filename = filename

    E.info("reading intervals from %s" % self.filename)
    handle = iotools.open_file(self.filename, "r")
    self.index = Bed.readAndIndex(handle, per_track=True)
    E.info("read intervals for %s tracks" % len(self.index))

    self.tracks = list(self.index.keys())
    self.headers = [template % track
                    for track in self.tracks
                    for template in ("%s_nover", "%s_bases")]
def buildGenomeAlignment(infiles, outfile):
    '''build pairwise genomic aligment from axt files.

    An existing *outfile* is removed first. Each input file is then
    converted with axtToPsl, swapped with pslSwap and appended,
    gzip-compressed, to *outfile*.
    '''
    # start from scratch, results are appended below
    try:
        os.remove(outfile)
    except OSError:
        pass

    # P.run() is called without arguments, so ``statement`` and ``infile``
    # must keep these names.
    statement = (
        'gunzip < %(infile)s '
        '| axtToPsl /dev/stdin %(query)s.sizes %(target)s.sizes /dev/stdout '
        '| pslSwap /dev/stdin /dev/stdout '
        '| gzip >> %(outfile)s ')

    for infile in infiles:
        E.info("adding %s" % infile)
        P.run()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    The first ``--keep-header`` lines of stdin are copied to stdout
    unchanged; all remaining lines are written in random order.
    """
    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])
    parser.add_option(
        "-k", "--keep-header", dest="keep_header", type="int",
        help="randomize, but keep header in place [%default]")
    parser.set_defaults(keep_header=0)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    inf, outf = options.stdin, options.stdout
    c = E.Counter()

    # header lines keep their position
    for _ in range(options.keep_header):
        c.header += 1
        outf.write(inf.readline())

    lines = inf.readlines()
    c.lines_input = len(lines)

    random.shuffle(lines)
    for shuffled in lines:
        outf.write(shuffled)
    c.lines_output = len(lines)

    E.info(c)

    # write footer and output benchmark information.
    E.stop()
def loadBAMStats(infiles, outfile):
    '''Import bam statistics into SQLite

    The first two columns of all *infiles* are combined into one table,
    transposed and loaded into the table derived from *outfile*. The
    files with the additional suffixes ``.nm`` and ``.nh`` are combined
    and loaded into ``<table>_nm`` and ``<table>_nh``. The output of the
    load commands is collected in *outfile*.
    '''
    # P.run() is called without arguments, so the variables referenced in
    # the statements must keep their names.
    scriptsdir = PARAMS["general_scriptsdir"]
    header = ",".join(
        P.snip(os.path.basename(x), ".readstats") for x in infiles)
    filenames = " ".join("<( cut -f 1,2 < %s)" % x for x in infiles)
    tablename = P.toTable(outfile)

    E.info("loading bam stats - summary")
    statement = (
        'cgat combine_tables --header-names=%(header)s --missing-value=0 '
        '--ignore-empty %(filenames)s '
        '| perl -p -e "s/bin/track/" '
        '| perl -p -e "s/unique/unique_alignments/" '
        '| cgat table2table --transpose '
        '| cgat csv2db --allow-empty-file --add-index=track '
        '--table=%(tablename)s > %(outfile)s')
    P.run()

    # the same statement is used for both suffixes
    statement = (
        'cgat combine_tables --header-names=%(header)s --skip-titles '
        '--missing-value=0 --ignore-empty %(filenames)s '
        '| perl -p -e "s/bin/%(suffix)s/" '
        '| cgat csv2db --table=%(tname)s --allow-empty-file '
        '>> %(outfile)s ')
    for suffix in ("nm", "nh"):
        E.info("loading bam stats - %s" % suffix)
        filenames = " ".join("%s.%s" % (x, suffix) for x in infiles)
        tname = "%s_%s" % (tablename, suffix)
        P.run()
def summarizeReadCounts(infiles, outfile):
    '''Calculate the number of reads lost at each step for each sample

    Each input is a headerless table whose first column holds the step
    name and whose second column holds the read count. One line is
    written per sample: the input and output read counts, the reads lost
    at each step, and the same values as percentages of the input reads.
    '''
    header = ("sample_id\tinput_reads\toutput_reads\tduplicates\t"
              "adapter_contamination\trRNA\thost\tlow_complexity\t"
              "duplicates_percent\tadapters_percent\trrna_percent\t"
              "host_percent\tlow_complexity_perc\tremaining_percent\n")

    with IOTools.open_file(outfile, 'w') as outf:
        outf.write(header)

        for infile in infiles:
            sample_id = P.snip(os.path.basename(infile),
                               '_read_count_summary.tsv')
            E.info('Processing sample %s' % sample_id)

            df = pd.read_table(infile, index_col=0, header=None)
            deadapt = df.loc['deadapt', 1]
            deduped = df.loc['deduped', 1]
            rrna = df.loc['rRNAremoved', 1]
            dehost = df.loc['dehost', 1]
            masked = df.loc['masked', 1]
            input_reads = df.loc['input', 1]

            def percent(count):
                # share of the input reads, in percent with two decimals
                return round(count / float(input_reads) * 100, 2)

            # reads lost at each successive step
            lost = [input_reads - deduped,
                    deduped - deadapt,
                    deadapt - rrna,
                    rrna - dehost,
                    dehost - masked]

            fields = [sample_id, input_reads, masked]
            fields += lost
            fields += [percent(x) for x in lost]
            fields.append(percent(masked))

            outf.write('\t'.join(map(str, fields)) + '\n')
def extractControllLncRNAFastaAlignments(infiles, outfile):
    """Extract alignment blocks of gene models from a compressed MAF file.

    *infiles* is ``(bed_file, maf_file)``. The MAF file is decompressed
    into a temporary file, which is passed together with the bed file to
    ``PipelineLncRNA.extractMAFGeneBlocks``. The number of extracted gene
    models is logged.
    """
    bed_file, maf_file = infiles
    maf_tmp = P.getTempFilename("/ifs/scratch")

    # The temporary file is removed even if decompression or extraction
    # fails (it used to be left behind in that case).
    try:
        # P.run() is called without arguments, so ``to_cluster`` and
        # ``statement`` must keep these names.
        to_cluster = False
        statement = ("gunzip -c %(maf_file)s > %(maf_tmp)s")
        P.run()

        target_genome = PARAMS["genome"]
        query_genome = PARAMS["phyloCSF_query_genome"]
        genome_file = os.path.join(PARAMS["genomedir"], PARAMS["genome"])

        gene_models = PipelineLncRNA.extractMAFGeneBlocks(bed_file,
                                                          maf_tmp,
                                                          genome_file,
                                                          outfile,
                                                          target_genome,
                                                          query_genome,
                                                          keep_gaps=False)
        E.info("%i gene_models extracted" % gene_models)
    finally:
        # guard so that a missing file cannot hide the original error
        if os.path.exists(maf_tmp):
            os.unlink(maf_tmp)
def readChunk(lines, chunk):
    """Parse one chunk of MAST output.

    The chunk starts at ``lines[chunks[chunk]]``, which must be a header
    of the form ``:: motif = <motif> - <part> ::``. The lines up to, but
    excluding, ``chunks[chunk + 1]`` are parsed with ``MAST.parse``.
    ``chunks`` is read from the enclosing scope.

    Returns the tuple ``(motif, part, mast)``.

    Raises ValueError if the header line can not be parsed.
    """
    first = chunks[chunk]
    header = lines[first]

    # Validate the header before any temporary file is created. The
    # pattern is a raw string because "\S" is not a valid string escape.
    match = re.match(r":: motif = (\S+) - (\S+) ::", header)
    if match is None:
        raise ValueError("parsing error in line '%s'" % header)
    motif, part = match.groups()

    E.info("reading %s - %s" % (motif, part))

    # use real file, as MAST parser can not deal with a
    # list of lines
    tmpfile2 = P.get_temp_file(".")
    try:
        try:
            tmpfile2.write("".join(lines[first + 1:chunks[chunk + 1]]))
        finally:
            tmpfile2.close()
        mast = MAST.parse(iotools.open_file(tmpfile2.name, "r"))
    finally:
        # remove the temporary file even if writing or parsing failed
        os.unlink(tmpfile2.name)

    return motif, part, mast
def write_config_files(pipeline_path, general_path):
    '''create default configuration files in the current directory.

    For each configuration file, *pipeline_path* and then *general_path*
    are searched and the first copy found is copied into the current
    directory. Files that already exist are left untouched.

    Raises
    ------
    ValueError
        If a configuration file is found in neither directory.
    '''
    paths = [pipeline_path, general_path]
    config_files = ['pipeline.yml']

    for dest in config_files:
        if os.path.exists(dest):
            E.warn("file `%s` already exists - skipped" % dest)
            continue

        for path in paths:
            src = os.path.join(path, dest)
            if os.path.exists(src):
                shutil.copyfile(src, dest)
                E.info("created new configuration file `%s` " % dest)
                break
        else:
            # name the file that is missing; the message used to show the
            # whole list of configuration files
            raise ValueError(
                "default config file `%s` not found in %s" %
                (dest, paths))
def annotate(infile, outfile, geneset):
    '''
    annotate NOGs into functional categories

    *geneset* is a tab-separated file in which the second column is the
    NOG and the fourth column its functional category. *infile* is a
    tab-separated table with a header line and the NOG in the first
    column. *outfile* receives every line of *infile* with the category
    appended as the additional column "taxa"; NOGs that are not in
    *geneset* are labelled "Function unknown".
    '''
    annotation = {}
    E.info("loading geneset")
    with iotools.openFile(geneset) as anno:
        for line in anno:
            # strip only the newline: slicing off the last character
            # loses data when the final line is not newline-terminated
            data = line.rstrip("\n").split("\t")
            nog, funccat = data[1], data[3]
            annotation[nog] = funccat
    E.info("finished loading gene set")

    E.info("annotating infile")
    # all handles are closed, also when an error occurs
    with iotools.openFile(infile) as inf, \
            iotools.openFile(outfile, "w") as outf:
        header = inf.readline()
        outf.write(header.rstrip("\n") + "\ttaxa\n")
        for line in inf:
            row = line.rstrip("\n")
            nog = row.split("\t")[0]
            pathway = annotation.get(nog, "Function unknown")
            outf.write(row + "\t" + pathway + "\n")