def chunk_iterator_regex_split(infile, args, prefix, use_header=False):
    """split where regular expression is true."""

    rex = args[0]
    chunk_size = args[2]
    max_lines = args[3]
    nlines = 0
    n = 0
    filename = "%s/%010i.in" % (prefix, n)
    outfile = iotools.openFile(filename, "w")

    for line in infile:
        if line[0] == "#":
            continue
        if rex.search(line[:-1]):
            if n > 0 and (n % chunk_size == 0 or
                          (max_lines and nlines > max_lines)):
                outfile.close()
                yield filename
                filename = "%s/%010i.in" % (prefix, n)
                outfile = iotools.openFile(filename, "w")
                nlines = 0

            n += 1

        outfile.write(line)
        nlines += 1

    outfile.close()
    yield filename

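# A minimal usage sketch for chunk_iterator_regex_split (hypothetical, not
# part of the pipeline). The args layout is inferred from the function body:
# args[0] = compiled regex marking the start of a record, args[2] = chunk
# size, args[3] = maximum lines per chunk; args[1] is unused here. File and
# directory names are made up for illustration.
def _example_chunk_iterator_regex_split():
    import re
    import tempfile

    chunk_dir = tempfile.mkdtemp()
    record_start = re.compile(r"^>")           # e.g. FASTA headers
    with open("sequences.fasta") as infile:    # hypothetical input file
        for chunk in chunk_iterator_regex_split(
                infile, (record_start, None, 100, None), chunk_dir):
            print("wrote chunk", chunk)
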
def annotate(infile, outfile, geneset):
    '''annotate NOGs into functional categories'''
    annotation = {}
    E.info("loading geneset")
    anno = iotools.openFile(geneset)
    for line in anno.readlines():
        data = line[:-1].split("\t")
        nog, funccat = data[1], data[3]
        annotation[nog] = funccat
    E.info("finished loading gene set")

    E.info("annotating infile")
    inf = iotools.openFile(infile)
    header = inf.readline()
    outf = iotools.openFile(outfile, "w")
    outf.write(header[:-1] + "\ttaxa\n")
    for line in inf.readlines():
        data = line[:-1].split("\t")
        nog = data[0]
        try:
            pathway = annotation[nog]
        except KeyError:
            pathway = "Function unknown"
        outf.write(line[:-1] + "\t" + pathway + "\n")
    outf.close()

def chunk_iterator_lines(infile, args, prefix, use_header=False):
    """split by lines."""

    chunk_size = args[0]
    n = 0
    filename = "%s/%010i.in" % (prefix, n)
    outfile = iotools.openFile(filename, "w")
    header = None

    for line in infile:
        if line[0] == "#":
            continue

        if not header and n == 0 and use_header:
            header = line
            outfile.write(header)
            continue

        n += 1

        if n % chunk_size == 0:
            outfile.close()
            yield filename
            filename = "%s/%010i.in" % (prefix, n)
            outfile = iotools.openFile(filename, "w")
            if header:
                outfile.write(header)

        outfile.write(line)

    outfile.close()
    yield filename

def makeCpgIslandsBed(outfile):
    infile = PARAMS["methylation_summary_cpgislands"]
    out = iotools.openFile(outfile, "w")
    with iotools.openFile(infile, "r") as f:
        for line in f.readlines():
            # this assumes location of req. values
            contig, start, end = line.split()[1:4]
            if not contig == "chrom":
                out.write("%s\t%s\t%s\n" % (contig, start, end))
    out.close()

def make1basedCpgIslands(infile, outfile):

    # outfile, loadfile = outfiles
    out = iotools.openFile(outfile, "w")
    out.write("%s\t%s\t%s\n" % ("contig", "position", "cpgi"))

    with iotools.openFile(infile, "r") as f:
        lines = f.readlines()
        for line in lines:
            contig, start, stop = line.split()
            for position in [x for x in range(int(start), int(stop) + 2)]:
                out.write("%s\t%s\t%s\n" % (contig, position, "CpGIsland"))
    out.close()

def __call__(self, filenames, outfile, options):

    for fi, fn in filenames:
        E.debug("# merging %s" % fn)
        infile = iotools.openFile(fn, "r")

        if options.output_header:
            self.parseHeader(infile, outfile, options)

        for l in infile:
            nfields = l.count("\t")

            if l[0] == "#":
                options.stdlog.write(l)
            elif self.nfields is not None and nfields != self.nfields:
                # validate number of fields in row, raise warning
                # for those not matching and skip.
                E.warn(
                    "# line %s has unexpected number of fields: %i != %i" %
                    (l[:-1], nfields, self.nfields))
            else:
                if self.mFieldIndex is not None:
                    data = l[:-1].split("\t")
                    try:
                        data[self.mFieldIndex] = self.mMapper(
                            fi, data[self.mFieldIndex])
                    except IndexError:
                        raise IndexError(
                            "can not find field %i in %s" %
                            (self.mFieldIndex, l))
                    l = "\t".join(data) + "\n"

                outfile.write(l)
        infile.close()

def filterByCoverage(infiles, outfile):

    fcoverage = PARAMS["coverage_filter"]
    contig_file = infiles[0]
    dbh = sqlite3.connect(
        os.path.join(PARAMS["results_resultsdir"], PARAMS["database_name"]))
    cc = dbh.cursor()
    contigs = set()
    for infile in infiles[1:]:
        dirsplit = infile.split("/")
        infile = os.path.join(
            PARAMS["results_resultsdir"],
            dirsplit[-2].split(".dir")[0] + "-" + dirsplit[-1])
        tablename = P.toTable(os.path.basename(infile))
        if P.snip(contig_file, ".fa") == P.snip(os.path.basename(infile),
                                                ".coverage.load"):
            statement = """SELECT contig_id, ave FROM
                           (SELECT contig_id, AVG(coverage) as ave FROM %s
                            GROUP BY contig_id)
                           WHERE ave > %i""" % (tablename,
                                                PARAMS["coverage_filter"])
            for data in cc.execute(statement).fetchall():
                contigs.add(data[0])
    outf = open(outfile, "w")
    print(contigs)
    for fasta in FastaIterator.iterate(iotools.openFile(contig_file)):
        identifier = fasta.title.split(" ")[0]
        if identifier in contigs:
            outf.write(">%s\n%s\n" % (identifier, fasta.sequence))
    outf.close()

def getMappedReads(infile):
    '''return number of reads mapped.'''
    for lines in iotools.openFile(infile, "r"):
        data = lines[:-1].split("\t")
        if data[1].startswith("without duplicates"):
            return int(data[0])
    return

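# For context, a sketch of the duplication-stats format that getMappedReads()
# appears to expect: tab-separated, count in the first column, label in the
# second. The file written below is purely hypothetical.
def _example_getMappedReads(tmpdir="/tmp"):
    import os
    path = os.path.join(tmpdir, "example.duplication_stats")
    with open(path, "w") as outf:
        outf.write("12000000\twith duplicates\n")
        outf.write("9500000\twithout duplicates\n")
    return getMappedReads(path)   # -> 9500000
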
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    dir2files = {}
    for root, directory, files in os.walk("."):
        dir2files[root] = files

    ts = time.time()
    st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d_%H:%M:%S')
    filename = "CWD_%s" % st
    E.info("outputting directory state to %s" % filename)
    with iotools.openFile(filename, "w") as outf:
        outf.write("##contents of cwd on %s\n\n" % st)
        for directory, files in dir2files.items():
            for file in files:
                path = os.path.join(directory, file)
                outf.write(path + "\n")

    # write footer and output benchmark information.
    E.Stop()

def buildGeneOntology(infile, outfile):
    '''create an output file akin to GO ontology files to be
    used with GO.py
    '''

    table = P.toTable(infile)
    columns = ("cpg", "tata")
    dbh = connect()
    cc = dbh.cursor()

    outf = iotools.openFile(outfile, "w")
    outf.write("go_type\tgene_id\tgo_id\tdescription\tevidence\n")

    i = 1
    for c in columns:
        cc.execute("SELECT DISTINCT gene_id FROM %(table)s WHERE %(c)s" %
                   locals())
        outf.write("".join(["promotor\t%s\tGO:%07i\twith_%s\tNA\n" %
                            (x[0], i, c) for x in cc]))
        i += 1

        cc.execute("SELECT DISTINCT gene_id FROM %(table)s WHERE %(c)s = 0" %
                   locals())
        outf.write("".join(["promotor\t%s\tGO:%07i\twithout_%s\tNA\n" %
                            (x[0], i, c) for x in cc]))
        i += 1

    outf.close()

def exportPeaksAsBed(infile, outfile):
    '''export peaks as bed files.'''

    dbhandle = sqlite3.connect(PARAMS["database_name"])

    if infile.endswith("_macs.load"):
        track = infile[:-len("_macs.load")]
    else:
        track = infile[:-len("_intervals.load")]

    if track.startswith("control"):
        return

    peakwidth = PARAMS["peakwidth"]

    cc = dbhandle.cursor()
    statement = '''SELECT contig, peakcenter - %(peakwidth)i,
                   peakcenter + %(peakwidth)i, interval_id, peakval
                   FROM %(track)s_intervals ORDER by contig, start''' % locals()
    cc.execute(statement)

    outs = iotools.openFile(outfile, "w")

    for result in cc:
        contig, start, end, interval_id, peakval = result
        # peakval is truncated at 1000 as this is the maximum permitted
        # score in a bed file.
        peakval = int(min(peakval, 1000))
        outs.write("%s\t%i\t%i\t%s\t%i\n" %
                   (contig, start, end, str(interval_id), peakval))

    cc.close()
    outs.close()

def buildSimpleNormalizedBAM(infiles, outfile, nreads):
    '''normalize a bam file to given number of counts
    by random sampling
    '''
    infile, countfile = infiles

    pysam_in = pysam.Samfile(infile, "rb")

    fh = iotools.openFile(countfile, "r")
    readcount = int(fh.read())
    fh.close()

    threshold = float(nreads) / float(readcount)

    pysam_out = pysam.Samfile(outfile, "wb", template=pysam_in)

    # iterate over mapped reads thinning by the threshold
    ninput, noutput = 0, 0
    for read in pysam_in.fetch():
        ninput += 1
        if random.random() <= threshold:
            pysam_out.write(read)
            noutput += 1

    pysam_in.close()
    pysam_out.close()
    pysam.index(outfile)

    E.info("buildNormalizedBam: %i input, %i output (%5.2f%%), should be %i" %
           (ninput, noutput, 100.0 * noutput / ninput, nreads))

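# Worked example of the sampling threshold used above (numbers hypothetical):
# each mapped read is kept independently with probability nreads / readcount,
# so the output size is only approximately equal to the requested nreads.
def _example_normalisation_threshold():
    nreads, readcount = 1000000, 10000000
    threshold = float(nreads) / float(readcount)
    return threshold   # 0.1 -> keep roughly 10% of reads
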
def renameTranscriptsInPreviousSets(infile, outfile):
    '''
    transcripts need to be renamed because they may use the same
    cufflinks identifiers as we use in the analysis - don't do if
    they have an ensembl id - sort by transcript
    '''
    inf = iotools.openFile(infile)
    for gtf in GTF.iterator(inf):
        if gtf.gene_id.find("ENSG") != -1:
            statement = '''zcat %(infile)s | grep -v "#"
            | cgat gtf2gtf
            --method=sort --sort-order=gene
            --log=%(outfile)s.log
            | gzip > %(outfile)s'''
        else:
            gene_pattern = "GEN" + P.snip(outfile, ".gtf.gz")
            transcript_pattern = gene_pattern.replace("GEN", "TRAN")
            statement = '''
            zcat %(infile)s
            | cgat gtf2gtf
            --method=renumber-genes
            --pattern-identifier=%(gene_pattern)s%%i
            | cgat gtf2gtf
            --method=renumber-transcripts
            --pattern-identifier=%(transcript_pattern)s%%i
            | cgat gtf2gtf
            --method=sort --sort-order=gene
            --log=%(outfile)s.log
            | gzip > %(outfile)s'''

    P.run()

def buildTrueTaxonomicRelativeAbundances(infiles, outfile):
    '''
    get species level relative abundances for the simulated data.
    This involves creating maps between different identifiers from
    the NCBI taxonomy. This is so that the results are comparable
    to species level analysis from metaphlan
    '''
    levels = ["species", "genus", "family", "order", "class", "phylum"]

    taxa = open(infiles[1])
    header = taxa.readline()
    gi2taxa = collections.defaultdict(list)
    for line in taxa.readlines():
        data = line[:-1].split("\t")
        gi, strain, species, genus, family, order, _class, phylum = \
            data[0], data[1], data[2], data[3], data[4], data[5], \
            data[6], data[7]
        gi2taxa[gi] = (species, genus, family, order, _class, phylum)

    outf = open(outfile, "w")
    outf.write("level\ttaxa\trelab\n")
    for i in range(len(levels)):
        total = 0
        result = collections.defaultdict(int)
        for fastq in Fastq.iterate(iotools.openFile(infiles[0])):
            total += 1
            gi = fastq.identifier.split("|")[1]
            result[gi2taxa[gi][i]] += 1
        for taxa, value in result.items():
            outf.write("%s\t%s\t%s\n" %
                       (levels[i], taxa, float(value) / total))
    outf.close()

def __call__(self, filenames, outfile, options):

    for fi, fn in filenames:
        infile = iotools.openFile(fn, "r")
        outfile.write(
            "######### logging output for %s ###################\n" % fi)
        for l in infile:
            outfile.write(l)
        infile.close()

def summarizePeaksForPooledPseudoreplicates(infiles, outfile):
    outf = iotools.openFile(outfile, "w")
    outf.write("Sample_id\t"
               "Experiment\t"
               "Tissue\t"
               "Condition\t"
               "Pseudoreplicate\t"
               "n_peaks\n")
    IDR.countPeaks(infiles, outf)

def writeContigSizes(genome, outfile):
    '''write contig sizes to outfile for UCSC tools.'''

    outf = iotools.openFile(outfile, "w")
    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], genome))
    for contig, size in fasta.getContigSizes(with_synonyms=False).items():
        outf.write("%s\t%i\n" % (contig, size))
    outf.close()

def chunk_iterator_regex_group(infile, args, prefix, use_header=False):
    """group by regular expression match.

    Entries need to be consecutive.
    """

    rex = args[0]
    column = args[1]
    chunk_size = args[2]
    last = None
    header = None
    n = chunk_size
    outfile = None
    filename = None

    for line in infile:
        if line[0] == "#":
            continue

        if not header and use_header:
            header = line
            continue

        try:
            this = rex.search(line[:-1]).groups()[0]
        except IndexError:
            if outfile:
                outfile.write(line)
            continue
        except AttributeError:
            if outfile:
                outfile.write(line)
            continue

        if last != this and n >= chunk_size:
            if last:
                outfile.close()
                yield filename

            last = this
            filename = "%s/%s.in" % (prefix, this)
            outfile = iotools.openFile(filename, "w")
            if header:
                outfile.write(header)
            n = 0

        outfile.write(line)
        n += 1

    if outfile:
        outfile.close()
        yield filename

def makeIntervalCorrelation(infiles, outfile, field, reference):
    '''compute correlation of interval properties between sets'''

    dbhandle = sqlite3.connect(PARAMS["database_name"])

    tracks, idx = [], []
    for infile in infiles:
        track = P.snip(infile, ".bed.gz")
        tablename = "%s_intervals" % P.tablequote(track)
        cc = dbhandle.cursor()
        statement = "SELECT contig, start, end, %(field)s FROM %(tablename)s" % locals()
        cc.execute(statement)
        ix = IndexedGenome.IndexedGenome()
        for contig, start, end, peakval in cc:
            ix.add(contig, start, end, peakval)
        idx.append(ix)
        tracks.append(track)

    outs = iotools.openFile(outfile, "w")
    outs.write("contig\tstart\tend\tid\t" + "\t".join(tracks) + "\n")

    for bed in Bed.iterator(infile=iotools.openFile(reference, "r")):

        row = []

        for ix in idx:
            try:
                intervals = list(ix.get(bed.contig, bed.start, bed.end))
            except KeyError:
                row.append("")
                continue

            if len(intervals) == 0:
                peakval = ""
            else:
                peakval = str((max([x[2] for x in intervals])))
            row.append(peakval)

        outs.write(str(bed) + "\t" + "\t".join(row) + "\n")

    outs.close()

def readHierarchy(mapfile):
    '''read hierarchy into dictionary'''
    hierarchy = collections.defaultdict(list)
    inf = iotools.openFile(mapfile)
    inf.readline()
    for line in inf.readlines():
        data = line.strip("\n").split("\t")
        kingdom = data[0]
        hierarchy[kingdom].extend(data[1:])
    return hierarchy

def buildMRBed(infile, outfile):
    '''output bed6 file with methylated regions.

    All regions are output, even the insignificant ones.

    The score is the log fold change.
    '''

    outf = iotools.openFile(outfile, "w")
    c = E.Counter()
    for row in csv.DictReader(iotools.openFile(infile),
                              dialect="excel-tab"):
        c.input += 1

        contig, start, end = re.match(
            r"(.*):(\d+)-(\d+)", row["interval_id"]).groups()
        c.output += 1
        outf.write("\t".join((contig, start, end, str(c.input),
                              row["lfold"])) + "\n")

    outf.close()
    E.info("%s" % str(c))

def findNPeaks(infiles, outfile, params):
    outf = iotools.openFile(outfile, "w")
    outf.write("Tissue\t"
               "Condition\t"
               "Experiment\t"
               "idr_comp\t"
               "sample_1\t"
               "sample_2\t"
               "n_peaks\n")
    idr_threshold = float(params[0])

    # Hack: for only one infile, P.submit returns a string rather than a list
    if type(infiles) is str:
        infiles = [infiles, ]

    for inf in infiles:
        inf_name = P.snip(os.path.basename(inf), "-overlapped-peaks.txt")
        tissue = inf_name.split("-")[0]
        condition = inf_name.split("-")[1]
        experiment = "_".join([tissue, condition])
        sample1, sample2 = inf_name.split("_vs_")
        n_peaks = 0
        header = True
        for line in iotools.openFile(inf):
            if header:
                header = False
                continue
            line = line.split()
            if float(line[10]) <= idr_threshold:
                n_peaks += 1
            else:
                continue
        outf.write(tissue + "\t" + condition + "\t" + experiment + "\t" +
                   inf_name + "\t" + sample1 + "\t" + sample2 + "\t" +
                   str(n_peaks) + "\n")
    outf.close()

def countPeaks(infiles, outf):
    """Count the number of peaks in each narrowPeak file"""

    for infile in infiles:
        sample_id = os.path.basename(infile).split("_VS_")[0]
        tissue, condition, replicate = sample_id.split("-")
        experiment = tissue + "_" + condition
        n_peaks = str(len(iotools.openFile(infile).readlines()))
        outf.write("\t".join([sample_id, experiment, tissue, condition,
                              replicate, n_peaks]) + "\n")

    outf.close()

def __call__(self, filenames, outfile, options): for fi, fn in filenames: infile = iotools.openFile(fn, "r") for l in infile: if l[0] == "#": options.stdlog.write(l) continue elif l[0] == ">": x = re.search(">(\S+)", l[:-1]) id = self.mMapper(fi, x.groups()[0]) l = ">%s%s" % (id, l[x.end(0):]) outfile.write(l) infile.close()
def aggregateTiledReadCounts(infiles, outfile):
    '''aggregate tag counts for each window.

    coverageBed outputs the following columns:
    1) Contig
    2) Start
    3) Stop
    4) Name
    5) The number of features in A that overlapped (by at least one
       base pair) the B interval.
    6) The number of bases in B that had non-zero coverage from features in A.
    7) The length of the entry in B.
    8) The fraction of bases in B that had non-zero coverage from
       features in A.

    For bed: use column 5
    For bed6: use column 7
    For bed12: use column 13

    This method uses the maximum number of reads found in any interval
    as the tag count.

    Tiles with no counts will not be output.
    '''

    to_cluster = True

    src = " ".join(
        ['''<( zcat %s | awk '{printf("%%s:%%i-%%i\\t%%i\\n", $1,$2,$3,$4 );}' ) ''' %
         x for x in infiles])
    tmpfile = P.getTempFilename(".")
    statement = '''paste %(src)s > %(tmpfile)s'''
    P.run()

    tracks = [re.sub(r"\..*", '', os.path.basename(x)) for x in infiles]

    outf = iotools.openFile(outfile, "w")
    outf.write("interval_id\t%s\n" % "\t".join(tracks))

    for line in open(tmpfile, "r"):
        data = line[:-1].split("\t")
        genes = list(set([data[x] for x in range(0, len(data), 2)]))
        values = [int(data[x]) for x in range(1, len(data), 2)]
        if sum(values) == 0:
            continue
        assert len(genes) == 1, \
            "paste command failed, wrong number of genes per line: '%s'" % line
        outf.write("%s\t%s\n" % (genes[0], "\t".join(map(str, values))))
    outf.close()

    os.unlink(tmpfile)

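# Sketch of the intermediate `paste` output parsed above (values
# hypothetical). Each input file contributes an "interval_id<TAB>count" pair
# of columns, so even-indexed fields hold the interval ids (which must all
# agree) and odd-indexed fields hold the per-track counts.
def _example_parse_pasted_counts():
    line = "chr1:0-1000\t12\tchr1:0-1000\t7\tchr1:0-1000\t0"
    data = line.split("\t")
    genes = list(set(data[0::2]))            # -> ["chr1:0-1000"]
    values = [int(x) for x in data[1::2]]    # -> [12, 7, 0]
    return genes, values
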
def summarizeMACSFDR(infiles, outfile):
    '''compile table with peaks that would remain after filtering by fdr.'''

    fdr_thresholds = numpy.arange(0, 1.05, 0.05)

    outf = iotools.openFile(outfile, "w")
    outf.write("track\t%s\n" % "\t".join(map(str, fdr_thresholds)))

    for infile in infiles:
        called = []
        track = P.snip(os.path.basename(infile), ".macs")
        infilename = infile + "_peaks.xls.gz"
        inf = iotools.openFile(infilename)
        peaks = list(WrapperMACS.iteratePeaks(inf))

        for threshold in fdr_thresholds:
            called.append(len([x for x in peaks if x.fdr <= threshold]))

        outf.write("%s\t%s\n" % (track, "\t".join(map(str, called))))

    outf.close()

def normaliseKraken(infile, outfile):
    '''normalise kraken counts by nreads/million mapped'''

    inf = iotools.openFile(infile)
    header = inf.readline().replace("rel_abundance", "rpm")
    mapped = 0

    # will have to iterate over the file twice
    for line in inf.readlines():
        data = line[:-1].split("\t")
        count = int(data[-1])
        mapped += count
    inf.close()

    inf = iotools.openFile(infile)
    inf.readline()
    outf = iotools.openFile(outfile, "w")
    outf.write(header)
    for line in inf.readlines():
        data = line[:-1].split("\t")
        count = int(data[-1]) / (float(mapped) / 1000000)
        outf.write("\t".join(map(str, data[:-1] + [count])) + "\n")
    outf.close()

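# Worked example of the reads-per-million normalisation above (numbers
# hypothetical): with 25,000,000 assigned reads in total, a taxon with a raw
# count of 5,000 becomes 5000 / (25000000 / 1e6) = 200.0 rpm.
def _example_rpm():
    count, mapped = 5000, 25000000
    return count / (float(mapped) / 1000000)   # 200.0
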
def buildQuicksectMask(bed_file):
    '''return Quicksect object containing the regions specified

    takes a bed file listing the regions to mask
    '''
    mask = IndexedGenome.Quicksect()

    n_regions = 0
    for bed in Bed.iterator(iotools.openFile(bed_file)):
        # it is necessary to extend the region to make an accurate mask
        mask.add(bed.contig, (bed.start - 1), (bed.end + 1), 1)
        n_regions += 1

    E.info("Built Quicksect mask for %i regions" % n_regions)

    return(mask)

def buildExpectedCoverageOverGenomes(infiles, outfile):
    '''
    take sequence files and estimate the theoretical coverage we
    would get over genomes in the sample i.e. at 1X coverage
    '''

    # if paired end then will have to multiply by two
    multiply = False
    if infiles[0].endswith(".fastq.1.gz"):
        multiply = True

    # the theoretical coverage is defined as
    # (read length (L) * no. reads (N)) / genome size (G) (bp)

    # get genome sizes into memory
    genomes = open(infiles[1])
    header = genomes.readline()
    genome_sizes = {}
    for line in genomes.readlines():
        data = line[:-1].split("\t")
        gi = data[0].split("_")[1]
        size = data[1]
        genome_sizes[gi] = size

    # get the expected genome size
    expected_genome_sizes = collections.defaultdict(int)
    E.info("iterating over fastq file")
    for fastq in Fastq.iterate(iotools.openFile(infiles[0])):
        gi = fastq.identifier.split("|")[1]
        expected_genome_sizes[gi] += 1
    E.info("iterating over fastq file: DONE")

    # get the proportion of each genome covered
    outf = open(outfile, "w")
    outf.write("gi\texpected_coverage\n")
    for gi, size in expected_genome_sizes.items():
        if multiply:
            size = size * 2
        if gi not in genome_sizes:
            E.warn("could not find gi no. %s in dictionary" % gi)
            continue
        proportion_coverage = float(size) / float(genome_sizes[gi])
        if proportion_coverage > 1:
            proportion_coverage = 1
        outf.write("%s\t%f\n" % (gi, proportion_coverage))
    outf.close()

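# Worked example of the theoretical-coverage formula quoted in the comment
# above, C = (L * N) / G, with hypothetical numbers: 1,000,000 reads of
# 100 bp over a 5 Mb genome give C = 20, i.e. roughly 20X. Note that the
# function itself reports the ratio of per-genome read count to genome size,
# capped at 1, rather than this formula directly.
def _example_theoretical_coverage():
    L, N, G = 100, 1000000, 5000000
    return (L * N) / float(G)   # 20.0
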
def makeExpressionSummaryPlots(counts_inf, design_inf, logfile):
    '''use the plotting methods for Counts object to make summary plots'''

    with iotools.openFile(logfile, "w") as log:

        plot_prefix = P.snip(logfile, ".log")

        # need to manually read in data as index column is not the first
        # column
        counts = Counts.Counts(pd.read_table(counts_inf, sep="\t"))
        counts.table.set_index(["transcript_id"])

        design = Expression.ExperimentalDesign(design_inf)

        # make certain counts table only include samples in design
        counts.restrict(design)

        cor_outfile = plot_prefix + "_pairwise_correlations.png"
        pca_var_outfile = plot_prefix + "_pca_variance.png"
        pca1_outfile = plot_prefix + "_pc1_pc2.png"
        pca2_outfile = plot_prefix + "_pc3_pc4.png"
        heatmap_outfile = plot_prefix + "_heatmap.png"

        counts_log10 = counts.log(base=10, pseudocount=0.1, inplace=False)

        counts_highExp = counts_log10.clone()
        counts_highExp.table['order'] = counts_highExp.table.apply(
            np.mean, axis=1)
        counts_highExp.table.sort(["order"], ascending=0, inplace=True)
        counts_highExp.table = counts_highExp.table.iloc[0:500, :]
        counts_highExp.table.drop("order", axis=1, inplace=True)

        log.write("plot correlations: %s\n" % cor_outfile)
        counts_log10.plotPairwiseCorrelations(cor_outfile, subset=1000)

        log.write("plot pc1,pc2: %s\n" % pca1_outfile)
        counts_log10.plotPCA(design,
                             pca_var_outfile, pca1_outfile,
                             x_axis="PC1", y_axis="PC2",
                             colour="group", shape="group")

        log.write("plot pc3,pc4: %s\n" % pca2_outfile)
        counts_log10.plotPCA(design,
                             pca_var_outfile, pca2_outfile,
                             x_axis="PC3", y_axis="PC4",
                             colour="group", shape="group")

        log.write("plot heatmap: %s\n" % heatmap_outfile)
        counts_highExp.heatmap(heatmap_outfile)
