def __init__(self, infile, *args, **kwargs):
    self.gtf = GTF.iterator(IOTools.open_file(infile, "r"))
def runMAST(infiles, outfile):
    '''run mast on all intervals and motifs.

    Collect all results for an E-value up to 10000 so that all
    sequences are output and MAST curves can be computed.

    10000 is a heuristic.
    '''
    # job_options = "-l mem_free=8000M"

    controlfile, dbfile, motiffiles = infiles

    if IOTools.is_empty(dbfile) or len(motiffiles) == 0:
        IOTools.touch_file(outfile)
        return

    if not os.path.exists(controlfile):
        raise ValueError("control file %s for %s does not exist" %
                         (controlfile, dbfile))

    # remove previous results
    if os.path.exists(outfile):
        os.remove(outfile)

    tmpdir = P.get_temp_dir(".")
    tmpfile = P.get_temp_filename(".")

    for motiffile in motiffiles:
        if IOTools.is_empty(motiffile):
            L.info("skipping empty motif file %s" % motiffile)
            continue

        of = IOTools.open_file(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - foreground ::\n" % motif)
        of.close()

        # mast bails if the number of nucleotides gets larger than
        # 2186800982?
        # To avoid this, run db and control file separately.
        statement = '''
        cat %(dbfile)s
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s -ev %(mast_evalue)f %(mast_options)s >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run(statement)

        of = IOTools.open_file(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - background ::\n" % motif)
        of.close()

        statement = '''
        cat %(controlfile)s
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s -ev %(mast_evalue)f %(mast_options)s >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run(statement)

    P.run("gzip < %(tmpfile)s > %(outfile)s")

    shutil.rmtree(tmpdir)
    os.unlink(tmpfile)
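# Usage sketch (assumption): the tuple layout mirrors what runMAST() unpacks
# above; the file names are placeholders and the MAST parameters referenced in
# the P.run() statements (mast_evalue, mast_options) must be available in the
# pipeline configuration.
def exampleRunMAST():
    infiles = ("control.fasta",                   # control sequences
               "intervals.fasta",                 # foreground/database sequences
               ["motif1.meme", "motif2.meme"])    # one or more MEME motif files
    runMAST(infiles, "intervals.mast.gz")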
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-b", "--bam-file", dest="bam_file", type="string", help="supply input bam file name") parser.add_option("-g", "--gtf-file", dest="gtf_file", type="string", help="supply input gtf file name") parser.add_option("-o", "--outfile", dest="outfile", type="string", help="supply output file name") parser.add_option( "-G", "--reference-gtf-file", dest="reference_gtf", type="string", help= "supply reference gtf for context of reads not contributing to transcripts" ) # add common options (-h/--help, ...) and parse command line (options, args) = E.start(parser, argv=argv) ###################################################### ###################################################### # for all alignments ###################################################### ###################################################### # open outfile and prepare headers outf = IOTools.open_file(options.outfile, "w") outf.write("\t".join([ "total alignments", "aligments in transcripts", "percent alignments in transcripts", "total spliced alignments", "spliced alignments in transcripts", "percent spliced alignments in transcripts" ]) + "\n") # calculate coverage over transcript file - NB split reads contribute twice to the transcript # use BedTool object pybedbamfile = pybedtools.BedTool(options.bam_file) # count alignments E.info("counting total number of alignments and spliced alignments") total_alignments = 0 spliced_alignments = 0 for alignment in pybedbamfile: cigar = alignment[5] if cigar.find("N") != -1: # N signifies split read total_alignments += 1 spliced_alignments += 1 else: total_alignments += 1 # merge the gtf file to avoid double counting of exons in different # transcripts - converts to a bed file gtffile = pybedtools.BedTool(options.gtf_file).merge() E.info("computing coverage of aligments in %s over intervals in %s" % (options.bam_file, options.gtf_file)) cover = pybedbamfile.coverage(gtffile) # make sure that the exons aren't being counted twice - shouldn't be # because of merge E.info("counting reads contributing to transcripts") c = 0 for entry in cover: coverage = int(entry[3]) if coverage > 0: c += coverage # sum the coverage across exons from all transcripts coverage_in_transcripts = c ###################################################### ###################################################### # for spliced alignments ###################################################### ###################################################### # count total number of spliced alignments # requires that the CIGAR string 'N' is present # uses pysam to write out a bam file of the spliced reads only allreads = pysam.AlignmentFile(options.bam_file) spliced_bamname = IOTools.snip(options.bam_file, ".bam") + "_spliced_reads.bam" # open file for outputting spliced alignments splicedreads = pysam.AlignmentFile(spliced_bamname, "wb", template=allreads) # cigar string in pysam for spliced alignment is (3, int) spliced = collections.defaultdict(list) for read in allreads: for cigar_tag in read.cigar: if cigar_tag[0] == 3: spliced[read].append(cigar_tag) # write out spliced alignments for read in list(spliced.keys()): splicedreads.write(read) splicedreads.close() allreads.close() # index splice reads bam file pysam.sort(spliced_bamname, P.snip(spliced_bamname, ".bam")) 
pysam.index(spliced_bamname) # read in the spliced reads as a BedTool object splicedbam = pybedtools.BedTool(spliced_bamname) # perform coverage of spliced reads over intervals - will be twice # as many as there should be due to counting both exons # overlapping spliced_coverage = splicedbam.coverage(gtffile) # avoid double counting exons E.info("counting spliced reads contributing to transcripts") spliced_exons = {} c = 0 for entry in spliced_coverage: coverage = int(entry[3]) if coverage > 0: c += coverage spliced_coverage_in_transcripts = c # NOTE: the counting of spliced alignments is not accurate spliced_coverage_in_transcripts = float( spliced_coverage_in_transcripts) / 2 ########################### # write out the results ########################### outf.write(str(int(total_alignments)) + "\t") # remove half of the coverage assigned to spliced reads coverage_in_transcripts = (coverage_in_transcripts) - ( spliced_coverage_in_transcripts) outf.write( str( int(coverage_in_transcripts) - int(spliced_coverage_in_transcripts)) + "\t") outf.write( str(int((coverage_in_transcripts / total_alignments) * 100)) + "\t") # write out spliced counts outf.write(str(int(spliced_alignments)) + "\t") outf.write(str(int(spliced_coverage_in_transcripts)) + "\t") outf.write( str(int((spliced_coverage_in_transcripts / spliced_alignments) * 100))) outf.close() ############################ # contextualise those that # don't fall in transcripts ############################ if options.reference_gtf: context_summary = IOTools.open_file( IOTools.snip(options.bam_file, ".bam") + ".excluded.context", "w") context_summary.write("\t".join(["Feature", "number"]) + "\n") # write out the read info as well context_file = IOTools.open_file( IOTools.snip(options.bam_file, ".bam") + ".excluded", "w") context_dict = collections.defaultdict(int) # intersect bam - write non-overlapping with transcripts - intersect # with reference - write out context = pybedbamfile.intersect(gtffile, v=True, bed=True).intersect( pybedtools.BedTool(options.reference_gtf), wb=True) for entry in context: feature = entry[8] context_dict[feature] += 1 context_file.write("\t".join([e for e in entry]) + "\n") for feature, value in context_dict.items(): context_summary.write("\t".join([feature, str(value)]) + "\n") context_file.close() context_summary.close() # write footer and output benchmark information. E.stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser( version="%prog version: $Id: data2multiple_anova.py 2782 2009-09-10 11:40:29Z andreas $") parser.add_option("-c", "--columns", dest="columns", type="string", help="columns to take for calculating histograms.") parser.add_option("-t", "--tree-nh-file", dest="filename_tree", type="string", help="filename with tree(s).") parser.add_option("--skip-header", dest="add_header", action="store_false", help="do not add header to flat format.") parser.add_option("--output-with-header", dest="write_header", action="store_true", help="write header and exit.") parser.add_option("--debug", dest="debug", action="store_true", help="debug mode") parser.add_option("--display-tree", dest="display_tree", action="store_true", help="display the tree") parser.add_option("-m", "--method", dest="methods", type="choice", action="append", choices=("contrasts", "spearman", "pearson", "compute"), help="methods to perform on contrasts.") parser.set_defaults( columns="all", filename_tree=None, add_header=True, write_header=False, debug=False, methods=[], value_format="%6.4f", pvalue_format="%e", display_tree=False, ) (options, args) = E.start(parser, quiet=True) if options.columns not in ("all", "all-but-first"): options.columns = [int(x) - 1 for x in options.columns.split(",")] data = [] options.filenames = args for filename in options.filenames: infile = IOTools.open_file(filename, "r") table, headers = IOTools.readTable( infile, take=options.columns, headers=False) infile.close() data.append(table) fields = ["Df", "Sum Sq", "F value", "Pr(>F)", "Mean Sq"] options.stdout.write("set1\tset2") for field in fields: options.stdout.write("\t%s" % field) options.stdout.write("\n") # CODE needs to be refactored for rpy2 usage for x in range(len(data)): for y in range(x + 1, len(data)): rpy.set_default_mode(rpy.NO_CONVERSION) factors = ["x"] * len(data[x][:, 0]) + ["y"] * len(data[y][:, 0]) values = list(data[x][:, 0]) + list(data[y][:, 0]) linear_model = R.lm( R("y ~ x"), data=R.data_frame(x=factors, y=values)) rpy.set_default_mode(rpy.BASIC_CONVERSION) result = R.anova(linear_model) options.stdout.write( "%s\t%s" % (options.filenames[x], options.filenames[y])) for field in fields: options.stdout.write("\t%s" % str(result[field])) options.stdout.write("\n")
def loadGLAM2SCAN(infile, outfile):
    '''parse glam2scan results and load into database.

    Parse several motif runs and add them to the same table.
    '''
    # open in text mode so that header/result strings can be written
    tmpfile = tempfile.NamedTemporaryFile(mode="w", delete=False)
    tmpfile.write(
        "motif\tid\tnmatches\tscore\tscores\tncontrols\tmax_controls\n")

    lines = IOTools.open_file(infile).readlines()
    chunks = [x for x in range(len(lines)) if lines[x].startswith("::")]
    chunks.append(len(lines))

    for chunk in range(len(chunks) - 1):

        # use a real file, as the parser can not deal with a
        # list of lines
        try:
            motif = re.match(":: motif = (\S+) ::",
                             lines[chunks[chunk]]).groups()[0]
        except AttributeError:
            raise ValueError("parsing error in line '%s'" %
                             lines[chunks[chunk]])

        if chunks[chunk] + 1 == chunks[chunk + 1]:
            L.warn("no results for motif %s - ignored" % motif)
            continue

        tmpfile2 = tempfile.NamedTemporaryFile(mode="w", delete=False)
        tmpfile2.write("".join(lines[chunks[chunk] + 1:chunks[chunk + 1]]))
        tmpfile2.close()

        glam = Glam2Scan.parse(IOTools.open_file(tmpfile2.name, "r"))
        os.unlink(tmpfile2.name)

        # collect control data
        full_matches = collections.defaultdict(list)
        controls = collections.defaultdict(list)
        for match in glam.matches:
            m = match.id.split("_")
            track, id = m[:2]
            if len(m) == 2:
                full_matches[id].append(match)
            else:
                controls[id].append(match.score)

        for id, matches in full_matches.items():

            nmatches = len(matches)
            scores = [x.score for x in matches]
            score = max(scores)
            # move to genomic coordinates
            # contig, start, end = re.match(
            #     "(\S+):(\d+)..(\d+)", match.id).groups()
            # start, end = int(start), int(end)
            # match.start += start
            # match.end += start
            contig = ""

            if id not in controls:
                L.warn("no controls for %s - increase evalue?" % id)

            c = controls[id]
            if len(c) == 0:
                mmax = ""
            else:
                mmax = max(c)

            tmpfile.write("\t".join(
                map(str,
                    (motif, id, nmatches, score,
                     ",".join(map(str, scores)), len(c), mmax))) + "\n")

    tmpfile.close()

    P.load(tmpfile.name,
           outfile,
           options="--add-index=id "
           "--add-index=motif "
           "--add-index=id,motif "
           "--allow-empty-file "
           "--map=base_qualities:text")

    os.unlink(tmpfile.name)
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-f", "--input-filter-tsv", dest="input_filter_tsv",
                      type="string",
                      help="list with identifiers to remove. "
                      "[%default]")

    parser.add_option("--set-prefix", dest="set_prefix", type="string",
                      help="set sequence prefix [%default]")

    parser.add_option("--min-length", dest="min_length", type="int",
                      help="minimum alignment length [%default]")

    parser.add_option("--method", dest="methods", action="append",
                      choices=("shift-region", ),
                      help="methods to apply [%default]")

    parser.set_defaults(
        input_maf_file=None,
        input_filter_tsv=None,
        set_prefix=None,
        min_length=0,
        methods=[],
    )

    (options, args) = E.start(parser, argv)

    if options.input_filter_tsv:
        with IOTools.open_file(options.input_filter_tsv) as inf:
            skip_id = set([x[:-1] for x in inf])
    else:
        skip_id = False

    counter = E.Counter()

    if options.set_prefix:
        prefix = "s {}".format(options.set_prefix)
    else:
        prefix = None

    for block in iterate_maf_blocks(options.stdin):
        counter.blocks_input += 1

        if skip_id:
            if block[2].startswith("s "):
                id = re.match("s (\S+)", block[2]).groups()[0]
                if id in skip_id:
                    counter.blocks_skipped_id += 1
                    continue

        if options.min_length:
            if block[2].startswith("s "):
                id, pos, length = re.match(
                    "s (\S+)\s+(\d+)\s+(\d+)", block[2]).groups()
                if int(length) <= options.min_length:
                    counter.blocks_skipped_length += 1
                    continue

        if prefix:
            block[2] = prefix + block[2][4:]

        if block[2].startswith("s "):
            header, ali1, ali2, qual = parse_block(block)
            if "shift-region" in options.methods:
                rows = []
                contig, start, end = parse_region_string(ali1.src)
                ali1 = ali1._replace(src=contig, start=start + ali1.start)
                rows.append(list(map(str, ali1)))
                rows.append(list(map(str, ali2)))
                if qual:
                    rows.append(list(map(str, qual)))

                lines = [header]
                lines.append(format_tabular(rows, "llrrrrl"))
                lines.append("\n")
                block = lines

        counter.blocks_output += 1
        options.stdout.write("".join(block))

    E.info(counter)
    E.stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = optparse.OptionParser(version="%prog version: $Id", usage=globals()["__doc__"]) parser.add_option("--bin", dest="bin", action="store_true", help="output average in bins across the interval") parser.add_option("-n", "--num-bins", dest="bin_number", type=int, help="number of bins for coverage profile") parser.add_option("-o", "--output-filename-prefix", dest="output_filename_prefix", help="pattern to write coverage bins to") parser.set_defaults(bin=False, bin_number=10) # add common options (-h/--help, ...) and parse command line (options, args) = E.start(parser, argv=argv) inf = options.stdin coverage_result = collections.defaultdict(list) E.info("reading in coverage data") for line in inf.readlines(): data = line[:-1].split("\t") contig, coverage = data[0], data[2] coverage_result[contig].append(coverage) E.info("read %i contigs" % len(list(coverage_result.keys()))) options.stdout.write("contig\tcov_mean\tcov_sd\n") if options.bin: outf = IOTools.open_file(options.output_filename_prefix + ".binned", "w") outf.write( "%s" % "\t".join([str(i) for i in range(1, options.bin_number + 1, 1)]) + "\n") for contig, coverage in coverage_result.items(): coverage = list(map(float, coverage)) options.stdout.write( "%s\t%s\t%s\n" % (contig, str(np.mean(coverage)), str(np.std(coverage)))) if options.bin: bin_means = [] bins = np.linspace(0, len(coverage), options.bin_number + 1) if len(coverage) < len(bins) - 1: E.warn("will not calculate coverage means for %s: too short" % contig) continue for i in range(len(bins)): try: bin_mean = np.mean(coverage[int(bins[i]):int(bins[i + 1])]) except IndexError: continue bin_means.append(bin_mean) outf.write(contig + "\t" + "\t".join(map(str, bin_means)) + "\n") outf.close() # write footer and output benchmark information. E.stop()
def main(argv=None): parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option( "--input-filename-fasta", dest="input_filename_fasta", type="string", help="filename with reference sequence in fasta format [%default]") parser.add_option("--input-filename-bam", dest="input_filename_bam", type="string", help="filename with aligned reads [%default]") parser.add_option("--method", dest="methods", type="choice", action="append", choices=["add-strelka-genotype", "lift-over"], help="methods to apply [%default]") parser.add_option( "--input-filename-chain", dest="input_filename_chain", type="string", help="filename with alignment chain for lift-over [%default]") parser.add_option( "--normal-sample-regex", dest="normal_sample_regex", type="string", help="regular expression to apply to header to identify normal " "sample id [%default]") parser.add_option( "--output-filename-unmapped", dest="output_filename_unmapped", type="string", help="filename with variants that could not be lifted over [%default]") parser.set_defaults( input_filename_fasta=None, input_filename_bam=None, input_filename_vcf="-", sample_size=0.001, region_size=20, methods=[], normal_sample_regex=None, input_filename_chain=None, output_filename_unmapped=None, ) (options, args) = E.start(parser, argv=argv, add_output_options=True) if len(args) > 0: options.input_filename_vcf = args[0] vcf_in = pysam.VariantFile(options.input_filename_vcf) if "lift-over" in options.methods: if options.input_filename_chain is None: raise ValueError( "--method=lift-over requires --input-filename-chain") if not os.path.exists(options.input_filename_chain): raise OSError("file {} with chain data does not exist".format( options.input_filename_chain)) E.info("reading chain from {}".format(options.input_filename_chain)) with IOTools.open_file(options.input_filename_chain) as inf: map_chain, map_contig2length = read_liftover_chain(inf) if options.input_filename_fasta: fasta = pysam.FastaFile(options.input_filename_fasta) else: fasta = None if options.input_filename_bam: bam = pysam.AlignmentFile(options.input_filename_bam) else: bam = None outf = options.stdout c = E.Counter() if "add-strelka-genotype" in options.methods: map_nt2gt = {"ref": "0/0", "het": "0/1", "hom": "1/1", "conflict": "."} map_tumour2gt = {"ref": "0/0", "het": "0/1", "hom": "1/1"} header = str(vcf_in.header).splitlines() header.insert( len(header) - 1, '##FORMAT=<ID=GT,Number=1,Type=String,Description=' '"Genotypes of reference and alternative alleles, ' 'added by CGATCore vcf2vcf.">') header = "\n".join(header) if options.normal_sample_regex: normal_sample = re.search(" -bam-file \S+/([^/]+)_S\d+.bam", header).groups()[0] else: normal_sample = "NORMAL" is_first = True for record in vcf_in: c.input += 1 if "GT" in record.format: if is_first: outf.write(header + "\n") is_first = False outf.write(str(record)) c.has_gt += 1 continue gt_normal = map_nt2gt[record.info["NT"]] gt_tumour = record.info["SGT"] norm, tumour = gt_tumour.split("->") if gt_tumour[0] in "ACGT": alts = record.alts if alts is None: c.no_alt += 1 continue if len(record.alts) > 1: c.multi_allelic += 1 continue _map_tumour2gt = {record.alts[0]: "1", record.ref: "0"} try: gt_tumour = "/".join( sorted([_map_tumour2gt[x] for x in tumour])) except KeyError: gt_tumour = "." c.ambigous_genotype += 1 else: gt_tumour = map_tumour2gt[tumour] fields = str(record)[:-1].split("\t") # FORMAT fields[8] = ":".join(("GT", fields[8])) # SAMPLES # makes a few assumptions, fix! 
header_insert_normal = False if len(fields) == 11: fields[9] = ":".join((gt_normal, fields[9])) fields[10] = ":".join((gt_tumour, fields[10])) elif len(fields) == 10: header_insert_normal = True values = fields[9].split(":") fields.append(":".join((gt_tumour, fields[9]))) fields[9] = ":".join([gt_normal] + ["."] * len(values)) else: raise NotImplementedError() if is_first: if not header_insert_normal: outf.write(header + "\n") else: header = re.sub(r"\tFORMAT\t", "\tFORMAT\t%s\t" % normal_sample, header) outf.write(header + "\n") is_first = False outf.write("\t".join(fields) + "\n") c.output += 1 elif "lift-over" in options.methods: header = str(vcf_in.header).splitlines() if fasta: # validate contig size expected_lengths = dict(list(zip(fasta.references, fasta.lengths))) else: expected_lengths = map_contig2length # update contig names and sizes in VCF header header = [x for x in header if not x.startswith("##contig")] header[-1:-1] = [ "##contig=<ID={},length={}>".format(contig, length) for contig, length in sorted(expected_lengths.items()) ] header.insert( len(header) - 1, '##liftover=<CHAIN={},REFERENCE={}>'.format( options.input_filename_chain, options.input_filename_fasta)) outf.write("\n".join(header) + "\n") unmapped_contigs = set() unknown_contigs = set() trans_genotypes = str.maketrans("01", "10") if fasta: # validate contig size expected_lengths = dict(list(zip(fasta.references, fasta.lengths))) for contig, length in list(map_contig2length.items()): if contig in expected_lengths: if length != expected_lengths[contig]: raise ValueError( "contig lengths mismatch. For contig {} chain files " "says {}, but fasta files says {}".format( contig, length, expected_lengths[contig])) E.info("contig sizes in chain file and fasta files correspond.") if options.output_filename_unmapped: outfile_unmapped = IOTools.open_file( options.output_filename_unmapped, "w") outfile_unmapped.write("\n".join(header) + "\n") else: outfile_unmapped = None for record in vcf_in: c.input += 1 try: mm = map_chain[record.contig] except KeyError: c.skipped_unmapped_contig += 1 unmapped_contigs.add(record.contig) if outfile_unmapped: outfile_unmapped.write( "skipped_unmapped_contig\t{}".format(str(record))) continue try: m = mm.search(record.start, record.stop) except AttributeError: c.skipped_mapping_error += 1 if outfile_unmapped: outfile_unmapped.write("skipped_mapping_error\t{}".format( str(record))) continue if len(m) == 0: c.skipped_unmapped_position += 1 if outfile_unmapped: outfile_unmapped.write( "skipped_unmapped_position\t{}".format(str(record))) continue elif len(m) > 1: c.skipped_multimapping_position += 1 if outfile_unmapped: outfile_unmapped.write( "skipped_multimapping_position\t{}".format( str(record))) continue m = m[0] y_contig, y_start, y_end, y_invert = m.data if y_invert: y_pos = y_end - (record.start - m.start) else: y_pos = (record.start - m.start) + y_start if fasta: try: ref_base = fasta.fetch(y_contig, y_pos, y_pos + len(record.ref)).upper() except KeyError: c.skipped_unknown_contig += 1 unknown_contigs.add(y_contig) ref_base = None continue swap_alleles = False if ref_base: error = False if ref_base == record.ref: c.matches += 1 else: if len(record.alts) == 1: alt_base = record.alts[0] if ref_base == alt_base: swap_alleles = True c.allele_swap_variant += 1 else: c.error_mismatch_variant += 1 error = "mismatch" else: error = "multi-mismatch" c.error_multi_mismatch_variant += 1 if error: if outfile_unmapped: outfile_unmapped.write("{}\t{}".format( error, str(record))) 
c.skipped_error_variant += 1 continue fields = str(record)[:-1].split("\t") fields[0] = y_contig fields[1] = str(y_pos) if swap_alleles: fields[4] = alt_base fields[5] = ref_base # update genotype fields keep = False for idx in range(9, len(fields)): gt, rest = fields[idx].split(":", 1) keep = keep or "0" in gt fields[idx] = ":".join( (gt.translate(trans_genotypes), rest)) # remove reference only calls if not keep: if outfile_unmapped: outfile_unmapped.write("reference_call\t{}".format( str(record))) c.skipped_allele_swap_reference += 1 continue c.output += 1 outf.write("\t".join(fields) + "\n") c.unmapped_contigs = len(unmapped_contigs) c.unknown_contigs = len(unknown_contigs) E.info(c.asTable()) if unknown_contigs: E.info("unknown contigs: {}".format(",".join( sorted(unknown_contigs)))) if unmapped_contigs: E.info("unmapped contigs: {}".format(",".join( sorted(unmapped_contigs)))) E.stop()
def main(argv=None): if argv is None: argv = sys.argv parser = E.OptionParser(version="%prog version", usage=globals()["__doc__"]) parser.add_option("--output-quality-format", dest="q_format", type="int", help="sequence quality format, e.g 33 = +33/Sanger" "[default=%default].") parser.add_option("--output-paired-end", dest="paired", action="store_true", help="generate paired end reads [default = %default].") parser.add_option("--insert-length-mean", dest="insert_mean", type="float", help="mean insert length [default = %default].") parser.add_option( "--insert-length-sd", dest="insert_sd", type="float", help="insert length standard deviation [default = %default].") parser.add_option( "--counts-method", dest="counts_method", type="choice", choices=("reads", "copies"), help="simulate a ground truth number of reads per entry or" "copies per entry [default = %default].") parser.add_option("--counts-min", dest="counts_min", type="float", help="minimum number of reads/read pairs per fasta entry" "or copies per entry [default = %default].") parser.add_option( "--counts-max", dest="counts_max", type="float", help="maximum number of reads/read pairs per fasta entry " "or copies per entry [default = %default].") parser.add_option("--output-read-length", dest="read_length", type="int", help="read length [default = %default].") parser.add_option("--sequence-error-phred", dest="phred", type="int", help="phred quality score [default = %default].") parser.add_option("--output-counts", dest="output_counts", type="string", help="name for counts outfile [default=%default].") parser.add_option( "--output-fastq2", dest="fastq2_out", type="string", help="filename for second fastq outfile [default=%default].") parser.add_option("--premrna-fraction", dest="premrna_fraction", type="float", help="the fraction of reads to simulate from pre-mRNA" "[default= % default].") parser.add_option("--infile-premrna-fasta", dest="premrna_fasta", type="string", help="filename for pre-mRNA fasta[default=%default].") parser.set_defaults(q_format=33, paired=False, insert_mean=0, insert_sd=1, counts_method="reads", counts_min=1, counts_max=1, read_length=50, fastq2_out=None, output_counts=None, phred=30, premrna_fraction=0, premrna_fasta=None) (options, args) = E.start(parser) if options.paired: assert options.fastq2_out, ("must specify a second fastq outfile for " "paired end (--output-fastq2)") outf2 = IOTools.open_file(options.fastq2_out, "w") if options.premrna_fraction: assert options.premrna_fasta, ("must specfify the location of the" "fasta file for the pre-mRNA") # the sequence quality string will always be the same so define here sequence_quality = chr(options.q_format + options.phred) qual = "".join([sequence_quality] * options.read_length) if options.premrna_fraction: iterator = FastaIterator.iterate_together( options.stdin, IOTools.open_file(options.premrna_fasta)) else: iterator = FastaIterator.FastaIterator(options.stdin) # set a cut off of twice the read/pair length for short entries if options.paired: minimum_entry_length = ( 2 * ((options.read_length * 2) + options.insert_mean)) else: minimum_entry_length = 2 * options.read_length c = collections.Counter() counts = collections.Counter() copies = collections.Counter() for f_entry in iterator: if options.premrna_fraction: assert getTitle(f_entry[0]) == getTitle( f_entry[1]), ("entry ids do not match: %s != %s" % (f_entry[0].title, f_entry[1].title)) entry = f_entry[0] pre_entry = f_entry[1] else: entry = f_entry # reject short fasta entries if len(entry.sequence) < 
minimum_entry_length: E.info("skipping short transcript: %s length=%i" % (entry.title, len(entry.sequence))) c['skipped'] += 1 continue else: c['not_skipped'] += 1 if options.paired: fragment_length = ((2 * options.read_length) + options.insert_mean) else: fragment_length = options.read_length reads_per_entry = float(len(entry.sequence)) / fragment_length if options.counts_method == "reads": n_reads = random.randint(options.counts_min, options.counts_max + 1) n_copies = float(n_reads) / reads_per_entry if options.premrna_fraction: n_reads_pre = int(round(n_reads * options.premrna_fraction)) elif options.counts_method == "copies": # random float [0-1] rand = np.random.random_sample() n_copies = (options.counts_min + (rand * (options.counts_max - options.counts_min))) n_reads = int(round(n_copies * reads_per_entry, 0)) # as n_reads must be rounded to int, need to redefine n_copies n_copies = float(n_reads) / reads_per_entry if options.premrna_fraction: reads_per_pre_entry = (float(len(pre_entry.sequence)) / fragment_length) n_copies_pre = n_copies * options.premrna_fraction n_reads_pre = int(round(n_copies_pre * reads_per_pre_entry, 0)) # as n_reads_pre must be rounded to int, need to # redefine n_copies_pre n_copies_pre = float(n_reads_pre) / reads_per_pre_entry entry_id = getTitle(entry) counts[entry_id] = n_reads copies[entry_id] = n_copies if "N" in entry.sequence.upper(): E.warn("fasta entry %s contains unknown bases ('N')" % entry_id) for i in range(0, n_reads): read = generateRead(entry=entry.sequence.upper(), read_length=options.read_length, error_rate=options.phred, paired=options.paired, insert_mean=options.insert_mean, insert_sd=options.insert_sd) if options.paired: r1, r2 = read h1 = "@%s_%i/1" % (entry_id, i) h2 = "@%s_%i/2" % (entry_id, i) options.stdout.write("\n".join((h1, r1, "+", qual)) + "\n") outf2.write("\n".join((h2, r2, "+", qual)) + "\n") else: h = "@%s_%i/1" % (entry_id, i) options.stdout.write("\n".join((h, read, "+", qual)) + "\n") if options.premrna_fraction: c['pre_counts'] += n_reads_pre c['pre_copies'] += n_copies_pre for i in range(0, n_reads_pre): read = generateRead(entry=pre_entry.sequence.upper(), read_length=options.read_length, error_rate=options.phred, paired=options.paired, insert_mean=options.insert_mean, insert_sd=options.insert_sd) if options.paired: r1, r2 = read h1 = "@%s_pre-mRNA_%i/1" % (entry_id, i) h2 = "@%s_pre-mRNA_%i/2" % (entry_id, i) options.stdout.write("\n".join((h1, r1, "+", qual)) + "\n") outf2.write("\n".join((h2, r2, "+", qual)) + "\n") else: h = "@%s_pre-mRNA_%i/1" % (entry_id, i) options.stdout.write("\n".join((h, read, "+", qual)) + "\n") if options.paired: outf2.close() with IOTools.open_file(options.output_counts, "w") as counts_out: counts_out.write("%s\n" % "\t".join(("id", "read_count", "tpm"))) sum_copies = sum(copies.values()) sum_counts = sum(counts.values()) for entry_id, count in counts.items(): tpm = 1000000 * (float(copies[entry_id]) / sum_copies) counts_out.write("%s\n" % "\t".join(map(str, (entry_id, count, tpm)))) E.info("Reads simulated for %i fasta entries, %i entries skipped" % (c['not_skipped'], c['skipped'])) E.info("Simulated: %i reads (%i mRNA, %i pre-mRNA), " "%f transcripts (%f mRNA, %f pre-mRNA)" % (sum_counts + c['pre_counts'], sum_counts, c['pre_counts'], sum_copies + c['pre_copies'], sum_copies, c['pre_copies'])) E.stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser( version= "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $", usage=globals()["__doc__"]) parser.add_option("-t", "--template-bam-file", dest="filename_genome_bam", type="string", help="input bam file for header information [%default]") parser.add_option("-s", "--contigs-tsv-file", dest="filename_contigs", type="string", help="filename with contig sizes [%default]") parser.add_option( "-o", "--colour", dest="colour_mismatches", action="store_true", help="mismatches will use colour differences (CM tag) [%default]") parser.add_option("-i", "--ignore-mismatches", dest="ignore_mismatches", action="store_true", help="ignore mismatches [%default]") parser.add_option( "-c", "--remove-contigs", dest="remove_contigs", type="string", help="','-separated list of contigs to remove [%default]") parser.add_option("-f", "--force-output", dest="force", action="store_true", help="force overwriting of existing files [%default]") parser.add_option("-u", "--unique", dest="unique", action="store_true", help="remove reads not matching uniquely [%default]") parser.set_defaults( filename_genome_bam=None, filename_gtf=None, filename_mismapped=None, remove_contigs=None, force=False, unique=False, colour_mismatches=False, ignore_mismatches=False, ) # add common options (-h/--help, ...) and parse command line (options, args) = E.start(parser, argv=argv) genomefile, referencenames, referencelengths = None, None, None if options.filename_genome_bam: genomefile = pysam.AlignmentFile(options.filename_genome_bam, "rb") elif options.filename_contigs: contigs = IOTools.ReadMap(IOTools.open_file(options.filename_contigs)) data = list(zip(*list(contigs.items()))) referencenames, referencelengths = data[0], list(map(int, data[1])) else: raise ValueError( "please provide either --template-bam-file or --contigs-tsv-file") infile = pysam.AlignmentFile("-", "rb") outfile = pysam.AlignmentFile("-", "wb", template=genomefile, referencenames=referencenames, referencelengths=referencelengths) if options.colour_mismatches: tag = "CM" else: tag = "NM" nambiguous = 0 ninput = 0 nunmapped = 0 ncigar = 0 nfull = 0 noutput = 0 contig2tid = dict([(y, x) for x, y in enumerate(outfile.references)]) for qname, readgroup in itertools.groupby(infile, lambda x: x.qname): ninput += 1 reads = list(readgroup) if reads[0].is_unmapped: nunmapped += 1 continue # filter for best match best = min([x.opt(tag) for x in reads]) reads = [x for x in reads if x.opt(tag) == best] if len(reads) > 1: nambiguous += 1 continue read = reads[0] # reject complicated matches (indels, etc) # to simplify calculations below. 
if len(read.cigar) > 1: ncigar += 1 continue # set NH flag to latest count t = dict(read.tags) t['NH'] = 1 read.tags = list(t.items()) sname = infile.getrname(read.tid) contig, first_exon_start, middle, last_exon_end, splice, strand = sname.split( "|") first_exon_end, last_exon_start = middle.split("-") first_exon_start, first_exon_end, last_exon_start, last_exon_end = list( map(int, (first_exon_start, first_exon_end, last_exon_start, last_exon_end))) first_exon_end += 1 total = first_exon_end - first_exon_start + \ last_exon_end - last_exon_start first_exon_length = first_exon_end - first_exon_start match1 = first_exon_length - read.pos intron_length = last_exon_start - first_exon_end match2 = read.qlen - match1 # match lies fully in one exon - ignore if match1 <= 0 or match2 <= 0: nfull += 1 continue # increment pos read.pos = first_exon_start + read.pos read.tid = contig2tid[contig] # 3 = BAM_CREF_SKIP read.cigar = [(0, match1), (3, intron_length), (0, match2)] outfile.write(read) noutput += 1 outfile.close() if genomefile: genomefile.close() c = E.Counter() c.input = ninput c.output = noutput c.full = nfull c.cigar = ncigar c.ambiguous = nambiguous c.unmapped = nunmapped E.info("%s" % str(c)) # write footer and output benchmark information. E.stop()
def loadIntervals(infile, outfile):
    '''load intervals from :term:`bed` formatted files into the database.

    If a :term:`bam` file is associated with a :term:`bed` file,
    re-evaluate the intervals by counting reads within the interval.
    In contrast to the initial pipeline, the genome is not binned.

       nprobes: number of reads in interval
       peakcenter: position with maximum number of reads in interval
       avgval: average coverage within interval
    '''
    tmpfile = P.get_temp_file(".")

    headers = ("avgval", "disttostart",
               "genelist", "length",
               "peakcenter", "peakval",
               "position", "interval_id",
               "npeaks", "nprobes",
               "contig", "start", "end", "score", "strand")

    tmpfile.write("\t".join(headers) + "\n")

    (avgval, contig, disttostart, end, genelist,
     length, peakcenter, peakval, position,
     start, interval_id, npeaks, nprobes) = \
        0, "", 0, 0, "", 0, 0, 0, 0, 0, 0, 0, 0

    track = Sample(filename=P.snip(infile, ".bed.gz"))

    bamfiles, offsets = getAssociatedBAMFiles(track)

    if bamfiles:
        E.info("%s: associated bamfiles = %s" % (track, bamfiles))
    else:
        E.info("%s: no bamfiles associated" % (track))

    # open all bamfiles
    samfiles = [pysam.Samfile(fn, "rb") for fn in bamfiles]

    c = E.Counter()

    # count tags
    for bed in Bed.iterator(IOTools.open_file(infile, "r")):

        c.input += 1

        if "name" not in bed:
            bed.name = c.input

        try:
            strand = bed["strand"]
        except IndexError:
            strand = "."

        # The fifth field of a bed file can be used to supply a
        # score. Our iterator returns the optional fields as a "fields
        # array". The first of these is the interval name, and the
        # second the score. The score may be "more is better" or
        # "less is better".
        if len(bed.fields) > 1:
            value = bed.fields[1]
            if value != "":
                score = value
            else:
                score = 1
        else:
            score = 1

        if samfiles:
            npeaks, peakcenter, length, avgval, peakval, nprobes = \
                PipelinePeakcalling.countPeaks(bed.contig,
                                               bed.start,
                                               bed.end,
                                               samfiles,
                                               offsets)
            if nprobes == 0:
                c.skipped_reads += 1

        else:
            # deal with bed12
            bed_intervals = bed.toIntervals()
            length = sum([e - s for s, e in bed_intervals])
            mid_point = length / 2
            for s, e in bed_intervals:
                peakcenter = s + mid_point
                if peakcenter >= e:
                    mid_point = peakcenter - e
                else:
                    break

            npeaks, avgval, peakval, nprobes = \
                (1, 1, 1, 1)

        c.output += 1
        tmpfile.write("\t".join(map(
            str,
            (avgval, disttostart, genelist, length,
             peakcenter, peakval, position, bed.name,
             npeaks, nprobes,
             bed.contig, bed.start, bed.end, score, strand))) + "\n")

    if c.output == 0:
        E.warn("%s - no aggregate intervals" % track)

    tmpfile.close()

    P.load(tmpfile.name,
           outfile,
           tablename=os.path.basename("%s_intervals" % track.asTable()),
           options="--allow-empty-file "
           "--add-index=interval_id")

    os.unlink(tmpfile.name)

    E.info("%s\n" % str(c))
def createGOFromGeneOntology(infile, outfile):
    """get GO assignments from geneontology.org

    GO terms are mapped to ensembl gene names via uniprot identifiers.

    Configuration
    -------------
    geneontology_file
       Filename of the gene association file from the geneontology
       database, e.g., gene_association.goa_human.gz
    database_name
       Pipeline database name

    Arguments
    ---------
    infile : string
        Unused
    outfile : string
        Output filename
    """

    filename = os.path.join(os.path.dirname(outfile), "geneontology.goa.gz")
    if not os.path.exists(filename):
        statement = '''
        wget -O %(filename)s http://cvsweb.geneontology.org/cgi-bin/cvsweb.cgi/go/gene-associations/%(go_geneontology_file)s?rev=HEAD
        '''
        P.run(statement)

    # see http://www.geneontology.org/gene-associations/readme/goa.README
    Data = collections.namedtuple(
        "Data",
        "db db_object_id db_object_symbol qualifier goid dbreference evidence "
        " with_id aspect "
        " db_object_name synonym db_object_type "
        " taxon_id date assigned_by "
        " annotation_extension"
        " gene_product_form_id")

    dbh = sqlite3.connect(PARAMS["database_name"])
    cc = dbh.cursor()
    map_uniprot2ensembl = dict(
        cc.execute("SELECT DISTINCT gene_name, gene_id FROM transcript_info")
        .fetchall())
    map_goid2description = dict(
        cc.execute("SELECT DISTINCT go_id, description FROM go_assignments")
        .fetchall())

    aspect2name = {
        "P": "biol_process",
        "F": "mol_function",
        "C": "cell_location"}

    c = E.Counter()
    found_uniprot, found_genes, notfound_uniprot = set(), set(), set()
    outf = IOTools.open_file(outfile, "w")
    outf.write("go_type\tgene_id\tgo_id\tdescription\tevidence\n")

    for line in IOTools.open_file(filename):
        if line.startswith("!"):
            continue
        c.input += 1
        data = Data._make(line[:-1].split("\t"))

        if data.db_object_symbol in map_uniprot2ensembl:
            gene_id = map_uniprot2ensembl[data.db_object_symbol]
            found_uniprot.add(data.db_object_symbol)
            found_genes.add(gene_id)
            outf.write(
                "%s\t%s\t%s\t%s\t%s\n" %
                (aspect2name[data.aspect],
                 gene_id,
                 data.goid,
                 map_goid2description.get(data.goid, ""),
                 data.evidence))
            c.output += 1

        else:
            c.notfound += 1
            notfound_uniprot.add(data.db_object_symbol)

    c.found_genes = len(found_genes)
    c.found_uniprot = len(found_uniprot)
    c.notfound_uniprot = len(notfound_uniprot)

    E.info("%s" % str(c))
    E.info("not found=%s" % str(notfound_uniprot))

    outf.close()
def imputeGO(infile_go, infile_paths, outfile):
    """impute GO accessions.

    Output a list of gene-to-GO associations for genes that includes
    ancestral terms.

    Arguments
    ---------
    infile_go : string
        Filename with gene-to-GO associations for genes
    infile_paths : string
        Filename with paths of term to ancestor (see go2fmt.pl).
    outfile : string
        Output filename
    """

    c = E.Counter()
    term2ancestors = collections.defaultdict(set)
    with IOTools.open_file(infile_paths) as inf:
        for line in inf:
            parts = line[:-1].split()
            term = parts[0]
            ancestors = [parts[x] for x in range(2, len(parts), 2)]
            # there can be multiple paths
            term2ancestors[term].update(ancestors)

    goid2description = {}
    gene2goids = collections.defaultdict(list)
    goid2type = {}
    with IOTools.open_file(infile_go) as inf:
        for line in inf:
            if line.startswith("go_type"):
                continue
            go_type, gene_id, goid, description, evidence = line[:-1].split(
                "\t")
            gene2goids[gene_id].append(goid)
            goid2description[goid] = description
            goid2type[goid] = go_type

    outf = IOTools.open_file(outfile, "w")
    for gene_id, in_goids in gene2goids.items():
        c.genes += 1
        out_goids = set(in_goids)
        for goid in in_goids:
            out_goids.update(term2ancestors[goid])
        if len(in_goids) != len(out_goids):
            c.increased += 1
        else:
            c.complete += 1

        for goid in out_goids:
            outf.write("\t".join(
                (goid2type.get(goid, ""), gene_id, goid,
                 goid2description.get(goid, ""), "NA")) + "\n")
            c.associations += 1

    outf.close()

    E.info("%s" % str(c))
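# Worked sketch (assumption): build tiny example inputs to illustrate the
# file formats imputeGO() expects; the file names and GO identifiers below
# are made up for illustration only.
def exampleImputeGO(tmpdir="."):
    go_file = os.path.join(tmpdir, "example_go.tsv")
    paths_file = os.path.join(tmpdir, "example_paths.tsv")
    with IOTools.open_file(go_file, "w") as outf:
        outf.write("go_type\tgene_id\tgo_id\tdescription\tevidence\n")
        outf.write("biol_process\tgene1\tGO:0000002\tchild term\tIEA\n")
    with IOTools.open_file(paths_file, "w") as outf:
        # go2fmt.pl-style path: term, relation, ancestor[, relation, ancestor, ...]
        outf.write("GO:0000002 is_a GO:0000001\n")
    # gene1 is then reported with both GO:0000002 and its ancestor GO:0000001
    imputeGO(go_file, paths_file, os.path.join(tmpdir, "example_imputed.tsv"))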
def __init__(self, infile, *args, **kwargs):
    self.gff = pysam.tabix_iterator(IOTools.open_file(infile),
                                    parser=pysam.asGFF3())
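# Minimal sketch (assumption) of consuming the iterator constructed above; the
# file name is a placeholder and the fields accessed on each record follow
# pysam's GFF3 record proxy (contig, start, end, feature).
def example_iterate_gff3(infile="annotations.gff3.gz"):
    gff = pysam.tabix_iterator(IOTools.open_file(infile),
                               parser=pysam.asGFF3())
    for record in gff:
        print(record.contig, record.start, record.end, record.feature)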
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser( version= "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $", usage=globals()["__doc__"]) parser.add_option("--library-source", dest="library_source", type="string", help="supply help") parser.add_option("--library-selection", dest="library_selection", type="string", help="supply help") parser.add_option("--tax-identifier", dest="tax_id", type="int", help="supply help") parser.set_defaults(library_source=None, library_selection=None, tax_id=9606) # add common options (-h/--help, ...) and parse command line (options, args) = E.start(parser, argv=argv) # tree = ET.parse('/ifs/home/andreas/ena.xml') # root = tree.getroot() # for study in root.findall("STUDY"): # alias = study.attrib["alias"] # center_name = study.attrib["center_name"] # accession = study.attrib["accession"] # try: # description = study.find("*/STUDY_DESCRIPTION").text # description = description.encode('ascii', 'ignore') # except AttributeError: # description = "" # options.stdout.write( "\t".join( (alias, # accession, # center_name, # description ) ) + "\n") # query_url = "http://www.ebi.ac.uk/ena/data/warehouse/search?query=%22tax_eq%289606%29%20AND%20library_source=%22TRANSCRIPTOMIC%22%20AND%20%28instrument_model=%22Illumina%20Genome%20Analyzer%20II%22%20OR%20instrument_model=%22Illumina%20Genome%20Analyzer%22%20OR%20instrument_model=%22Illumina%20Genome%20Analyzer%20IIx%22%20OR%20instrument_model=%22Illumina%20HiScanSQ%22%20OR%20instrument_model=%22Illumina%20HiSeq%201000%22%20OR%20instrument_model=%22Illumina%20HiSeq%202000%22%20OR%20instrument_model=%22Illumina%20HiSeq%202500%22%29%22&domain=read&download=txt" # query_url = "http://www.ebi.ac.uk/ena/data/view/search?query=%22tax_eq%289606%29%20AND%20library_source=%22TRANSCRIPTOMIC%22%20AND%20%28instrument_model=%22Illumina%20Genome%20Analyzer%20II%22%20OR%20instrument_model=%22Illumina%20Genome%20Analyzer%22%20OR%20instrument_model=%22Illumina%20Genome%20Analyzer%20IIx%22%20OR%20instrument_model=%22Illumina%20HiScanSQ%22%20OR%20instrument_model=%22Illumina%20HiSeq%201000%22%20OR%20instrument_model=%22Illumina%20HiSeq%202000%22%20OR%20instrument_model=%22Illumina%20HiSeq%202500%22%29%22&domain=read&download=txt" # query_url = "http://www.ebi.ac.uk/ena/data/warehouse/search?query=%22(instrument_model=%22Illumina%20HiSeq%202000%22%20OR%20instrument_model=%22Illumina%20HiSeq%201000%22%20OR%20instrument_model=%22Illumina%20HiSeq%202500%22)%20AND%20library_layout=%22PAIRED%22%20AND%20library_source=%22TRANSCRIPTOMIC%22%22&domain=read" # query_url = "http://www.ebi.ac.uk/ena/data/view/A00145&display=xml" query_url = "http://www.ebi.ac.uk/ena/data/warehouse/search" data_url = "http://www.ebi.ac.uk/ena/data/view" # params = None # query_url = "http://www.ebi.ac.uk/ena/data/view/DRP000011&display=xml" fields = [ 'base_count', 'read_count', 'instrument_model', 'scientific_name', 'library_layout', 'library_source', 'library_strategy', 'library_selection', 'experiment_accession', 'experiment_title', 'study_accession', 'study_title', 'first_public', 'submission_accession', 'center_name', ] query = 'tax_eq(%i) AND instrument_platform="ILLUMINA"' % (options.tax_id) if options.library_source: query += ' AND library_source="%s" ' % options.library_source if options.library_selection: query += ' AND library_selection="%s" ' % options.library_selection # collect pre-study 
results params = urlencode({ 'query': query, 'display': 'report', 'fields': ",".join(fields), 'result': 'read_run' }) E.debug("?".join((query_url, params))) lines = urlopen(query_url, params) header = lines.readline() fields.insert(0, 'run_accession') DATA = collections.namedtuple("DATA", fields) fields.append("read_length") fields.append("design") table_study = options.stdout # IOTools.open_file( "study.tsv", "w" ) table_study.write("\t".join(fields) + "\n") # collect a list of all studies studies = set() for line in lines: # line endings are \r\n for data, but only \n for header line = line[:-2] data = DATA(*line.split("\t")) try: read_length = float(data.base_count) / float(data.read_count) except ValueError: read_length = 0 if data.library_layout == "PAIRED": read_length /= 2.0 design = MAP_CODE2DESIGN.get( (data.library_selection, data.library_source), "other") table_study.write(line + "\t" + str(read_length) + "\t" + design + "\n") studies.add(data.study_accession) table_studies = IOTools.open_file("studies.tsv", "w") studies_fields = ["study_accession", "nreferences", "pubmed_ids"] table_studies.write("\t".join(studies_fields) + "\n") return # params = urllib.urlencode( { 'display' : 'xml' } ) # url = "/".join( ( data_url, 'SRP013999') ) + "&" + params # print urllib2.urlopen( url ).read() for study_accession in studies: # get additional info params = urlencode({'display': 'xml'}) url = "/".join((data_url, study_accession)) + "&" + params info_lines = urlopen(url) tree = ET.parse(info_lines) root = tree.getroot() pmids = [] for link in root.findall('*//XREF_LINK'): db = link.find('DB').text if db == "pubmed": pmids.append(link.find('ID').text) # get geo geos = [] for attribute in root.findall('*//STUDY_ATTRIBUTE'): if attribute.find('TAG').text == "GEO Accession": geos.append(attribute.find('VALUE').text) params = { 'dbfrom': 'gds', 'db': 'pubmed', } geo_pmids = [] for geo in geos: Entrez.email = "*****@*****.**" handle = Entrez.esearch(db="gds", retmax=1, term=geo) record = Entrez.read(handle) uids = record['IdList'] handle.close() for uid in uids: record = Entrez.read( Entrez.elink(dbfrom="gds", dbto="pubmed", id=uid)) linksets = record[0]["LinkSetDb"] if not linksets: continue assert len(linksets) == 1 for linksetdb in linksets: geo_pmids = [x['Id'] for x in linksetdb["Link"]] if not pmids: pmids = geo_pmids table_studies.write("\t".join( map(str, (study_accession, len(pmids), ",".join(pmids), len(geos), ",".join(geos)))) + "\n") # write footer and output benchmark information. E.stop()
def main(argv=None): parser = E.OptionParser( version= "%prog version: $Id: plot_histogram.py 2782 2009-09-10 11:40:29Z andreas $", usage=globals()["__doc__"]) parser.add_option("-l", "--plot-legend", dest="legend", type="string", help="legend for plot [default=%default].") parser.add_option("-t", "--title", dest="title", type="string", help="title for plot [default=%default].") parser.add_option( "-p", "--hardcopy", dest="hardcopy", type="string", help= "filename for hardcopy of plot. The extension defines the format. Known extensions are: 'emf, eps, jpeg, jpg, pdf, png, ps, raw, rgba, svg, svgz' [default=%default].", metavar="FILE") parser.add_option("", "--xrange", dest="xrange", type="string", help="x viewing range of plot [default=%default].") parser.add_option("", "--yrange", dest="yrange", type="string", help="y viewing range of plot[default=%default].") parser.add_option("-o", "--logscale", dest="logscale", type="string", help="use logscale on x, y or xy [default=%default]") parser.add_option("-x", "--xtitle", dest="xtitle", type="string", help="title for x axis [default=%default]") parser.add_option("-y", "--ytitle", dest="ytitle", type="string", help="title for y axis [default=%default]") parser.add_option("-d", "--dpi", dest="dpi", type="int", help="dpi of images [default=%default]") parser.add_option("-n", "--normalize", dest="normalize", action="store_true", help="normalize histograms [default=%default]") parser.add_option( "--cumulate", dest="cumulate", action="store_true", help="calculate cumulative histogram [default=%default].") parser.add_option( "--reverse-cumulate", dest="reverse_cumulate", action="store_true", help= "calculate cumulative histogram in reverse order [default=%default].") parser.add_option("--legend-location", dest="legend_location", type="choice", choices=("upper left", "upper right", "lower left", "lower right", "center", "center right", "center left", "none"), help="location of legend [default=%default]") parser.add_option("--backend", dest="backend", type="string", help="backend to use [Agg|SVG|PS] [default=%default]") parser.add_option( "--symbols", dest="symbols", type="string", help="symbols to use for each histogram [steps|...] [default=%default]." ) parser.add_option("--dump", dest="dump", action="store_true", help="dump data for debug purposes [default=%default].") parser.add_option("-c", "--columns", dest="columns", type="string", help="columns to use for plotting [default=%default].") parser.add_option( "--truncate", dest="truncate", action="store_true", help= "truncate date within x range. If not set, xrange is simply a viewing range [default=%default]." ) parser.add_option("--as-lines", dest="as_lines", action="store_true", help="plot only lines, no symbols [default=%default].") parser.add_option( "--noheaders", dest="headers", action="store_false", help="do not take first input line as header [default=%default].") parser.add_option("--stacked", dest="stacked", action="store_true", help="do a stacked plot [default=%default].") parser.add_option("--add-function", dest="function", type="string", help="add a function to the plot [default=%default].") parser.add_option( "--add-error-bars", dest="error_bars", type="choice", choices=("interleaved", "blocked"), help= "add error bars. The input format is 'interleaved' or 'blocked'. In the interleaved format the error follows each column. I the blocked format first the data, then the errors in the same order [default=%default]." 
) parser.set_defaults( legend=None, title=None, hardcopy=None, logscale=None, xtitle=None, ytitle=None, xrange=None, yrange=None, normalize=None, columns="all", headers=True, legend_location="upper right", backend="cairo", symbols="g-D,b-h,r-+,c-+,m-+,y-+,k-o,g-^,b-<,r->,c-D,m-h", dump=False, truncate=False, cumulate=False, reverse_cumulate=False, function=None, add_error_bars=None, as_lines=False, stacked=False, dpi=80, ) (options, args) = E.start(parser) # import matplotlib/pylab. Has to be done here # for batch scripts without GUI. import matplotlib if options.hardcopy: matplotlib.use("cairo") import pylab # put this method here (because it requires pylab) def doStackedPlot(data, legend): colors = [ "red", "blue", "green", "cyan", "magenta", "yellow", "brown", "silver", "purple", "lightyellow", "black", "ivory", "pink", "orange", "gray", "teal" ] ax = data[:, 0] xvals = numpy.concatenate((ax, ax[::-1])) y_top = numpy.zeros(len(ax)) min_y = min(data[:, 1:].flat) max_y = min_y new_legend, dummy_lines = [], [] for i in range(1, len(legend)): new_y_top = y_top + data[:, i] yvals = numpy.concatenate((new_y_top, y_top[::-1])) p = pylab.fill(xvals, yvals, colors[i % len(colors)]) y_top = new_y_top max_y = max(y_top) dummy_lines.append( pylab.plot(xvals, yvals, colors[i % len(colors)])) new_legend.append(legend[i]) if not options.xrange: options.xrange = min(data[:, 0]), max(data[:, 0]) if not options.yrange: options.yrange = 0, max_y return dummy_lines, new_legend if options.as_lines: options.symbols = [] for y in ("-", ":", "--"): for x in "gbrcmyk": options.symbols.append(y + x) else: options.symbols = options.symbols.split(",") if options.xrange: options.xrange = list(map(float, options.xrange.split(","))) if options.yrange: options.yrange = list(map(float, options.yrange.split(","))) # Added support for (inclusive) range format: "1,3,5,7-100" (Gerton # 13/12/06) if options.columns != "all": cols = [] for d in options.columns.split(','): colopts = d.split('-') if len(colopts) == 2: cols += list(range(int(colopts[0]), int(colopts[1]) + 1)) else: cols += [int(d) - 1] options.columns = cols if args: if args[0] == "-": infile = sys.stdin else: infile = IOTools.open_file(args[0], "r") else: infile = sys.stdin if options.truncate: xr = options.xrange else: xr = None data, legend = IOTools.readTable(infile, numeric_type=numpy.float, take=options.columns, headers=options.headers, truncate=xr) if infile != sys.stdin: infile.close() if len(data) == 0: # or data is None: E.info("empty table: no plot") E.stop() return nrows, ncols = data.shape # note: because of MA, iteration makes copy of slices # Solution: inplace edits. 
if options.cumulate: if options.add_error_bars: raise ValueError("can not add error bars to cumulative histogram") if data.mask.any(): # cumsum does not work with masked arrays, so do it manually for y in range(1, ncols): c = 0 for x in range(0, nrows): if not data.mask[x, y]: data[x, y] += c c = data[x, y] else: for x in range(1, ncols): data[:, x] = data[:, x].cumsum() elif options.reverse_cumulate: if options.add_error_bars: raise ValueError("can not add error bars to cumulative histogram") if data.mask.any(): l = [0] * ncols for x in range(nrows - 1, -1, -1): for y in range(1, ncols): if not data.mask[x, y]: data[x, y] += l[y] l[y] = data[x, y] else: l = [0] * ncols for x in range(nrows - 1, -1, -1): for y in range(1, ncols): data[x, y] += l[y] l[y] = data[x, y] if options.normalize: if options.add_error_bars: raise ValueError("can not add error bars to normalized histogram") if data.mask.any(): m = [0] * ncols for x in range(nrows): for y in range(1, ncols): if not data.mask[x, y]: m[y] = max(m[y], float(data[x, y])) for y in range(1, ncols): if m[y] == 0: m[y] = 1.0 for x in range(nrows): for y in range(1, ncols): data[x, y] = data[x, y] / m[y] else: for x in range(1, ncols): m = float(data[:, x].max()) data[:, x] /= m if options.legend: legend = options.legend.split(",") if options.dump: for d in data: print(d) if options.title: pylab.title(options.title) if options.xtitle: pylab.xlabel(options.xtitle) else: pylab.xlabel(legend[0]) if options.ytitle: pylab.ylabel(options.ytitle) lines = [] # use dummy_lines to workaround a bug in errorbars that # causes the line styles to be set incorrectly. dummy_lines = [] new_legend = [] if options.error_bars: if options.error_bars == "interleaved": step_size = 2 max_size = len(legend) elif options.error_bars == "blocked": step_size = 1 max_size = (len(legend) - 1) / 2 else: step_size = 1 max_size = len(legend) if options.stacked: dummy_lines, new_legend = doStackedPlot(data, legend) else: nplotted = 0 nskipped = 0 for x in range(1, max_size, step_size): s = options.symbols[nplotted % len(options.symbols)] yvals = data[:, x] xvals = numpy.ma.masked_array(data[:, 0], numpy.ma.getmask(yvals)) xvals = xvals.compressed() yvals = yvals.compressed() if len(xvals) == 0: E.warn("skipped empty column %i: %s" % (x, legend[x])) if options.error_bars == "interleaved": yerr = data[:, x + 1] yerr = yerr.compressed() else: yerr = None lines.append(pylab.errorbar(xvals, yvals, yerr=yerr, fmt=s)) dummy_lines.append(pylab.plot(xvals, yvals, s)) new_legend.append(legend[x]) nplotted += 1 E.info("nplotted=%i, nskipped=%i" % (nplotted, nskipped)) if len(lines) == 0: E.stop() return if options.legend_location != "none": pylab.figlegend(dummy_lines, new_legend, options.legend_location) if options.logscale: if "x" in options.logscale: pylab.gca().set_xscale('log') if "y" in options.logscale: pylab.gca().set_yscale('log') if options.xrange: pylab.xlim(options.xrange) if options.yrange: pylab.ylim(options.yrange) if options.function: xstart, xend = pylab.gca().get_xlim() increment = (xend - xstart) / 100.0 exec(("f = lambda x: %s" % options.function), locals()) xvals, yvals = [], [] for x in range(0, 100): xvals.append(xstart) yvals.append(f(xstart)) xstart += increment xvals.append(xstart) yvals.append(f(xstart)) pylab.plot(xvals, yvals) if options.hardcopy: pylab.savefig(os.path.expanduser(options.hardcopy), dpi=options.dpi) else: pylab.show() E.stop()
def read_fastq_screen(infiles, track_regex, sep="-"): """merge fastqscreen output into dataframes. Arguments --------- infiles : string Input filename with fastqscreen output. regex_track: string Regular expression to extract track name from filename. sep: char Separator for merging multiple capture groups in regex. Returns ------- multiple dataframes """ dfs, tracks, summaries = [], [], [] for infile in infiles: try: track = sep.join(re.search(track_regex, infile).groups()) except AttributeError: raise ValueError("regex {} did not match file {}".format( track_regex, infile)) with IOTools.open_file(infile) as inf: lines = inf.readlines() version, aligner, reads = re.search( "#Fastq_screen version: (\S+)\t#Aligner: (\S+)\t#Reads in subset: (\d+)\n", lines.pop(0)).groups() percent_no_hit = re.search("%Hit_no_genomes: (\S+)\n", lines.pop(-1)).groups()[0] summaries.append((version, aligner, reads, percent_no_hit)) records = [x[:-1].split("\t") for x in lines if x.strip()] df = pd.DataFrame.from_records(records[1:], columns=records[0]) df = df.rename( columns={ 'Genome': "genome", '#Reads_processed': "reads_processed", '#Unmapped': "reads_unmapped", '%Unmapped': "reads_unmapped_percent", '#One_hit_one_genome': "one_hit_one_genome", '%One_hit_one_genome': "one_hit_one_genome_percent", '#Multiple_hits_one_genome': "multiple_hits_one_genome", '%Multiple_hits_one_genome': "multiple_hits_one_genome_percent", '#One_hit_multiple_genomes': "one_hit_multiple_genomes", '%One_hit_multiple_genomes': "one_hit_multiple_genomes_percent", 'Multiple_hits_multiple_genomes': "multiple_hits_multiple_genomes", '%Multiple_hits_multiple_genomes': "multiple_hits_multiple_genomes" }) dfs.append(df) tracks.append(track) df_details = pd.concat(dfs, keys=tracks, names=["track"]) df_details.index = df_details.index.droplevel(1) df_summary = pd.DataFrame.from_records( summaries, columns=["version", "aligner", "nreads", "nohit_percent"], index=tracks) df_summary.index.name = "track" return df_summary, df_details
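# Hedged usage sketch for read_fastq_screen: the directory layout, the track
# regex and the numbers below are invented for illustration. It only shows how
# the track name is derived from the filename and how the summary frame returned
# by the function is indexed.
import re

import pandas as pd

track_regex = r"screen/(\S+?)_(R[12])\.fastq_screen\.txt"    # hypothetical layout
infile = "screen/sampleA_R1.fastq_screen.txt"

track = "-".join(re.search(track_regex, infile).groups())    # sep="-" as in the function
print(track)                                                 # sampleA-R1

# one (version, aligner, reads, %no-hit) tuple per input file
summaries = [("0.13", "bowtie2", "100000", "2.5")]
df_summary = pd.DataFrame.from_records(
    summaries,
    columns=["version", "aligner", "nreads", "nohit_percent"],
    index=[track])
df_summary.index.name = "track"
print(df_summary)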
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("--is-gtf", dest="is_gtf", action="store_true", help="input is gtf instead of gff.") parser.add_option("-g", "--genome-file", dest="genome_file", type="string", help="filename with genome [default=%default].") parser.add_option("-m", "--merge-adjacent", dest="merge", action="store_true", help="merge adjacent intervals with the same attributes." " [default=%default]") parser.add_option("-e", "--feature", dest="feature", type="string", help="filter by a feature, for example 'exon', 'CDS'." " If set to the empty string, all entries are output " "[%default].") parser.add_option("-f", "--maskregions-bed-file", dest="filename_masks", type="string", metavar="gff", help="mask sequences with regions given in gff file " "[%default].") parser.add_option("--remove-masked-regions", dest="remove_masked_regions", action="store_true", help="remove regions instead of masking [%default].") parser.add_option("--min-interval-length", dest="min_length", type="int", help="set minimum length for sequences output " "[%default]") parser.add_option("--max-length", dest="max_length", type="int", help="set maximum length for sequences output " "[%default]") parser.add_option("--extend-at", dest="extend_at", type="choice", choices=("none", "3", "5", "both", "3only", "5only"), help="extend at no end, 3', 5' or both ends. If " "3only or 5only are set, only the added sequence " "is returned [default=%default]") parser.add_option("--header-attributes", dest="header_attr", action="store_true", help="add GFF entry attributes to the FASTA record" " header section") parser.add_option("--extend-by", dest="extend_by", type="int", help="extend by # bases [default=%default]") parser.add_option("--extend-with", dest="extend_with", type="string", help="extend using base [default=%default]") parser.add_option("--masker", dest="masker", type="choice", choices=("dust", "dustmasker", "softmask", "none"), help="apply masker [%default].") parser.add_option("--fold-at", dest="fold_at", type="int", help="fold sequence every n bases[%default].") parser.add_option( "--fasta-name-attribute", dest="naming_attribute", type="string", help="use attribute to name fasta entry. 
Currently only compatable" " with gff format [%default].") parser.set_defaults( is_gtf=False, genome_file=None, merge=False, feature=None, filename_masks=None, remove_masked_regions=False, min_length=0, max_length=0, extend_at=None, extend_by=100, extend_with=None, masker=None, fold_at=None, naming_attribute=False, header_attr=False, ) (options, args) = E.start(parser) if options.genome_file: fasta = IndexedFasta.IndexedFasta(options.genome_file) contigs = fasta.getContigSizes() if options.is_gtf: iterator = GTF.transcript_iterator(GTF.iterator(options.stdin)) else: gffs = GTF.iterator(options.stdin) if options.merge: iterator = GTF.joined_iterator(gffs) else: iterator = GTF.chunk_iterator(gffs) masks = None if options.filename_masks: masks = {} with IOTools.open_file(options.filename_masks, "r") as infile: e = GTF.readAsIntervals(GTF.iterator(infile)) # convert intervals to intersectors for contig in list(e.keys()): intersector = bx.intervals.intersection.Intersecter() for start, end in e[contig]: intersector.add_interval(bx.intervals.Interval(start, end)) masks[contig] = intersector ninput, noutput, nmasked, nskipped_masked = 0, 0, 0, 0 nskipped_length = 0 nskipped_noexons = 0 feature = options.feature # iterator is a list containing groups (lists) of features. # Each group of features have in common the same transcript ID, in case of # GTF files. for ichunk in iterator: ninput += 1 if feature: chunk = [x for x in ichunk if x.feature == feature] else: chunk = ichunk if len(chunk) == 0: nskipped_noexons += 1 E.info("no features in entry from " "%s:%i..%i - %s" % (ichunk[0].contig, ichunk[0].start, ichunk[0].end, str(ichunk[0]))) continue contig, strand = chunk[0].contig, chunk[0].strand if options.is_gtf: name = chunk[0].transcript_id else: if options.naming_attribute: attr_dict = { x.split("=")[0]: x.split("=")[1] for x in chunk[0].attributes.split(";") } name = attr_dict[options.naming_attribute] else: name = str(chunk[0].attributes) lcontig = contigs[contig] positive = Genomics.IsPositiveStrand(strand) intervals = [(x.start, x.end) for x in chunk] intervals.sort() if masks: if contig in masks: masked_regions = [] for start, end in intervals: masked_regions += [(x.start, x.end) for x in masks[contig].find(start, end)] masked_regions = Intervals.combine(masked_regions) if len(masked_regions): nmasked += 1 if options.remove_masked_regions: intervals = Intervals.truncate(intervals, masked_regions) else: raise NotImplementedError("unimplemented") if len(intervals) == 0: nskipped_masked += 1 if options.loglevel >= 1: options.stdlog.write( "# skipped because fully masked: " "%s: regions=%s masks=%s\n" % (name, str([(x.start, x.end) for x in chunk]), masked_regions)) continue out = intervals if options.extend_at and not options.extend_with: if options.extend_at == "5only": intervals = [(max(0, intervals[0][0] - options.extend_by), intervals[0][0])] elif options.extend_at == "3only": intervals = [(intervals[-1][1], min(lcontig, intervals[-1][1] + options.extend_by))] else: if options.extend_at in ("5", "both"): intervals[0] = (max(0, intervals[0][0] - options.extend_by), intervals[0][1]) if options.extend_at in ("3", "both"): intervals[-1] = (intervals[-1][0], min(lcontig, intervals[-1][1] + options.extend_by)) if not positive: intervals = [(lcontig - x[1], lcontig - x[0]) for x in intervals[::-1]] out.reverse() s = [ fasta.getSequence(contig, strand, start, end) for start, end in intervals ] # IMS: allow for masking of sequences s = Masker.maskSequences(s, options.masker) l = sum([len(x) for x 
in s]) if (l < options.min_length or (options.max_length and l > options.max_length)): nskipped_length += 1 if options.loglevel >= 1: options.stdlog.write("# skipped because length out of bounds " "%s: regions=%s len=%i\n" % (name, str(intervals), l)) continue if options.extend_at and options.extend_with: extension = "".join((options.extend_with, ) * options.extend_by) if options.extend_at in ("5", "both"): s[1] = extension + s[1] if options.extend_at in ("3", "both"): s[-1] = s[-1] + extension if options.fold_at: n = options.fold_at s = "".join(s) seq = "\n".join([s[i:i + n] for i in range(0, len(s), n)]) else: seq = "\n".join(s) if options.header_attr: attributes = " ".join( [":".join([ax, ay]) for ax, ay in chunk[0].asDict().items()]) options.stdout.write( ">%s %s:%s:%s feature:%s %s\n%s\n" % (name, contig, strand, ";".join( ["%i-%i" % x for x in out]), chunk[0].feature, attributes, seq)) else: options.stdout.write( ">%s %s:%s:%s\n%s\n" % (name, contig, strand, ";".join(["%i-%i" % x for x in out]), seq)) noutput += 1 E.info("ninput=%i, noutput=%i, nmasked=%i, nskipped_noexons=%i, " "nskipped_masked=%i, nskipped_length=%i" % (ninput, noutput, nmasked, nskipped_noexons, nskipped_masked, nskipped_length)) E.stop()
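# Minimal standalone sketch of the interval handling in the gff2fasta code above:
# extend the outer intervals by a fixed amount, clip to the contig bounds and
# mirror the coordinates for minus-strand features. Names and numbers are
# illustrative only and not taken from the CGAT API.
def extend_and_orient(intervals, lcontig, extend_by, extend_at="both", positive=True):
    intervals = sorted(intervals)
    if extend_at in ("5", "both"):
        start, end = intervals[0]
        intervals[0] = (max(0, start - extend_by), end)
    if extend_at in ("3", "both"):
        start, end = intervals[-1]
        intervals[-1] = (start, min(lcontig, end + extend_by))
    if not positive:
        # mirror the coordinates, as done for minus-strand features
        intervals = [(lcontig - e, lcontig - s) for s, e in intervals[::-1]]
    return intervals


print(extend_and_orient([(100, 200), (300, 400)], 1000, 50))
# [(50, 200), (300, 450)]
print(extend_and_orient([(100, 200), (300, 400)], 1000, 50, positive=False))
# [(550, 700), (800, 950)]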
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("--inplace", dest="inplace", action="store_true", help="update option list in place. New options will" "be added to the list given by --options-tsv-file. " "Options will only be added, not removed [%default]") parser.add_option("--options-tsv-file", dest="tsv_file", type="string", help="existing table with options. Will be updated if " "--in-place is set [default]") parser.set_defaults(inplace=False, tsv_file=None) # add common options (-h/--help, ...) and parse command line (options, args) = E.start(parser, argv=argv) old_options = None if options.tsv_file: if not os.path.exists(options.tsv_file): raise OSError("filename %s not found, see --options-tsv-file" % options.tsv_file) old_options = pandas.read_csv( IOTools.open_file(options.tsv_file), sep="\t", index_col=0, ) old_options = old_options.fillna("") global ORIGINAL_START ORIGINAL_START = E.start all_options = collections.defaultdict(list) for label, expression in EXPRESSIONS: files = glob.glob(expression) files.sort() for f in files: E.debug("processing %s" % f) if os.path.isdir(f): continue if os.path.basename(f) in EXCLUDE: continue collected_options = collectOptionsFromScript(os.path.abspath(f)) for o in collected_options: all_options[o].append(f) # add old options for x in old_options.index: if x not in all_options: all_options[x].append("--") if options.inplace: outfile = IOTools.open_file(options.tsv_file, "w") E.info("updating file '%s'" % options.tsv_file) else: outfile = options.stdout outfile.write("option\taction\tcomment\talternative\tfiles\n") for o, v in sorted(all_options.items()): try: action, comment, alternative, ff = old_options.xs(o) except KeyError: action, comment, alternative, ff = "", "", "", "" if comment == "nan": comment = "" if alternative == "nan": alternative = "" outfile.write("\t".join( (list(map(str, (o, action, comment, alternative, ",".join(v)))))) + "\n") if outfile != options.stdout: outfile.close() # write footer and output benchmark information. E.stop()
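# Small sketch of the merge step above, on invented data: options newly collected
# from the scripts are unioned with options already listed in the TSV, so entries
# are only ever added, never removed.
import collections

all_options = collections.defaultdict(list)
all_options["--force-output"].append("scripts/bam2stats.py")   # newly collected
old_index = ["--force-output", "--legacy-flag"]                 # options from the old TSV

for opt in old_index:
    if opt not in all_options:
        all_options[opt].append("--")      # keep the old entry with a placeholder

for opt, files in sorted(all_options.items()):
    print("%s\t%s" % (opt, ",".join(files)))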
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-t", "--tag-tsv-file", dest="input_filename_tags", type="string", help="input file with tag counts [default=%default].") parser.add_option("-d", "--design-tsv-file", dest="input_filename_design", type="string", help="input file with experimental design " "[default=%default].") parser.add_option("-m", "--method", dest="method", type="choice", choices=("ttest", "sleuth", "edger", "deseq2", "mock", "dexseq"), help="differential expression method to apply " "[default=%default].") parser.add_option("--deseq2-dispersion-method", dest="deseq2_dispersion_method", type="choice", choices=("pooled", "per-condition", "blind"), help="dispersion method for deseq2 [default=%default].") parser.add_option("--deseq2-fit-type", dest="deseq2_fit_type", type="choice", choices=("parametric", "local"), help="fit type for deseq2 [default=%default].") parser.add_option("--edger-dispersion", dest="edger_dispersion", type="float", help="dispersion value for edgeR if there are no " "replicates [default=%default].") parser.add_option("-f", "--fdr", dest="fdr", type="float", help="fdr to apply [default=%default].") # currently not implemented # parser.add_option("-R", "--output-R-code", dest="save_r_environment", # type="string", # help="save R environment to loc [default=%default]") parser.add_option("-r", "--reference-group", dest="ref_group", type="string", help="Group to use as reference to compute " "fold changes against [default=$default]") parser.add_option("--filter-min-counts-per-row", dest="filter_min_counts_per_row", type="int", help="remove rows with less than this " "number of counts in total [default=%default].") parser.add_option("--filter-min-counts-per-sample", dest="filter_min_counts_per_sample", type="int", help="remove samples with a maximum count per sample of " "less than this number [default=%default].") parser.add_option("--filter-percentile-rowsums", dest="filter_percentile_rowsums", type="int", help="remove percent of rows with " "lowest total counts [default=%default].") parser.add_option("--model", dest="model", type="string", help=("model for GLM")) parser.add_option("--reduced-model", dest="reduced_model", type="string", help=("reduced model for LRT")) parser.add_option("--contrast", dest="contrast", type="string", help=("contrast for differential expression testing")) parser.add_option("--sleuth-counts-dir", dest="sleuth_counts_dir", type="string", help=("directory containing expression estimates" "from sleuth. Sleuth expects counts" "files to be called abundance.h5")) parser.add_option("--dexseq-counts-dir", dest="dexseq_counts_dir", type="string", help=("directory containing counts for dexseq. DEXSeq " "expects counts files to be called .txt and" "to be generated by the DEXSeq_counts.py script")) parser.add_option("--dexseq-flattened-file", dest="dexseq_flattened_file", type="string", help=("directory containing flat gtf for dexseq. 
DEXSeq " "expects this to be generated by the" "DEXSeq_prepare_annotations.py script")) parser.add_option( "--outfile-sleuth-count", dest="outfile_sleuth_count", type="string", help=("outfile for full count table generated by sleuth")) parser.add_option("--outfile-sleuth-tpm", dest="outfile_sleuth_tpm", type="string", help=("outfile for full tpm table generated by sleuth")) parser.add_option("--use-ihw", dest="use_ihw", action="store_true", help=("use the independent hypothesis weighting method " "to obtain weighted FDR")) parser.add_option( "--sleuth-genewise", dest="sleuth_genewise", action="store_true", help=("run genewise, rather than transcript level testing")) parser.add_option("--gene-biomart", dest="gene_biomart", type="string", help=("name of ensemble gene biomart")) parser.add_option("--de-test", dest="DEtest", type="choice", choices=("wald", "lrt"), help=("Differential expression test")) parser.add_option("--Rhistory", dest="Rhistory", type="string", help=("Outfile for R history")) parser.add_option("--Rimage", dest="Rimage", type="string", help=("Outfile for R image")) parser.set_defaults(input_filename_tags="-", input_filename_design=None, output_filename=sys.stdout, method="deseq2", fdr=0.1, deseq2_dispersion_method="pooled", deseq2_fit_type="parametric", edger_dispersion=0.4, ref_group=False, filter_min_counts_per_row=None, filter_min_counts_per_sample=None, filter_percentile_rowsums=None, spike_foldchange_max=4.0, spike_expression_max=5.0, spike_expression_bin_width=0.5, spike_foldchange_bin_width=0.5, spike_max_counts_per_bin=50, model=None, contrast=None, output_filename_pattern=None, sleuth_counts_dir=None, dexseq_counts_dir=None, dexseq_flattened_file=None, outfile_sleuth_count=None, outfile_sleuth_tpm=None, use_ihw=False, sleuth_genewise=False, gene_biomart=None, DEtest="wald", reduced_model=None, Rhistory=None, Rimage=None) # add common options (-h/--help, ...) and parse command line (options, args) = E.start(parser, argv=argv, add_output_options=True) RH = None if options.Rhistory or options.Rimage: RH = R.R_with_History() outfile_prefix = options.output_filename_pattern # Expression.py currently expects a refernce group for edgeR and # sleuth, regardless of which test is used if not options.ref_group and (options.method is "edger" or options.method is "sleuth"): raise ValueError( "Must provide a reference group ('--reference-group')") # create Design object design = Expression.ExperimentalDesign( pd.read_csv(IOTools.open_file(options.input_filename_design, "r"), sep="\t", index_col=0, comment="#")) if len(set(design.table[options.contrast])) > 2: if options.method == "deseq2" or options.method == "sleuth": if options.DEtest == "wald": raise ValueError( "Factor must have exactly two levels for Wald Test. " "If you have more than two levels in your factor, " "consider LRT") else: E.info('''There are more than 2 levels for the contrast specified" "(%s:%s). The log2fold changes in the results table and MA plots will be for the first two levels in the contrast. The p-value will be the p-value for the overall significance of the contrast. 
Hence, some genes will have a signficant p-value but 0-fold change between the first two levels''' % (options.contrast, set(design[options.contrast]))) # Sleuth reads in data itself so we don't need to create a counts object if options.method == "sleuth": assert options.sleuth_counts_dir, ( "need to specify the location of the abundance.h5 counts files " " (--sleuth-counts-dir)") # validate design against counts and model design.validate(model=options.model) experiment = Expression.DEExperiment_Sleuth() results = experiment.run(design, base_dir=options.sleuth_counts_dir, model=options.model, contrast=options.contrast, outfile_prefix=outfile_prefix, counts=options.outfile_sleuth_count, tpm=options.outfile_sleuth_tpm, fdr=options.fdr, genewise=options.sleuth_genewise, gene_biomart=options.gene_biomart, DE_test=options.DEtest, ref_group=options.ref_group, reduced_model=options.reduced_model) # DEXSeq reads in data itself elif options.method == "dexseq": assert options.dexseq_counts_dir, ( "need to specify the location of the .txt counts files") # create Design object design = Expression.ExperimentalDesign( pd.read_csv(IOTools.open_file(options.input_filename_design, "r"), sep="\t", index_col=0, comment="#")) # validate design against counts and model # design.validate(model=options.model) experiment = Expression.DEExperiment_DEXSeq() results = experiment.run(design, base_dir=options.dexseq_counts_dir, model=options.model, contrast=options.contrast, ref_group=options.ref_group, outfile_prefix=outfile_prefix, flattenedfile=options.dexseq_flattened_file, fdr=options.fdr) else: # create Counts object if options.input_filename_tags == "-": counts = Counts.Counts( pd.io.parsers.read_csv(sys.stdin, sep="\t", index_col=0, comment="#")) else: counts = Counts.Counts( pd.io.parsers.read_csv(IOTools.open_file( options.input_filename_tags, "r"), sep="\t", index_col=0, comment="#")) # validate design against counts and model design.validate(counts, options.model) # restrict counts to samples in design table counts.restrict(design) # remove sample with low counts if options.filter_min_counts_per_sample: counts.removeSamples( min_counts_per_sample=options.filter_min_counts_per_sample) # remove observations with low counts if options.filter_min_counts_per_row: counts.removeObservationsFreq( min_counts_per_row=options.filter_min_counts_per_row) # remove bottom percentile of observations if options.filter_percentile_rowsums: counts.removeObservationsPerc( percentile_rowsums=options.filter_percentile_rowsums) # check samples are the same in counts and design following counts # filtering and, if not, restrict design table and re-validate design.revalidate(counts, options.model) # set up experiment and run tests if options.method == "ttest": experiment = Expression.DEExperiment_TTest() results = experiment.run(counts, design) elif options.method == "edger": experiment = Expression.DEExperiment_edgeR() results = experiment.run(counts, design, model=options.model, contrast=options.contrast, outfile_prefix=outfile_prefix, ref_group=options.ref_group, fdr=options.fdr, dispersion=options.edger_dispersion) elif options.method == "deseq2": experiment = Expression.DEExperiment_DESeq2() results = experiment.run(counts, design, model=options.model, contrast=options.contrast, outfile_prefix=outfile_prefix, fdr=options.fdr, fit_type=options.deseq2_fit_type, ref_group=options.ref_group, DEtest=options.DEtest, R=RH) results.getResults(fdr=options.fdr) if options.use_ihw: results.calculateIHW(alpha=options.fdr) for contrast in 
set(results.table['contrast']): results.plotVolcano(contrast, outfile_prefix=outfile_prefix, R=RH) results.plotMA(contrast, outfile_prefix=outfile_prefix, R=RH) results.plotPvalueHist(contrast, outfile_prefix=outfile_prefix, R=RH) results.plotPvalueQQ(contrast, outfile_prefix=outfile_prefix, R=RH) results.table.to_csv(sys.stdout, sep="\t", na_rep="NA", index=False) results.summariseDEResults() # write out summary tables for each comparison/contrast for test_group in list(results.Summary.keys()): outf = IOTools.open_file( "_".join([outfile_prefix, test_group, "summary.tsv"]), "w") outf.write("category\tcounts\n%s\n" % results.Summary[test_group].asTable()) outf.close() if options.Rhistory: RH.saveHistory(options.Rhistory) if options.Rimage: RH.saveImage(options.Rimage) E.stop()
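# Standalone sketch (made-up design table) of the contrast check performed above:
# a Wald test requires exactly two levels in the contrast column, while more
# levels call for an LRT against a reduced model.
import pandas as pd

design = pd.DataFrame(
    {"condition": ["control", "control", "treated", "treated", "other"]},
    index=["s1", "s2", "s3", "s4", "s5"])

levels = set(design["condition"])
if len(levels) > 2:
    print("contrast has %i levels (%s): use an LRT rather than a Wald test"
          % (len(levels), ", ".join(sorted(levels))))
else:
    print("two levels: a Wald test is applicable")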
def main(argv=None): parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option( "-w", "--weights-tsv-file", dest="filename_weights", type="string", help="filename with codon frequencies. Multiple filenames " "can be separated by comma.") parser.add_option("-s", "--section", dest="sections", type="choice", action="append", choices=("length", "sequence", "hid", "na", "aa", "cpg", "dn", "degeneracy", "gaps", "codons", "codon-usage", "codon-translator", "codon-bias"), help="which sections to output [%default]") parser.add_option( "-t", "--sequence-type", dest="seqtype", type="choice", choices=("na", "aa"), help="type of sequence: na=nucleotides, aa=amino acids [%default].") parser.add_option( "-e", "--regex-identifier", dest="regex_identifier", type="string", help="regular expression to extract identifier from fasta " "description line.") parser.add_option("--split-fasta-identifier", dest="split_id", action="store_true", help="split fasta description line (starting >) and use " "only text before first space") parser.add_option( "--add-total", dest="add_total", action="store_true", help="add a row with column totals at the end of the table" "[%default]") parser.set_defaults( filename_weights=None, pseudocounts=1, sections=[], regex_identifier="(.+)", seqtype="na", gap_chars='xXnN', split_id=False, add_total=False, ) (options, args) = E.start(parser, argv=argv) rx = re.compile(options.regex_identifier) reference_codons = [] if options.filename_weights: options.filename_weights = options.filename_weights.split(",") for filename in options.filename_weights: if filename == "uniform": reference_codons.append(Genomics.GetUniformCodonUsage()) else: reference_codons.append( IOTools.ReadMap(IOTools.open_file(filename, "r"), has_header=True, map_functions=(str, float))) # print codon table differences options.stdlog.write( "# Difference between supplied codon usage preferences.\n") for x in range(0, len(reference_codons)): for y in range(0, len(reference_codons)): if x == y: continue # calculate KL distance a = reference_codons[x] b = reference_codons[y] d = 0 for codon, p in list(a.items()): if Genomics.IsStopCodon(codon): continue d += b[codon] * math.log(b[codon] / p) options.stdlog.write("# tablediff\t%s\t%s\t%f\n" % (options.filename_weights[x], options.filename_weights[y], d)) iterator = FastaIterator.FastaIterator(options.stdin) def getCounter(section): if options.seqtype == "na": if section == "length": s = SequenceProperties.SequencePropertiesLength() elif section == "sequence": s = SequenceProperties.SequencePropertiesSequence() elif section == "hid": s = SequenceProperties.SequencePropertiesHid() elif section == "na": s = SequenceProperties.SequencePropertiesNA() elif section == "gaps": s = SequenceProperties.SequencePropertiesGaps( options.gap_chars) elif section == "cpg": s = SequenceProperties.SequencePropertiesCpg() elif section == "dn": s = SequenceProperties.SequencePropertiesDN() # these sections requires sequence length to be a multiple of 3 elif section == "aa": s = SequenceProperties.SequencePropertiesAA() elif section == "degeneracy": s = SequenceProperties.SequencePropertiesDegeneracy() elif section == "codon-bias": s = SequenceProperties.SequencePropertiesBias(reference_codons) elif section == "codons": s = SequenceProperties.SequencePropertiesCodons() elif section == "codon-usage": s = SequenceProperties.SequencePropertiesCodonUsage() elif section == "codon-translator": s = SequenceProperties.SequencePropertiesCodonTranslator() else: 
raise ValueError("unknown section %s" % section) elif options.seqtype == "aa": if section == "length": s = SequenceProperties.SequencePropertiesLength() elif section == "sequence": s = SequenceProperties.SequencePropertiesSequence() elif section == "hid": s = SequenceProperties.SequencePropertiesHid() elif section == "aa": s = SequenceProperties.SequencePropertiesAminoAcids() else: raise ValueError("unknown section %s" % section) return s # setup totals totals = {} for section in options.sections: totals[section] = getCounter(section) options.stdout.write("id") for section in options.sections: options.stdout.write("\t" + "\t".join(totals[section].getHeaders())) options.stdout.write("\n") options.stdout.flush() s = getCounter("hid") s.loadSequence("AAAAAAAAA", "na") for cur_record in iterator: sequence = re.sub(" ", "", cur_record.sequence).upper() if len(sequence) == 0: raise ValueError("empty sequence %s" % cur_record.title) id = rx.search(cur_record.title).groups()[0] if options.split_id is True: options.stdout.write("%s" % id.split()[0]) else: options.stdout.write("%s" % id) options.stdout.flush() for section in options.sections: s = getCounter(section) s.loadSequence(sequence, options.seqtype) totals[section].addProperties(s) options.stdout.write("\t" + "\t".join(s.getFields())) options.stdout.write("\n") if options.add_total: options.stdout.write("total") for section in options.sections: options.stdout.write("\t" + "\t".join(totals[section].getFields())) options.stdout.write("\n") E.stop()
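# Standalone sketch of the codon-table comparison above: the reported "tablediff"
# is the Kullback-Leibler divergence d = sum_codon b[codon] * log(b[codon] / a[codon]),
# with stop codons skipped in the script. The two tiny tables here are invented.
import math

a = {"AAA": 0.6, "AAG": 0.4}    # codon usage table 1
b = {"AAA": 0.5, "AAG": 0.5}    # codon usage table 2

d = sum(pb * math.log(pb / a[codon]) for codon, pb in b.items())
print("tablediff\t%f" % d)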
def __main__(): # use argparse to ignore unknown options parser = argparse.ArgumentParser() parser.add_argument("--version", action="version", version="%(prog)s") parser.add_argument("--wrapper-command", dest="command", type=str) parser.add_argument("--wrapper-bam-file", dest="bam_file", type=str) parser.add_argument("--wrapper-bam-option", dest="bam_option", type=str) parser.add_argument("--wrapper-bai-file", dest="bai_file", type=str) parser.add_argument( "--wrapper-dry-run", dest="dry_run", action="store_true") parser.add_argument("--wrapper-html-dir", dest="html_dir", type=str) parser.add_argument("--wrapper-html-file", dest="html_file", type=str) options, unknown = parser.parse_known_args() cgat = CGATBase(options) option_map = [] if options.bai_file or options.bam_file: if not (options.bai_file and options.bam_file): raise ValueError( "wrapper called with bam or bai file, but not both") if not options.bam_option: options.bam_option = "bam-file" tmp_fd, tmp_name = tempfile.mkstemp() tmp_bam_name = '%s.bam' % tmp_name tmp_bai_name = '%s.bai' % tmp_bam_name os.symlink(options.bam_file, tmp_bam_name) os.symlink(options.bai_file, tmp_bai_name) if options.bam_option.startswith("--"): # long option option_map.append("%s=%s" % (options.bam_option, tmp_bam_name)) else: # short option option_map.append("%s %s" % (options.bam_option, tmp_bam_name)) if options.html_dir: os.mkdir(options.html_dir) option_map.append("%s=%s/%%s" % ("--output-filename-pattern", options.html_dir)) statement = "python " + " ".join([options.command] + unknown + option_map) if options.dry_run: sys.stdout.write(statement + "\n") return else: cgat.runStatement(statement) if options.bai_file: os.unlink(tmp_bam_name) os.unlink(tmp_bai_name) if options.html_file: with IOTools.open_file(options.html_file, "w") as outf: outf.write('<h1>%s - Output</h1>' % os.path.basename(options.wrapper_command)) for fn in glob.glob(os.path.join(options.html_dir, "*.*")): dirname, basename = os.path.split(fn) outf.write('''<li><a href="%s">%s</a></li>\n''' % (basename, basename))
def writeSequencesForIntervals(track, filename, dbhandle, full=False, halfwidth=None, maxsize=None, proportion=None, masker=[], offset=0, shuffled=False, num_sequences=None, min_sequences=None, order="peakval", shift=None): '''build a sequence set for motif discovery. Intervals are taken from the table <track>_intervals in the database *dbhandle* and save to *filename* in :term:`fasta` format. If num_shuffles is set, shuffled copies are created as well with the shuffled number appended to the filename. The sequences are masked before shuffling (is this appropriate?) If *full* is set, the whole intervals will be output, otherwise only the region around the peak given by *halfwidth* If *maxsize* is set, the output is truncated at *maxsize* characters in order to create jobs that take too long. If proportion is set, only the top *proportion* intervals are output (sorted by peakval). If *num_sequences* is set, the first *num_sequences* will be used. *masker* can be a combination of * dust, dustmasker: apply dustmasker * softmask: mask softmasked genomic regions *order* is the order by which peaks should be sorted. Possible values are 'peakval' (peak value, descending order), 'score' (peak score, descending order) If *shift* is set, intervals will be shifted. ``leftright`` creates two intervals on the left and right of the actual interval. The intervals will be centered around the mid-point and truncated the same way as the main intervals. ''' fasta = IndexedFasta.IndexedFasta( os.path.join(P.get_params()["genome_dir"], P.get_params()["genome"])) if order == "peakval": orderby = " ORDER BY peakval DESC" elif order == "max": orderby = " ORDER BY score DESC" else: raise ValueError( "Unknown value passed as order parameter, check your ini file") tablename = "%s_intervals" % P.tablequote(track) statement = '''SELECT contig, start, end, interval_id, peakcenter FROM %(tablename)s ''' % locals() + orderby cc = dbhandle.execute(statement) data = cc.fetchall() cc.close() if proportion: cutoff = int(len(data) * proportion) + 1 if min_sequences: cutoff = max(cutoff, min_sequences) elif num_sequences: cutoff = num_sequences else: cutoff = len(data) L.info( "writeSequencesForIntervals %s: using at most %i sequences for pattern finding" % (track, cutoff)) data = data[:cutoff] L.info("writeSequencesForIntervals %s: masker=%s" % (track, str(masker))) fasta = IndexedFasta.IndexedFasta( os.path.join(P.get_params()["genome_dir"], P.get_params()["genome"])) # modify the ranges if shift: if shift == "leftright": new_data = [(contig, start - (end - start), start, str(interval_id) + "_left", peakcenter) for contig, start, end, interval_id, peakcenter in data ] new_data.extend([ (contig, end, end + (end - start), str(interval_id) + "_right", peakcenter) for contig, start, end, interval_id, peakcenter in data ]) data = new_data if halfwidth: # center around peakcenter, add halfwidth on either side data = [(contig, peakcenter - halfwidth, peakcenter + halfwidth, interval_id) for contig, start, end, interval_id, peakcenter in data] else: # remove peakcenter data = [(contig, start, end, interval_id) for contig, start, end, interval_id, peakcenter in data] # get the sequences - cut at number of nucleotides sequences = [] current_size, nseq = 0, 0 new_data = [] for contig, start, end, interval_id in data: lcontig = fasta.getLength(contig) start, end = max(0, start + offset), min(end + offset, lcontig) if start >= end: L.info( "writeSequencesForIntervals %s: sequence %s is empty: start=%i, end=%i, offset=%i - ignored" % 
(track, id, start, end, offset)) continue seq = fasta.getSequence(contig, "+", start, end) sequences.append(seq) new_data.append((start, end, interval_id, contig)) current_size += len(seq) if maxsize and current_size >= maxsize: L.info( "writeSequencesForIntervals %s: maximum size (%i) reached - only %i sequences output (%i ignored)" % (track, maxsize, nseq, len(data) - nseq)) break nseq += 1 data = new_data if shuffled: # note that shuffling is done on the unmasked sequences # Otherwise N's would be interspersed with real sequence # messing up motif finding unfairly. Instead, masking is # done on the shuffled sequence. sequences = [list(x) for x in sequences] for sequence in sequences: random.shuffle(sequence) sequences = maskSequences(["".join(x) for x in sequences], masker) c = E.Counter() outs = IOTools.open_file(filename, "w") for masker in masker: if masker not in ("unmasked", "none", None): sequences = maskSequences(sequences, masker) for sequence, d in zip(sequences, data): c.input += 1 if len(sequence) == 0: c.empty += 1 continue start, end, id, contig = d id = "%s_%s %s:%i-%i" % (track, str(id), contig, start, end) outs.write(">%s\n%s\n" % (id, sequence)) c.output += 1 outs.close() E.info("%s" % c) return c.output
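# Standalone illustration of the shift="leftright" handling above: every interval
# gives rise to two flanking intervals of the same width, one to the left and one
# to the right of it. Coordinates are invented and the peakcenter column is
# dropped for brevity.
def leftright(intervals):
    out = []
    for contig, start, end, interval_id in intervals:
        width = end - start
        out.append((contig, start - width, start, "%s_left" % interval_id))
        out.append((contig, end, end + width, "%s_right" % interval_id))
    return out


print(leftright([("chr1", 1000, 1200, "iv1")]))
# [('chr1', 800, 1000, 'iv1_left'), ('chr1', 1200, 1400, 'iv1_right')]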
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser( version= "%prog version: $Id: vcfstats_sqlite.py 0001 2011-04-13 davids $", usage=globals()["__doc__"]) (options, args) = E.start(parser) options.filenames = args if len(options.filenames) < 1: options.stdout.write("# Error: no vcf-stats files specified/found.") sys.exit(1) E.info("Parsing %i file(s)" % len(options.filenames)) # set up output files vcf_file = IOTools.open_file('vcfstats.txt', 'w') indel_file = IOTools.open_file('indelstats.txt', 'w') snp_file = IOTools.open_file('snpstats.txt', 'w') shared_file = IOTools.open_file('sharedstats.txt', 'w') for fileno, filename in enumerate(options.filenames): prefix = os.path.basename(filename) trackname = prefix.replace(".vcfstats", "") if os.path.exists(filename): lines = [x for x in IOTools.open_file(filename, "r").readlines()] else: lines = [] if len(lines) == 0: options.stdout.write( "# Error: empty vcf-stats file found: $(filename)s") sys.exit(1) else: E.info("File %i contains %i lines" % (fileno, len(lines))) vcf_stats = dict(track=trackname) snp_stats = dict(track=trackname) indel_stats = dict() shared_stats = dict() all_vars = False indels = False snps = False shared = False for i, line in enumerate(lines): line = line.strip() if line.find("'all'") > -1: all_vars = True E.info("Found 'all'") continue if all_vars: if line.find("=>") > -1: fields = line.split("=>") key = fields[0].strip().replace("'", "").replace(">", "_") val = fields[1].strip().replace(",", "") else: key = "NA" val = "NA" if key == "indel" and val == "{": indels = True E.info("Found 'indels'") continue elif key == "snp" and val == "{": snps = True E.info("Found 'SNPs'") continue elif key == "shared" and val == "{": shared = True E.info("Found 'Shared'") continue if indels: if line.find("}") > -1: indels = False E.info("Processed 'indels'") continue else: indel_stats[key] = val elif snps: if line.find("}") > -1: snps = False E.info("Processed 'SNPs'") continue else: snp_stats[key] = val elif shared: if line.find("}") > -1: shared = False E.info("Processed 'Shared'") continue else: shared_stats[key] = val elif key != "NA": vcf_stats[key] = val # Ensure all keys are present allkeys = [ "nalt_1", "nalt_2", "nalt_3", "nalt_4", "nalt_5", "track", "count", "snp_count", "indel_count" ] for k in allkeys: if k in vcf_stats: continue else: vcf_stats[k] = "0" # Write header (for first file only) if filename == options.filenames[0]: # Ensure keys are sorted srt = list(vcf_stats.keys()) srt.sort() sep = "" for k in srt: vcf_file.write("%s%s" % (sep, k)) sep = "\t" vcf_file.write("\n") indel_file.write("track\tindel_length\tindel_count\n") shared_file.write("track\tno_samples\tvar_count\n") sep = "" for k in snp_stats.keys(): snp_file.write("%s%s" % (sep, k)) sep = "\t" snp_file.write("\n") # Write data sep = "" srt = list(vcf_stats.keys()) srt.sort() for k in srt: vcf_file.write("%s%s" % (sep, vcf_stats[k])) sep = "\t" vcf_file.write("\n") # Check all indel lengths are covered r = list(range(-20, 20, 1)) for i in r: if str(i) in indel_stats: continue else: indel_stats[i] = "0" for k in indel_stats.keys(): indel_file.write("%s\t%s\t%s\n" % (trackname, k, indel_stats[k])) for k in shared_stats.keys(): shared_file.write("%s\t%s\t%s\n" % (trackname, k, shared_stats[k])) sep = "" for k in snp_stats.keys(): snp_file.write("%s%s" % (sep, snp_stats[k])) sep = "\t" snp_file.write("\n") # close files vcf_file.close() 
indel_file.close() snp_file.close() shared_file.close() E.stop() sys.exit(0)
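# Minimal sketch of the "key => value" parsing used above, on an invented
# vcf-stats fragment; it shows only how one line is split into a key and a value
# before being routed to the snp/indel/shared dictionaries.
line = "\t'snp_count' => 1234,"
fields = line.split("=>")
key = fields[0].strip().replace("'", "").replace(">", "_")
value = fields[1].strip().replace(",", "")
print(key, value)     # snp_count 1234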
def loadMAST(infile, outfile): '''parse mast file and load into database. Parse several motif runs and add them to the same table. Add columns for the control data as well. ''' tablename = P.to_table(outfile) tmpfile = P.get_temp_file(".") tmpfile.write(MAST.Match().header + "\tmotif\tcontig" "\tl_evalue\tl_pvalue\tl_nmatches\tl_length\tl_start\tl_end" "\tr_evalue\tr_pvalue\tr_nmatches\tr_length\tr_start\tr_end" "\tmin_evalue\tmin_pvalue\tmax_nmatches" + "\n") lines = IOTools.open_file(infile).readlines() chunks = [x for x in range(len(lines)) if lines[x].startswith("::")] chunks.append(len(lines)) def readChunk(lines, chunk): # use real file, as MAST parser can not deal with a # list of lines tmpfile2 = P.get_temp_file(".") try: motif, part = re.match(":: motif = (\S+) - (\S+) ::", lines[chunks[chunk]]).groups() except AttributeError: raise ValueError("parsing error in line '%s'" % lines[chunks[chunk]]) E.info("reading %s - %s" % (motif, part)) tmpfile2.write("".join(lines[chunks[chunk] + 1:chunks[chunk + 1]])) tmpfile2.close() mast = MAST.parse(IOTools.open_file(tmpfile2.name, "r")) os.unlink(tmpfile2.name) return motif, part, mast def splitId(s, mode): '''split background match id has three parts: track _ id _ pos track might contain '_'. ''' d = match.id.split("_") if mode == "bg": return "_".join(d[:-2]), d[-2], d[-1] elif mode == "fg": return "_".join(d[:-1]), d[-1] for chunk in range(0, len(chunks) - 1, 2): motif_fg, part, mast_fg = readChunk(lines, chunk) assert part == "foreground" motif_bg, part, mast_bg = readChunk(lines, chunk + 1) assert part == "background" assert motif_fg == motif_bg # index control data controls = collections.defaultdict(dict) for match in mast_bg.matches: track, id, pos = splitId(match.id, "bg") controls[id][pos] = (match.evalue, match.pvalue, match.nmotifs, match.length, match.start, match.end) for match in mast_fg.matches: # remove track and pos track, match.id = splitId(match.id, "fg") # move to genomic coordinates contig, start, end = re.match("(\S+):(\d+)..(\d+)", match.description).groups() if match.nmotifs > 0: start, end = int(start), int(end) match.start += start match.end += start match.positions = [x + start for x in match.positions] id = match.id if id not in controls: P.warn("no controls for %s - increase MAST evalue" % id) if "l" not in controls[id]: controls[id]["l"] = (float(P.get_params()["mast_evalue"]), 1, 0, 0, 0, 0) if "r" not in controls[id]: controls[id]["r"] = (float(P.get_params()["mast_evalue"]), 1, 0, 0, 0, 0) min_evalue = min(controls[id]["l"][0], controls[id]["r"][0]) min_pvalue = min(controls[id]["l"][1], controls[id]["r"][1]) max_nmatches = max(controls[id]["l"][2], controls[id]["r"][2]) tmpfile.write( str(match) + "\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % ( motif_fg, contig, "\t".join(map(str, controls[id]["l"])), "\t".join(map(str, controls[id]["r"])), str(min_evalue), str(min_pvalue), str(max_nmatches), ) + "\n") tmpfile.close() P.load(tmpfile.name, outfile, options="--add-index=id " "--add-index=motif " "--add-index=id,motif " "--allow-empty-file " "--map=base_qualities:text") os.unlink(tmpfile.name)
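# Standalone sketch of splitId above: background match ids have the form
# "track_id_pos" and foreground ids "track_id", where the track itself may
# contain underscores, so the id is split from the right. Example ids are
# invented. (Note that the original splits match.id from the enclosing loop
# rather than its s argument, which only works because it is always called
# as splitId(match.id, ...).)
def split_id(s, mode):
    d = s.split("_")
    if mode == "bg":
        return "_".join(d[:-2]), d[-2], d[-1]
    elif mode == "fg":
        return "_".join(d[:-1]), d[-1]


print(split_id("liver_H3K4me3_12_l", "bg"))    # ('liver_H3K4me3', '12', 'l')
print(split_id("liver_H3K4me3_12", "fg"))      # ('liver_H3K4me3', '12')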
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option( "-r", "--mask-bed-file", dest="filename_rna", type="string", metavar='GFF', help="gff formatted file with masking locations. The number of " "reads overlapping the intervals in the given file will be " "computed. Note that the computation currently does not take " "into account indels, so it is an approximate count only. " "[%default]") parser.add_option( "-f", "--ignore-masked-reads", dest="remove_rna", action="store_true", help="as well as counting reads in the file given by --mask-bed-file, " "also remove these reads for duplicate and match statistics. " "[%default]") parser.add_option( "-i", "--num-reads", dest="input_reads", type="int", help="the number of reads - if given, used to provide percentages " "[%default]") parser.add_option( "-d", "--output-details", dest="output_details", action="store_true", help="output per-read details into a separate file. Read names are " "md5/base64 encoded [%default]") parser.add_option( "-q", "--fastq-file", dest="filename_fastq", help="filename with sequences and quality scores. This file is only " "used to collect sequence identifiers. Thus, for paired end data a " "single file is sufficient [%default]") parser.set_defaults( filename_rna=None, remove_rna=False, input_reads=0, force_output=False, filename_fastq=None, output_details=False, ) # add common options (-h/--help, ...) and parse command line (options, args) = E.start(parser, argv=argv, add_output_options=True) if options.filename_rna: rna = GTF.readAndIndex( GTF.iterator(IOTools.open_file(options.filename_rna))) else: rna = None if len(args) > 0: pysam_in = pysam.AlignmentFile(args[0], "rb") elif options.stdin == sys.stdin: pysam_in = pysam.AlignmentFile("-", "rb") else: pysam_in = pysam.AlignmentFile(options.stdin, "rb") if options.output_details: outfile_details = E.openOutputFile("details", "w") else: outfile_details = None if options.filename_fastq and not os.path.exists(options.filename_fastq): raise IOError("file %s does not exist" % options.filename_fastq) (counter, flags_counts, nh_filtered, nh_all, nm_filtered, nm_all, mapq, mapq_all, max_hi) = \ _bam2stats.count(pysam_in, options.remove_rna, rna, filename_fastq=options.filename_fastq, outfile_details=outfile_details) if max_hi > 0 and max_hi != max(nh_all.keys()): E.warn("max_hi(%i) is inconsistent with max_nh (%i) " "- counts will be corrected" % (max_hi, max(nh_all.keys()))) outs = options.stdout outs.write("category\tcounts\tpercent\tof\n") def _write(outs, text, numerator, denominator, base): percent = IOTools.prettyPercent(numerator, denominator) outs.write('%s\t%i\t%s\t%s\n' % (text, numerator, percent, base)) ############################### ############################### ############################### # Output alignment information ############################### nalignments_unmapped = flags_counts["unmapped"] nalignments_mapped = counter.alignments_input - nalignments_unmapped _write(outs, "alignments_total", counter.alignments_input, counter.alignments_input, "alignments_total") if counter.alignments_input == 0: E.warn("no alignments in BAM file - no further output") E.stop() return _write(outs, "alignments_mapped", nalignments_mapped, counter.alignments_input, 'alignments_total') _write(outs, "alignments_unmapped", nalignments_unmapped, 
counter.alignments_input, 'alignments_total') if nalignments_mapped == 0: E.warn("no mapped alignments - no further output") E.stop() return for flag, counts in sorted(flags_counts.items()): if flag == "unmapped": continue _write(outs, 'alignments_' + flag, counts, nalignments_mapped, 'alignments_mapped') if options.filename_rna: _write(outs, "alignments_rna", counter.alignments_rna, nalignments_mapped, 'alignments_mapped') _write(outs, "alignments_no_rna", counter.alignments_no_rna, nalignments_mapped, 'alignments_mapped') _write(outs, "alignments_filtered", counter.alignments_filtered, nalignments_mapped, "alignments_mapped") if counter.filtered == nalignments_mapped: normby = "alignments_mapped" else: normby = "alignments_filtered" if counter.filtered > 0: _write(outs, "alignments_duplicates", counter.alignments_duplicates, counter.alignments_filtered, normby) _write(outs, "alignments_unique", counter.alignments_filtered - counter.alignments_duplicates, counter.alignments_filtered, normby) ############################### ############################### ############################### # Output read based information ############################### # derive the number of mapped reads in file from alignment counts if options.filename_fastq: nreads_total = counter.total_read _write(outs, "reads_total", counter.total_read, nreads_total, 'reads_total') _write(outs, "reads_unmapped", counter.total_read_is_unmapped, nreads_total, 'reads_total') _write(outs, "reads_mapped", counter.total_read_is_mapped, nreads_total, 'reads_total') _write(outs, "reads_missing", counter.total_read_is_missing, nreads_total, 'reads_total') _write(outs, "reads_mapped_unique", counter.total_read_is_mapped_uniq, counter.total_read_is_mapped, 'reads_mapped') _write(outs, "reads_multimapping", counter.total_read_is_mmap, counter.total_read_is_mapped, 'reads_mapped') else: E.warn('inferring read counts from alignments and NH tags') nreads_unmapped = flags_counts["unmapped"] nreads_mapped = computeMappedReadsFromAlignments( nalignments_mapped, nh_all, max_hi) nreads_missing = 0 if options.input_reads: nreads_total = options.input_reads # unmapped reads in bam file?
if nreads_unmapped: nreads_missing = nreads_total - nreads_unmapped - nreads_mapped else: nreads_unmapped = nreads_total - nreads_mapped elif nreads_unmapped: # if unmapped reads are in bam file, take those nreads_total = nreads_mapped + nreads_unmapped else: # otherwise normalize by mapped reads nreads_unmapped = 0 nreads_total = nreads_mapped outs.write("reads_total\t%i\t%5.2f\treads_total\n" % (nreads_total, 100.0)) outs.write("reads_mapped\t%i\t%5.2f\treads_total\n" % (nreads_mapped, 100.0 * nreads_mapped / nreads_total)) outs.write("reads_unmapped\t%i\t%5.2f\treads_total\n" % (nreads_unmapped, 100.0 * nreads_unmapped / nreads_total)) outs.write("reads_missing\t%i\t%5.2f\treads_total\n" % (nreads_missing, 100.0 * nreads_missing / nreads_total)) if len(nh_all) > 1: outs.write("reads_unique\t%i\t%5.2f\treads_mapped\n" % (nh_all[1], 100.0 * nh_all[1] / nreads_mapped)) # compute after filtering # not that these are rough guesses if options.filename_rna: nreads_norna = computeMappedReadsFromAlignments( counter.filtered, nh_filtered, max_hi) _write(outs, "reads_norna", nreads_norna, nreads_mapped, "reads_mapped") if len(nh_filtered) > 1: _write(outs, "reads_norna_unique", nh_filtered[1], nreads_norna, "reads_mapped") pysam_in.close() ############################### ############################### ############################### # Output pair information ############################### if flags_counts["read2"] > 0: if options.filename_fastq: pairs_mapped = counter.total_pair_is_mapped # sanity check assert counter.total_pair_is_mapped == \ (counter.total_pair_is_proper_uniq + counter.total_pair_is_incomplete_uniq + counter.total_pair_is_incomplete_mmap + counter.total_pair_is_proper_duplicate + counter.total_pair_is_proper_mmap + counter.total_pair_not_proper_uniq + counter.total_pair_is_other) outs.write("pairs_total\t%i\t%5.2f\tpairs_total\n" % (counter.total_pairs, 100.0 * counter.total_pairs / counter.total_pairs)) outs.write( "pairs_mapped\t%i\t%5.2f\tpairs_total\n" % (pairs_mapped, 100.0 * pairs_mapped / counter.total_pairs)) outs.write("pairs_unmapped\t%i\t%5.2f\tpairs_total\n" % (counter.total_pair_is_unmapped, 100.0 * counter.total_pair_is_unmapped / counter.total_pairs)) outs.write( "pairs_proper_unique\t%i\t%5.2f\tpairs_total\n" % (counter.total_pair_is_proper_uniq, 100.0 * counter.total_pair_is_proper_uniq / counter.total_pairs)) outs.write( "pairs_incomplete_unique\t%i\t%5.2f\tpairs_total\n" % (counter.total_pair_is_incomplete_uniq, 100.0 * counter.total_pair_is_incomplete_uniq / counter.total_pairs)) outs.write( "pairs_incomplete_multimapping\t%i\t%5.2f\tpairs_total\n" % (counter.total_pair_is_incomplete_mmap, 100.0 * counter.total_pair_is_incomplete_mmap / counter.total_pairs)) outs.write( "pairs_proper_duplicate\t%i\t%5.2f\tpairs_total\n" % (counter.total_pair_is_proper_duplicate, 100.0 * counter.total_pair_is_proper_duplicate / counter.total_pairs)) outs.write( "pairs_proper_multimapping\t%i\t%5.2f\tpairs_total\n" % (counter.total_pair_is_proper_mmap, 100.0 * counter.total_pair_is_proper_mmap / counter.total_pairs)) outs.write( "pairs_not_proper_unique\t%i\t%5.2f\tpairs_total\n" % (counter.total_pair_not_proper_uniq, 100.0 * counter.total_pair_not_proper_uniq / counter.total_pairs)) outs.write("pairs_other\t%i\t%5.2f\tpairs_total\n" % (counter.total_pair_is_other, 100.0 * counter.total_pair_is_other / counter.total_pairs)) nread1_total = counter.total_read1 _write(outs, "read1_total", counter.total_read1, nread1_total, 'read1_total') _write(outs, "read1_unmapped", 
counter.total_read1_is_unmapped, nread1_total, 'read1_total') _write(outs, "read1_mapped", counter.total_read1_is_mapped, nread1_total, 'read1_total') _write(outs, "read1_mapped_unique", counter.total_read1_is_mapped_uniq, counter.total_read1_is_mapped, 'read1_mapped') _write(outs, "reads_multimapping", counter.total_read1_is_mmap, counter.total_read1_is_mapped, 'read1_mapped') _write(outs, "read1_missing", counter.total_read1_is_missing, counter.total_read1_is_mapped, 'read1_total') nread2_total = counter.total_read2 _write(outs, "read2_total", counter.total_read2, nread2_total, 'read2_total') _write(outs, "read2_unmapped", counter.total_read2_is_unmapped, nread2_total, 'read2_total') _write(outs, "read2_mapped", counter.total_read2_is_mapped, nread2_total, 'read2_total') _write(outs, "read2_mapped_unique", counter.total_read2_is_mapped_uniq, counter.total_read2_is_mapped, 'read2_mapped') _write(outs, "reads_multimapping", counter.total_read2_is_mmap, counter.total_read2_is_mapped, 'read2_mapped') _write(outs, "read2_missing", counter.total_read2_is_missing, counter.total_read2_is_mapped, 'read2_total') else: # approximate counts pairs_total = nreads_total // 2 pairs_mapped = flags_counts["proper_pair"] // 2 _write(outs, "pairs_total", pairs_total, pairs_total, "pairs_total") _write(outs, "pairs_mapped", pairs_mapped, pairs_total, "pairs_total") else: # no paired end data pairs_total = pairs_mapped = 0 outs.write("pairs_total\t%i\t%5.2f\tpairs_total\n" % (pairs_total, 0.0)) outs.write("pairs_mapped\t%i\t%5.2f\tpairs_total\n" % (pairs_mapped, 0.0)) if options.force_output or len(nm_filtered) > 0: outfile = E.openOutputFile("nm", "w") outfile.write("NM\talignments\n") if len(nm_filtered) > 0: for x in range(0, max(nm_filtered.keys()) + 1): outfile.write("%i\t%i\n" % (x, nm_filtered[x])) else: outfile.write("0\t%i\n" % (counter.filtered)) outfile.close() if options.force_output or len(nh_all) > 1: outfile = E.openOutputFile("nh_all", "w") outfile.write("NH\treads\n") if len(nh_all) > 0: writeNH(outfile, nh_all, max_hi) else: # assume all are unique if NH flag not set outfile.write("1\t%i\n" % (counter.mapped_reads)) outfile.close() if options.force_output or len(nh_filtered) > 1: outfile = E.openOutputFile("nh", "w") outfile.write("NH\treads\n") if len(nh_filtered) > 0: writeNH(outfile, nh_filtered, max_hi) else: # assume all are unique if NH flag not set outfile.write("1\t%i\n" % (counter.filtered)) outfile.close() if options.force_output or len(mapq_all) > 1: outfile = E.openOutputFile("mapq", "w") outfile.write("mapq\tall_reads\tfiltered_reads\n") for x in range(0, max(mapq_all.keys()) + 1): outfile.write("%i\t%i\t%i\n" % (x, mapq_all[x], mapq[x])) outfile.close() # write footer and output benchmark information. E.stop()
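# Hedged sketch of the read-counting idea behind computeMappedReadsFromAlignments
# used above (the real helper is not shown in this file, so this is only the
# underlying arithmetic): if a read with NH=n contributes n alignments, the number
# of mapped reads is the alignment count per NH value divided by that value,
# summed over all NH values. The counts below are invented.
import collections

nh_all = collections.Counter({1: 8000, 2: 1000, 4: 400})   # NH value -> number of alignments
nalignments_mapped = sum(nh_all.values())                   # 9400 alignments

nreads_mapped = sum(count // nh for nh, count in nh_all.items())
print(nalignments_mapped, nreads_mapped)                    # 9400 8600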
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-m", "--method", dest="method", type="choice", action="store", choices=("hierarchy", "set-field", "set-pattern", "set-none"), help="Method to use for conversion") parser.add_option( "-g", "--gene-type", dest="gene_type", type="string", help="feature type to get gene_id from if possible [%default]") parser.add_option( "-t", "--transcript-type", dest="transcript_type", type="string", help="feature type to get transcript_id from if possible [%default]") parser.add_option( "-d", "--no-discard", dest="discard", action="store_false", help= "Do not discard feature types specified by GENE_TYPE and TRANSCRIPT_TYPE" ) parser.add_option( "--gene-id", dest="gene_field_or_pattern", type="string", help="Either field or pattern for the gene_id [%default]") parser.add_option( "--transcript-id", dest="transcript_field_or_pattern", type="string", help="Either field or pattern for the transcript_id [%default]") parser.add_option( "--parent-field", dest="parent", type="string", help="field that specifies the parent relationship. Features with " "multiple parents are currently only parsed correctly if this " "is left as Parent.") parser.add_option( "--read-twice", dest="read_twice", action="store_true", help= "Instead of holding the whole file in memory, read once for parsing the " "hierarchy, and then again for actually doing the conversion. This means " "a real file and not a pipe must be provided.") parser.add_option( "--by-chrom", dest="by_chrom", action="store_true", help="Parse input file one chromosome at a time. Reduces memory usage, " "but input must be sorted by chromosome and features may not be split " "across multiple chromosomes.") parser.add_option( "--fail-missing-gene", dest="missing_gene", action="store_false", help="Fail if no feature of type GENE_TYPE is found instead of " "defaulting to the highest object in the hierarchy.") parser.set_defaults(method="hierarchy", gene_type="gene", transcript_type="mRNA", discard=True, gene_field_or_pattern="ID", transcript_field_or_pattern="ID", read_twice=False, by_chrom=False, missing_gene=True, parent="Parent") # add common options (-h/--help, ...)
and parse command line (options, args) = E.start(parser, argv=argv) gffs = GFF3.flat_file_iterator(options.stdin) if options.by_chrom: gffs = GFF3.chrom_iterator(gffs) else: gffs = [gffs] # running early so that fails early if configuration is wrong if options.read_twice: # Will throw IOError if options.stdin is not a normal file second_gff = GFF3.flat_file_iterator( IOTools.open_file(options.stdin.name)) if options.by_chrom: second_gff = GFF3.chrom_iterator(second_gff) else: second_gff = iter([second_gff]) else: second_gff = None for chunk in gffs: if options.read_twice: second_gff_chunk = next(second_gff) else: chunk = list(chunk) second_gff_chunk = chunk if options.method == "hierarchy": convert_hierarchy(chunk, second_gff_chunk, options) elif options.method == "set-field": gene_id_pattern = "%%(%s)s" % options.gene_field_or_pattern transcript_id_pattern = "%%(%s)s" % options.transcript_field_or_pattern convert_set(chunk, gene_id_pattern, transcript_id_pattern, options) elif options.method == "set-pattern": convert_set(chunk, options.gene_field_or_pattern, options.transcript_field_or_pattern, options) elif options.method == "set-none": convert_set(chunk, None, None, options) # write footer and output benchmark information. E.stop()
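# Standalone illustration of the --method=set-field handling above: the field
# name is wrapped into an old-style "%(field)s" pattern, which convert_set later
# fills from each record's attribute dictionary. The attribute values are
# invented.
field = "ID"
gene_id_pattern = "%%(%s)s" % field          # -> "%(ID)s"
attributes = {"ID": "gene0001", "Name": "abc"}
print(gene_id_pattern % attributes)          # gene0001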
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-d", "--design-tsv-file", dest="input_filename_design", type="string", help="input file with experimental design " "[default=%default].") parser.add_option("-m", "--method", dest="method", type="choice", choices=("filter", "spike", "normalize"), help="differential expression method to apply " "[default=%default].") parser.add_option("--filter-min-counts-per-row", dest="filter_min_counts_per_row", type="int", help="remove rows with less than this " "number of counts in total [default=%default].") parser.add_option("--filter-min-counts-per-sample", dest="filter_min_counts_per_sample", type="int", help="remove samples with a maximum count per sample of " "less than this numer [default=%default].") parser.add_option("--filter-percentile-rowsums", dest="filter_percentile_rowsums", type="int", help="remove percent of rows with " "lowest total counts [default=%default].") parser.add_option("--spike-change-bin-min", dest="min_cbin", type="float", help="minimum bin for change bins [default=%default].") parser.add_option("--spike-change-bin-max", dest="max_cbin", type="float", help="maximum bin for change bins [default=%default].") parser.add_option("--spike-change-bin-width", dest="width_cbin", type="float", help="bin width for change bins [default=%default].") parser.add_option("--spike-initial-bin-min", dest="min_ibin", type="float", help="minimum bin for initial bins[default=%default].") parser.add_option("--spike-initial-bin-max", dest="max_ibin", type="float", help="maximum bin for intitial bins[default=%default].") parser.add_option("--spike-initial-bin-width", dest="width_ibin", type="float", help="bin width intitial bins[default=%default].") parser.add_option( "--spike-minimum", dest="min_spike", type="int", help="minimum number of spike-ins required within each bin\ [default=%default].") parser.add_option( "--spike-maximum", dest="max_spike", type="int", help="maximum number of spike-ins allowed within each bin\ [default=%default].") parser.add_option("--spike-difference-method", dest="difference", type="choice", choices=("relative", "logfold", "abs_logfold"), help="method to use for calculating difference\ [default=%default].") parser.add_option("--spike-iterations", dest="iterations", type="int", help="number of iterations to generate spike-ins\ [default=%default].") parser.add_option("--spike-cluster-maximum-distance", dest="cluster_max_distance", type="int", help="maximum distance between adjacent loci in cluster\ [default=%default].") parser.add_option("--spike-cluster-minimum-size", dest="cluster_min_size", type="int", help="minimum number of loci required per cluster\ [default=%default].") parser.add_option("--spike-type", dest="spike_type", type="choice", choices=("row", "cluster"), help="spike in type [default=%default].") parser.add_option("--spike-subcluster-min-size", dest="min_sbin", type="int", help="minimum size of subcluster\ [default=%default].") parser.add_option("--spike-subcluster-max-size", dest="max_sbin", type="int", help="maximum size of subcluster\ [default=%default].") parser.add_option("--spike-subcluster-bin-width", dest="width_sbin", type="int", help="bin width for subcluster size\ [default=%default].") parser.add_option("--spike-output-method", dest="output_method", type="choice", choices=("append", 
"seperate"), help="defines whether the spike-ins should be appended\ to the original table or seperately [default=%default].") parser.add_option("--spike-shuffle-column-suffix", dest="shuffle_suffix", type="string", help="the suffix of the columns which are to be shuffled\ [default=%default].") parser.add_option("--spike-keep-column-suffix", dest="keep_suffix", type="string", help="a list of suffixes for the columns which are to be\ keep along with the shuffled columns[default=%default].") parser.add_option("--normalization-method", dest="normalization_method", type="choice", choices=("deseq-size-factors", "total-count", "total-column", "total-row"), help="normalization method to apply [%default]") parser.add_option("-t", "--tags-tsv-file", dest="input_filename_tags", type="string", help="input file with tag counts [default=%default].") parser.set_defaults(input_filename_tags="-", method="filter", filter_min_counts_per_row=None, filter_min_counts_per_sample=None, filter_percentile_rowsums=None, output_method="seperate", difference="logfold", spike_type="row", min_cbin=0, max_cbin=100, width_cbin=100, min_ibin=0, max_ibin=100, width_ibin=100, max_spike=100, min_spike=None, iterations=1, cluster_max_distance=100, cluster_min_size=10, min_sbin=1, max_sbin=1, width_sbin=1, shuffle_suffix=None, keep_suffix=None, normalization_method="deseq-size-factors") # add common options (-h/--help, ...) and parse command line (options, args) = E.start(parser, argv=argv, add_output_options=True) # load if options.keep_suffix: # if using suffix, loadTagDataPandas will throw an error as it # looks for column names which exactly match the design # "tracks" need to write function in Counts.py to handle # counts table and design table + suffix counts = pd.read_csv(options.stdin, sep="\t", comment="#") inf = IOTools.open_file(options.input_filename_design) design = pd.read_csv(inf, sep="\t", index_col=0) inf.close() design = design[design["include"] != 0] if options.method in ("filter", "spike"): if options.input_filename_design is None: raise ValueError("method '%s' requires a design file" % options.method) else: # create Counts object # TS if spike type is cluster, need to keep "contig" and "position" # columns out of index if options.spike_type == "cluster": index = None, else: index = 0 if options.input_filename_tags == "-": counts = Counts.Counts( pd.io.parsers.read_csv(options.stdin, sep="\t", index_col=index, comment="#")) else: counts = Counts.Counts(IOTools.open_file( options.input_filename_tags, "r"), sep="\t", index_col=index, comment="#") # TS normalization doesn't require a design table if not options.method == "normalize": assert options.input_filename_design and os.path.exists( options.input_filename_design) # create Design object design = Expression.ExperimentalDesign( pd.read_csv(IOTools.open_file(options.input_filename_design, "r"), sep="\t", index_col=0, comment="#")) if options.method == "filter": assert (options.filter_min_counts_per_sample is not None or options.filter_min_counts_per_row is not None or options.filter_percentile_rowsums is not None), \ "no filtering parameters have been suplied" # filter # remove sample with low counts if options.filter_min_counts_per_sample: counts.removeSamples( min_counts_per_sample=options.filter_min_counts_per_sample) # remove observations with low counts if options.filter_min_counts_per_row: counts.removeObservationsFreq( min_counts_per_row=options.filter_min_counts_per_row) # remove bottom percentile of observations if 
options.filter_percentile_rowsums: counts.removeObservationsPerc( percentile_rowsums=options.filter_percentile_rowsums) nobservations, nsamples = counts.table.shape if nobservations == 0: E.warn("no observations remaining after filtering- no output") return if nsamples == 0: E.warn("no samples remain after filtering - no output") return # write out counts.table.to_csv(options.stdout, sep="\t", header=True) elif options.method == "normalize": counts.normalise(method=options.normalization_method, row_title="total") # write out counts.table.to_csv(options.stdout, sep="\t", header=True) elif options.method == "spike": # check parameters are sensible and set parameters where they # are not explicitly set if not options.min_spike: E.info("setting minimum number of spikes per bin to equal" "maximum number of spikes per bin (%s)" % options.max_spike) options.min_spike = options.max_spike if options.spike_type == "cluster": assert options.max_sbin <= options.cluster_min_size, \ ("max size of subscluster: %s is greater than min size of" "cluster: %s" % (options.max_sbin, options.cluster_min_size)) counts_columns = set(counts.table.columns.values.tolist()) assert ("contig" in counts_columns and "position" in counts_columns), \ ("cluster analysis requires columns named 'contig' and" "'position' in the dataframe") counts.sort(sort_columns=["contig", "position"], reset_index=True) # restrict design table to first pair only design.firstPairOnly() # get dictionaries to map group members to column names # use different methods depending on whether suffixes are supplied if options.keep_suffix: g_to_keep_tracks, g_to_spike_tracks = design.mapGroupsSuffix( options.shuffle_suffix, options.keep_suffix) else: # if no suffixes supplied, spike and keep tracks are the same g_to_track = design.getGroups2Samples() g_to_spike_tracks, g_to_keep_tracks = (g_to_track, g_to_track) # set up numpy arrays for change and initial values change_bins = np.arange(options.min_cbin, options.max_cbin, options.width_cbin) initial_bins = np.arange(options.min_ibin, options.max_ibin, options.width_ibin) E.info("Column boundaries are: %s" % str(change_bins)) E.info("Row boundaries are: %s" % str(initial_bins)) # shuffle rows/clusters if options.spike_type == "cluster": E.info("looking for clusters...") clusters_dict = Counts.findClusters(counts_sort, options.cluster_max_distance, options.cluster_min_size, g_to_spike_tracks, groups) if len(clusters_dict) == 0: raise Exception("no clusters were found, check parameters") E.info("shuffling subcluster regions...") output_indices, counts = Counts.shuffleCluster( initial_bins, change_bins, g_to_spike_tracks, groups, options.difference, options.max_spike, options.iterations, clusters_dict, options.max_sbin, options.min_sbin, options.width_sbin) elif options.spike_type == "row": E.info("shuffling rows...") output_indices, bin_counts = counts.shuffleRows( options.min_cbin, options.max_cbin, options.width_cbin, options.min_ibin, options.max_ibin, options.width_ibin, g_to_spike_tracks, design.groups, options.difference, options.max_spike, options.iterations) filled_bins = Counts.thresholdBins(output_indices, bin_counts, options.min_spike) assert len(filled_bins) > 0, "No bins contained enough spike-ins" # write out counts.outputSpikes(filled_bins, g_to_keep_tracks, design.groups, output_method=options.output_method, spike_type=options.spike_type, min_cbin=options.min_cbin, width_cbin=options.width_cbin, max_cbin=options.max_cbin, min_ibin=options.min_ibin, width_ibin=options.width_ibin, 
max_ibin=options.max_ibin, min_sbin=options.min_sbin, width_sbin=options.width_sbin, max_sbin=options.max_sbin) E.stop()
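# Illustrative sketch (not part of the original script): the "spike" branch
# above bins candidate rows by their initial value and by the between-group
# change, and Counts.thresholdBins then keeps only bins that collected at
# least --spike-minimum candidates. The helper below is hypothetical and uses
# plain numpy to show that binning-and-thresholding idea; it does not call
# any of the CGAT Counts/Expression methods.
def _example_spike_binning(initial, change, min_spike=5,
                           cbin=(0, 100, 10), ibin=(0, 100, 10)):
    import collections

    import numpy as np

    # bin boundaries, analogous to the np.arange calls in main()
    change_bins = np.arange(*cbin)
    initial_bins = np.arange(*ibin)

    # assign every row to a (change bin, initial bin) pair
    c_idx = np.digitize(change, change_bins)
    i_idx = np.digitize(initial, initial_bins)
    per_bin = collections.Counter(zip(c_idx, i_idx))

    # keep only bins with enough candidate spike-ins, mirroring the
    # thresholdBins step
    return [b for b, n in per_bin.items() if n >= min_spike]


# for example:
#   _example_spike_binning(initial=[1, 5, 50, 55], change=[2, 3, 40, 45],
#                          min_spike=2)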
def test_cmdline():
    '''test style of scripts
    '''

    # start script in order to build the command line parser
    global ORIGINAL_START
    if ORIGINAL_START is None:
        ORIGINAL_START = E.start

    # read the first two columns
    map_option2action = IOTools.read_map(
        IOTools.open_file(FILENAME_OPTIONLIST),
        columns=(0, 1),
        has_header=True)

    files = []
    for label, expression in EXPRESSIONS:
        f = glob.glob(expression)
        files.extend(sorted(f))

    files = filter_files(files)

    # make sure to use the current working directory as
    # primary lookup.
    sys.path.insert(0, ".")

    # files = [
    #     'scripts/check_db.py',
    #     'scripts/cgat_build_report_page.py']

    for f in files:
        if os.path.isdir(f):
            continue
        if os.path.basename(f) in EXCLUDE:
            continue

        script_name = os.path.abspath(f)
        pyxfile = (os.path.join(os.path.dirname(f), "_") +
                   os.path.basename(f) + "x")

        fail_.description = script_name

        # check if script contains getopt
        with IOTools.open_file(script_name) as inf:
            if "getopt" in inf.read():
                yield (fail_,
                       "script uses getopt directly: %s" % script_name)
                continue

        module, modulename = load_script(script_name)
        if module is None:
            yield (fail_,
                   "module could not be imported: %s\n" % script_name)
            continue

        E.start = LocalStart

        try:
            module.main(argv=["dummy", "--help"])
        except AttributeError:
            yield (fail_,
                   "no main method in %s\n" % script_name)
            ok_(False, "no main method in %s" % script_name)
        except SystemExit:
            yield (fail_,
                   "script does not use E.start() %s\n" % script_name)
        except DummyError:
            pass

        for option in PARSER.option_list:
            # ignore options added by optparse
            if option.dest is None:
                continue

            optstring = option.get_opt_string()
            if optstring.startswith("--"):
                optstring = optstring[2:]

            check_option.description = script_name + ":" + optstring

            yield (check_option, optstring, os.path.abspath(f),
                   map_option2action)

        # clear up
        del sys.modules[modulename]

        # scripts with pyximport need special handling.
        #
        # Multiple imports of pyximport seem to create some confusion -
        # here, clear up sys.meta_path after each script
        if os.path.exists(pyxfile):
            sys.meta_path = []
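# The generator above depends on module-level helpers defined elsewhere in
# this test module (ORIGINAL_START, LocalStart, DummyError, PARSER,
# FILENAME_OPTIONLIST, EXPRESSIONS, EXCLUDE, fail_, ok_, check_option,
# load_script, filter_files). A minimal sketch of the monkey-patching pattern
# it relies on is shown below, using hypothetical underscore-prefixed names;
# the project's real definitions may differ. The idea: E.start is swapped for
# a stub that captures the option parser a script builds and then aborts the
# script before it does any real work, so calling main() with "--help" never
# runs the pipeline itself.

_example_parser = None


class _ExampleDummyError(Exception):
    """raised by the E.start stub to stop a script once its parser is built."""


def _example_local_start(parser, *args, **kwargs):
    """stand-in for E.start: record the parser, then bail out."""
    global _example_parser
    _example_parser = parser
    raise _ExampleDummyError()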
def buildIndex(self, filename):
    # build an in-memory interval index from a bed file
    return Bed.readAndIndex(IOTools.open_file(filename, "r"))
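# Usage sketch (hypothetical): Bed.readAndIndex is assumed here to return a
# per-contig index of intervals supporting overlap queries, in line with how
# such indices are used elsewhere in the code base. The exact return type and
# query method are assumptions, so the lookup below is illustrative only.
#
#   index = self.buildIndex("intervals.bed.gz")
#   if "chr1" in index:
#       overlapping = list(index["chr1"].find(10000, 20000))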