def printHistogram(values, section, options, min_value=0, increment=1.0):
    """Write a histogram of *values* to the output file of *section*.

    The output file name is ``options.output_filename_pattern % section``.
    The file gets a ``bin<TAB>section`` header followed by one
    ``bin<TAB>count`` line per histogram entry.

    Arguments
    ---------
    values : sequence of numbers to bin.
    section : name inserted into the file name pattern and the header.
    options : object providing ``loglevel``, ``stdlog`` and
        ``output_filename_pattern``.
    min_value : passed to ``Histogram.Calculate`` as ``min_value``.
    increment : passed to ``Histogram.Calculate`` as ``increment``.

    If *values* is empty no file is written; a note goes to
    ``options.stdlog`` when ``options.loglevel >= 1``.
    """
    if len(values) == 0:
        if options.loglevel >= 1:
            options.stdlog.write(
                "# no histogram data for section %s\n" % (section))
        return

    outfile = IOTools.openFile(options.output_filename_pattern % section, "w")
    try:
        # honour the caller's binning parameters (defaults: 0 and 1.0)
        h = Histogram.Calculate(values,
                                no_empty_bins=True,
                                min_value=min_value,
                                increment=increment)

        outfile.write("bin\t%s\n" % section)
        for bin_start, count in h:
            outfile.write("%5.2f\t%i\n" % (bin_start, count))
    finally:
        # close the file even if binning or writing fails
        outfile.close()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads a tab-separated table of duplications from stdin (lines
    starting with ``#`` are skipped, the first remaining line is dropped
    as a header) and summarises it per location/function according to
    ``--method``:

    counts, medians
        species x species matrix of the number / median of heights.
    lists, lists-union
        genes (tree taxa) per species pair / per first species.
    hists, fit-decay
        histograms of heights per species, or fits of the histograms.
    pairs
        one line per input row, grouped by species pair.
    links
        one tree per cluster for each species pair.

    If ``--filename-output`` is given, it is used as a ``%``-pattern with
    the keys ``f`` (function), ``l`` (location), ``s`` (species1) and
    ``o`` (species2), depending on the method; otherwise output goes to
    stdout.

    NOTE(review): *argv* is accepted but not forwarded to ``E.Start``.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: optic/analyze_duplications.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option("-s", "--species", dest="species", type="string",
                      help="species to use.")

    parser.add_option("-p", "--prefix", dest="prefix", type="string",
                      help="prefix to use for temporary files.")

    parser.add_option("-m", "--method", dest="method", type="string",
                      help="method to use [counts|medians|lists|lists-union|"
                      "hists|fit-decay|pairs|links].")

    parser.add_option("-o", "--filename-output", dest="filename_output",
                      type="string", help="output filename.")

    parser.add_option("-f", "--functions", dest="functions", type="string",
                      help="functions to grep [functional|pseudo|all].")

    parser.add_option("-l", "--locations", dest="locations", type="string",
                      help="locations to grep [local|nojunk|all|...].")

    parser.add_option("-b", "--bin-size", dest="bin_size", type="string",
                      help="bin size.")

    parser.add_option("-i", "--fit", dest="fit", type="string",
                      help="fitting method [decay|power]")

    parser.add_option("--min-value", dest="min_value", type="float",
                      help="minimum value for histogram.")

    parser.add_option("--max-value", dest="max_value", type="float",
                      help="maximum value for histogram.")

    parser.add_option("--use-relative-height", dest="use_relative_height",
                      action="store_true",
                      help="use relative height values.")

    parser.add_option("--reverse", dest="reverse", action="store_true",
                      help="reverse species. Histograms will show the age "
                      "of duplications for duplicates in other genomes.")

    parser.set_defaults(species="",
                        functions="functional,pseudo,all",
                        locations="local,nojunk,all",
                        filename_output=None,
                        bin_size=1.0,
                        min_value=None,
                        max_value=None,
                        nonnull=None,
                        use_relative_height=False,
                        header=True,
                        fit=None,
                        reverse=False,
                        method="counts")

    (options, args) = E.Start(parser, add_psql_options=True)

    options.species = options.species.split(",")
    options.locations = options.locations.split(",")
    options.functions = options.functions.split(",")

    # NOTE(review): "".split(",") yields [""], so this check cannot fire
    # for an empty --species. It is left as is because the "lists" and
    # "pairs" methods do not look up species positions.
    if len(options.species) == 0:
        raise ValueError("please supply list of species.")

    # connection is opened as in the original script; it is not used below
    dbhandle = pgdb.connect(options.psql_connection)

    # a real list is needed: the header row is deleted and the data is
    # handed to GetSubset() several times
    input_data = [line.rstrip("\n").split("\t")
                  for line in sys.stdin.readlines()
                  if line[0] != "#"]

    # remove header
    if options.header and input_data:
        del input_data[0]

    # decide which columns to take
    # 1st column: species1: this is the species in which duplications have occured.
    # 2nd column: species2: this is the species with respect to which duplications occured.
    # 3rd column: clusterid
    # 4th column: chromosomes
    # 5th column: function
    # 6th column: height
    # 7th column: relative height
    # 8th column: locations
    # 9th column: tree
    if options.use_relative_height:
        take = (0, 1, 2, 3, 4, 6, 7, 8)
    else:
        take = (0, 1, 2, 3, 4, 5, 7, 8)

    input_data = [tuple([row[y] for y in take]) for row in input_data]

    nspecies = len(options.species)
    map_pos2species = list(options.species)
    map_species2pos = dict(
        (species, pos) for pos, species in enumerate(options.species))

    outfile = None

    if options.method in ("counts", "medians"):

        if options.method == "counts":
            func = len
            fmt = "%i"
        else:
            func = numpy.median
            fmt = "%6.4f"

        for location in options.locations:
            for function in options.functions:
                matrix = numpy.zeros((nspecies, nspecies), float)

                data = GetSubset(input_data, location, function)

                # sort by species1 and species2
                data.sort()

                last_species1, last_species2 = None, None
                values = []
                for species1, species2, cluster_id, l, f, height, locations, tree in data:
                    if last_species1 != species1 or last_species2 != species2:
                        # species pair changed: store the finished group
                        if len(values) > 0:
                            matrix[map_species2pos[last_species1],
                                   map_species2pos[last_species2]] = func(values)
                        values = []
                        last_species1 = species1
                        last_species2 = species2

                    values.append(float(height))

                if len(values) > 0:
                    matrix[map_species2pos[last_species1],
                           map_species2pos[last_species2]] = func(values)

                if options.filename_output:
                    params = {"f": function, "l": location}
                    outfile = open(options.filename_output % params, "w")
                else:
                    outfile = sys.stdout
                    outfile.write(
                        "matrix for method %s: location: %s, function: %s\n" %
                        (options.method, location, function))

                MatlabTools.WriteMatrix(matrix,
                                        outfile=outfile,
                                        format=fmt,
                                        row_headers=options.species,
                                        col_headers=options.species)

                if options.filename_output:
                    outfile.close()

    elif options.method in ("lists", "lists-union"):
        # write lists of duplicated genes in species1 as compared to species2
        # according to location/function
        # First field : gene name
        # Second field: cluster id
        # Third field : number of other genes in cluster
        # Fourth field: location of gene

        written = set()
        for location in options.locations:
            for function in options.functions:
                data = GetSubset(input_data, location, function)

                # sort by species1 and species2
                data.sort()

                last_species1, last_species2 = None, None

                for species1, species2, cluster_id, l, f, height, locations, tree in data:

                    if last_species1 != species1 or last_species2 != species2:

                        # write trees per cluster
                        if options.filename_output:
                            if options.method == "lists":
                                if outfile:
                                    outfile.close()
                                params = {"f": function,
                                          "l": location,
                                          "s": species1,
                                          "o": species2}
                                written = set()
                                outfile = open(
                                    options.filename_output % params, "w")
                            elif options.method == "lists-union":
                                # one file per species1, across all species2
                                if last_species1 != species1:
                                    if outfile:
                                        outfile.close()
                                    params = {"f": function,
                                              "l": location,
                                              "s": species1}
                                    written = set()
                                    outfile = open(
                                        options.filename_output % params, "w")
                        else:
                            outfile = sys.stdout
                            if options.method == "lists":
                                outfile.write(
                                    "location: %s, function: %s, species1: %s, species2: %s\n" %
                                    (location, function, species1, species2))
                                written = set()
                            elif options.method == "lists-union":
                                if last_species1 != species1:
                                    outfile.write(
                                        "location: %s, function: %s, species1: %s\n" %
                                        (location, function, species1))
                                    written = set()

                        last_species1 = species1
                        last_species2 = species2

                    # get tree
                    tt = TreeTools.Newick2Tree(tree)
                    taxa = TreeTools.GetTaxa(tt)
                    for t in taxa:
                        # each gene is written once per output section
                        if t in written:
                            continue
                        outfile.write("%s\t%s\t%i\n" %
                                      (t, cluster_id, len(taxa)))
                        written.add(t)

        # the file opened last is not closed inside the loop
        if options.filename_output and outfile:
            outfile.close()

    elif options.method in ("hists", "fit-decay"):

        for location in options.locations:
            for function in options.functions:

                # values[x][y] contains heights of duplications in species x
                # with reference to y
                values = [[[] for y in range(nspecies)]
                          for x in range(nspecies)]

                data = GetSubset(input_data, location, function)
                data.sort()

                for species1, species2, cluster_id, l, f, height, locations, tree in data:
                    # rows for species not given via --species are skipped
                    try:
                        pos1 = map_species2pos[species1]
                        pos2 = map_species2pos[species2]
                    except KeyError:
                        continue
                    values[pos1][pos2].append(float(height))

                # calculate histograms per species
                for s in options.species:
                    histograms = []
                    headers = []

                    if options.filename_output:
                        params = {"f": function, "l": location, "s": s}
                        outfile = open(options.filename_output % params, "w")
                    else:
                        outfile = sys.stdout
                        outfile.write("location: %s, function: %s\n" %
                                      (location, function))

                    for x in range(nspecies):

                        if options.reverse:
                            # duplications in species x
                            vv = values[x][map_species2pos[s]]
                        else:
                            # duplications in species s
                            vv = values[map_species2pos[s]][x]

                        if len(vv) == 0:
                            continue

                        headers.append(options.species[x])
                        h = Histogram.Calculate(vv,
                                                increment=options.bin_size,
                                                min_value=options.min_value,
                                                max_value=options.max_value,
                                                no_empty_bins=True)

                        if options.method == "fit-decay":
                            result = fit(h, [2.0, -1.0])
                            if result:
                                outfile.write(
                                    "%s\t%s\t%s\t%i\t%f\t%f\ty = %f * exp ( %f * x )\n" % (
                                        "function",
                                        s,
                                        options.species[x],
                                        h[0][1],
                                        result[0],
                                        result[1],
                                        result[0],
                                        result[1],
                                    ))
                        elif options.method == "hists":
                            histograms.append(h)

                    if options.method == "hists":
                        combined_histogram = Histogram.Combine(
                            histograms, missing_value="-")

                        outfile.write("bin\t" + "\t".join(headers) + "\n")
                        Histogram.Write(outfile, combined_histogram)

                    if options.filename_output:
                        outfile.close()
                    else:
                        outfile.flush()

    elif options.method == "pairs":

        # get branches with 0 branchlength
        for location in options.locations:
            if options.loglevel >= 2:
                options.stdlog.write("# processing location %s\n" % location)

            for function in options.functions:

                if options.loglevel >= 2:
                    options.stdlog.write("# processing function %s " % function)
                    options.stdlog.flush()

                data = GetSubset(input_data, location, function)

                if options.loglevel >= 2:
                    options.stdlog.write("with %i data points\n" % len(data))
                    options.stdlog.flush()

                data.sort()
                last_species1, last_species2, last_cluster_id = None, None, None

                for species1, species2, cluster_id, l, f, height, locations, tree in data:

                    if last_species1 != species1 or last_species2 != species2:

                        # write trees per cluster
                        if options.filename_output:
                            if outfile:
                                outfile.close()
                            params = {"f": function,
                                      "l": location,
                                      "s": species1,
                                      "o": species2}
                            outfile = open(
                                options.filename_output % params, "w")
                        else:
                            outfile = sys.stdout
                            outfile.write(
                                "location: %s, function: %s, species1: %s, species2: %s\n" %
                                (location, function, species1, species2))

                        last_species1 = species1
                        last_species2 = species2
                        last_cluster_id = None

                    if last_cluster_id != cluster_id:
                        # placeholder in the original: nothing happens on a
                        # cluster change apart from tracking it
                        last_cluster_id = cluster_id

                    outfile.write("%s\t%s\t%s\t%s\n" %
                                  (cluster_id, height, locations, tree))

        # the file opened last is not closed inside the loop
        if options.filename_output and outfile:
            outfile.close()

    elif options.method == "links":

        # write a tree for each species pair:
        # each node is a gene+location, the weight of the vertex is the height
        # further info added: cluster_id for the duplication
        for location in options.locations:
            if options.loglevel >= 2:
                options.stdlog.write("# processing location %s\n" % location)

            for function in options.functions:

                if options.loglevel >= 2:
                    options.stdlog.write("# processing function %s " % function)
                    options.stdlog.flush()

                data = GetSubset(input_data, location, function)

                if options.loglevel >= 2:
                    options.stdlog.write("with %i data points\n" % len(data))
                    options.stdlog.flush()

                # stores duplications within first species as compared to
                # second species
                values = [[[] for y in range(nspecies)]
                          for x in range(nspecies)]

                for species1, species2, cluster_id, l, f, height, locations, tree in data:
                    # -len(locations) makes longer location strings sort first
                    # within a cluster
                    values[map_species2pos[species1]][
                        map_species2pos[species2]].append(
                            (cluster_id, -len(locations), locations, tree))

                # get links per species
                for s in options.species:
                    if options.loglevel >= 2:
                        options.stdlog.write("# processing species %s\n" % s)

                    for x in range(nspecies):
                        if map_pos2species[x] == s:
                            continue

                        vv = values[map_species2pos[s]][x]
                        vv.sort()

                        # write trees per cluster
                        if options.filename_output:
                            params = {"f": function,
                                      "l": location,
                                      "s": s,
                                      "o": map_pos2species[x]}
                            outfile = open(
                                options.filename_output % params, "w")
                        else:
                            outfile = sys.stdout
                            outfile.write(
                                "location: %s, function: %s, species1: %s, species2: %s\n" %
                                (location, function, s, map_pos2species[x]))

                        # only print out largest tree
                        last_cluster_id = None
                        for cluster_id, n, locations, tree in vv:
                            if cluster_id != last_cluster_id:
                                outfile.write("%s\t%s\t%s\n" %
                                              (cluster_id, locations, tree))
                                last_cluster_id = cluster_id

                        if options.filename_output:
                            outfile.close()

    E.Stop()
def main(argv=None):
    """Compute histograms for columns of a tab-separated table on stdin.

    Two modes exist:

    * ``--on-the-fly``: bins are fixed up front from min-value, max-value
      and bin-size; the counting is delegated to
      ``Histogram.fillHistograms`` and one line per bin is written.
    * default (in-situ): all values are read into memory, a histogram is
      computed per column with ``Histogram.Calculate`` and the histograms
      are combined for output.

    Lines starting with ``#`` are skipped. With titles enabled (the
    default) the first data line supplies the column titles.

    Raises
    ------
    ValueError
        if on-the-fly computation is requested without min-value,
        max-value and bin-size, or without columns when no title row is
        available to derive them from.
    IndexError
        if a requested column is missing from the title row (in-situ).
    """

    if not argv:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id: data2histogram.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-r", "--range", dest="range", type="string",
                      help="range to calculate histogram for.")
    parser.add_option("-b", "--bin-size", dest="bin_size", type="string",
                      help="bin size.")
    parser.add_option("-i", "--titles", dest="titles", action="store_true",
                      help="use supplied column titles.")
    parser.add_option("--no-null", dest="nonull", action="store_true",
                      help="do not output null values")
    parser.add_option("--no-titles", dest="titles", action="store_false",
                      help="no column titles given.")
    parser.add_option("-c", "--columns", dest="columns", type="string",
                      help="columns to take for calculating histograms.")
    parser.add_option(
        "--min-data", dest="min_data", type="int",
        help="minimum amount of data required, if less data, then the histogram will be empty [default=%default].")
    parser.add_option("--min-value", dest="min_value", type="float",
                      help="minimum value for histogram.")
    parser.add_option("--max-value", dest="max_value", type="float",
                      help="maximum value for histogram.")
    parser.add_option("--no-empty-bins", dest="no_empty_bins",
                      action="store_true",
                      help="do not display empty bins.")
    parser.add_option("--with-empty-bins", dest="no_empty_bins",
                      action="store_false",
                      help="display empty bins.")
    parser.add_option("--normalize", dest="normalize", action="store_true",
                      help="normalize histogram.")
    parser.add_option("--cumulative", dest="cumulative", action="store_true",
                      help="calculate cumulative histogram.")
    parser.add_option("--reverse-cumulative", dest="reverse_cumulative",
                      action="store_true",
                      help="calculate reverse cumulative histogram.")
    parser.add_option("--headers", dest="headers", type="string",
                      help="use the following headers.")
    parser.add_option(
        "--ignore-out-of-range", dest="ignore_out_of_range",
        action="store_true",
        help="ignore values that are out of range (as opposed to truncating them to range border.")
    parser.add_option("--missing", dest="missing_value", type="string",
                      help="entry for missing values [%default].")
    parser.add_option("--dynamic-bins", dest="dynamic_bins",
                      action="store_true",
                      help="each value constitutes its own bin.")
    parser.add_option(
        "--on-the-fly", dest="on_the_fly", action="store_true",
        help="on the fly computation of histograms. Requires setting of min-value, max-value and bin_size.")

    parser.set_defaults(
        bin_size=None,
        range=None,
        titles=True,
        columns="all",
        append=(),
        no_empty_bins=True,
        min_value=None,
        max_value=None,
        normalize=False,
        cumulative=False,
        reverse_cumulative=False,
        nonull=None,
        ignore_out_of_range=False,
        min_data=1,
        headers=None,
        missing_value="na",
        dynamic_bins=False,
        on_the_fly=False,
        bin_format="%.2f",
        value_format="%6.4f",
    )

    (options, args) = E.Start(parser)

    # columns are 1-based on the command line, 0-based internally
    if options.columns != "all":
        options.columns = [int(x) - 1 for x in options.columns.split(",")]

    if options.range:
        options.min_value, options.max_value = [
            float(x) for x in options.range.split(",")]

    if options.headers:
        options.headers = options.headers.split(",")

    if options.on_the_fly:
        # without a title row there is nothing to derive "all" columns from
        columns_unknown = options.columns == "all" and not options.titles
        if options.min_value is None or options.max_value is None or \
                options.bin_size is None or columns_unknown:
            raise ValueError(
                "please supply columns, min-value, max-value and bin-size for on-the-fly computation.")

        # try to glean titles from table:
        if options.titles:
            data = None
            # readline() is used (not file iteration) so that the rest of
            # stdin can be handed to fillHistograms() afterwards
            while True:
                line = sys.stdin.readline()
                if not line:
                    break
                if line[0] == "#":
                    continue
                data = line[:-1].split("\t")
                break

            if data is None:
                # no title row: same handling as the in-situ branch
                if options.loglevel >= 1:
                    options.stdlog.write("# no data\n")
                E.Stop()
                sys.exit(0)

            if options.columns == "all":
                options.titles = data
                options.columns = list(range(len(data)))
            else:
                options.titles = [data[x] for x in options.columns]

        bins = numpy.arange(options.min_value, options.max_value,
                            float(options.bin_size))
        hh = Histogram.fillHistograms(
            sys.stdin, options.columns, [bins for x in options.columns])
        n = len(hh)

        # one title per column: explicit headers win over titles from the
        # table, which win over generated 1-based names
        titles = ["bin"]
        for pos, column in enumerate(options.columns):
            if options.headers:
                titles.append(options.headers[pos])
            elif options.titles:
                titles.append(options.titles[pos])
            else:
                titles.append("col%i" % (column + 1))

        if len(titles) > 1:
            options.stdout.write("\t".join(titles) + "\n")

        for x in range(len(bins)):
            v = [options.bin_format % bins[x]]
            for c in range(n):
                v.append(options.value_format % hh[c][x])
            options.stdout.write("\t".join(v) + "\n")

    else:
        # in-situ computation of histograms
        # retrieve data
        first = True
        vals = []

        # parse data, convert to floats
        for l in options.stdin:

            if l[0] == "#":
                continue

            data = l[:-1].split("\t")

            if first:
                first = False
                ncols = len(data)
                if options.columns == "all":
                    options.columns = list(range(ncols))

                vals = [[] for x in options.columns]

                if options.titles:
                    try:
                        options.titles = [data[x] for x in options.columns]
                    except IndexError:
                        raise IndexError(
                            "not all columns %s found in data %s" %
                            (str(options.columns), str(data)))
                    # the title row carries no values
                    continue

            for x in range(len(options.columns)):
                try:
                    v = float(data[options.columns[x]])
                except IndexError:
                    print("# IndexError in line: %s" % l[:-1])
                    continue
                except ValueError:
                    # non-numeric fields are skipped
                    continue

                vals[x].append(v)

        hists = []
        titles = []

        if not vals:
            if options.loglevel >= 1:
                options.stdlog.write("# no data\n")
            E.Stop()
            sys.exit(0)

        for x in range(len(options.columns)):
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# column=%i, num_values=%i\n" %
                    (options.columns[x], len(vals[x])))

            if len(vals[x]) < options.min_data:
                continue

            h = Histogram.Calculate(
                vals[x],
                no_empty_bins=options.no_empty_bins,
                increment=options.bin_size,
                min_value=options.min_value,
                max_value=options.max_value,
                dynamic_bins=options.dynamic_bins,
                ignore_out_of_range=options.ignore_out_of_range,
            )

            if options.normalize:
                h = Histogram.Normalize(h)
            if options.cumulative:
                h = Histogram.Cumulate(h)
            if options.reverse_cumulative:
                h = Histogram.Cumulate(h, direction=0)

            hists.append(h)

            for m in options.append:
                if m == "normalize":
                    hists.append(Histogram.Normalize(h))

            if options.headers:
                titles.append(options.headers[x])
            elif options.titles:
                titles.append(options.titles[x])
            else:
                titles.append("col%i" % options.columns[x])

        if titles:
            options.stdout.write("bin\t" + "\t".join(titles) + "\n")

        if len(hists) == 1:
            Histogram.Print(hists[0], nonull=options.nonull)
        else:
            combined_histogram = Histogram.Combine(
                hists, missing_value=options.missing_value)
            Histogram.Print(combined_histogram, nonull=options.nonull)

    E.Stop()
# NOTE(review): orphaned fragment -- the statements that define sort_order,
# map_header2pos, headers, histograms and the param_* settings are not
# visible here. Nesting was lost when the code was flattened; the layout
# below is flat and should be confirmed against the original script.

# translate the requested header order into positions; names missing from
# map_header2pos are dropped
order = []
for x in sort_order:
    if x in map_header2pos:
        order.append(map_header2pos[x])
# headers[0] is kept in front; histograms are indexed one lower than
# headers (histograms[x - 1] belongs to headers[x])
new_headers = [headers[0]]
new_histograms = []
for x in order:
    new_headers.append(headers[x])
    new_histograms.append(histograms[x - 1])
histograms = new_histograms
headers = new_headers
# merge all histograms, passing param_missing_value to Histogram.Combine
combined_histogram = Histogram.Combine(histograms, param_missing_value)
if headers:
    print "\t".join(headers)
if param_normalize:
    combined_histogram = Histogram.Normalize(combined_histogram)
Histogram.Print(combined_histogram, format_bin=param_format_bin, format_value=param_format_value, )
print E.GetFooter()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Each positional argument is a file holding one histogram as
    tab-separated lines (``#`` lines are ignored; the first line is a
    title row unless ``--no-titles`` is given). The histograms are
    combined with ``Histogram.Combine`` and printed, optionally reordered
    and normalized.

    The ``param_*`` settings live at module level; options given on the
    command line overwrite them.
    """
    # Without this declaration every param_* name assigned below would be
    # local to main() and reading a setting whose option was not given
    # would raise UnboundLocalError.
    global param_headers, param_normalize, param_missing_value
    global param_titles, param_format, param_format_value
    global param_format_bin, param_sort, param_filenames

    if argv is None:
        argv = sys.argv

    try:
        optlist, args = getopt.getopt(argv[1:], param_short_options,
                                      param_long_options)
    except getopt.error as msg:
        print(globals()["__doc__"], msg)
        sys.exit(1)

    for o, a in optlist:
        if o in ("--help", ):
            print(globals()["__doc__"])
            sys.exit(0)
        elif o in ("--version", ):
            print("version=")
            sys.exit(0)
        elif o in ("-h", "--header-names"):
            # "auto" requests file names as headers, anything else is a
            # comma-separated list of header names
            param_headers = "auto" if a == "auto" else a.split(",")
        elif o in ("-n", "--normalize"):
            param_normalize = 1
        elif o in ("-m", "--missing-value"):
            param_missing_value = a
        elif o == "--no-titles":
            param_titles = False
        elif o in ("-f", "--format"):
            param_format = a
        elif o == "--format-value":
            param_format_value = a
        elif o == "--bin-format":
            param_format_bin = a
        # NOTE(review): the long option name below looks garbled; confirm
        # against param_long_options.
        elif o in ("-s", "--method=sort --sort-order"):
            if a in ("numerical", "alphabetic", "alphabetical"):
                param_sort = a
            else:
                param_sort = a.split(",")

    if len(args) < 1:
        print(globals()["__doc__"], "please specify at least one histogram.")
        sys.exit(1)

    param_filenames = args

    print(E.GetHeader())
    print(E.GetParams())

    histograms = []

    # first header is the title of the bin column
    headers = ['bin', ]
    if param_headers and param_headers != "auto":
        headers = [param_headers[0], ]
        del param_headers[0]

    for x, filename in enumerate(param_filenames):

        if not os.path.exists(filename):
            print("# skipped because file not present: %s" % filename)
            continue

        infile = IOTools.open_file(filename, "r")
        try:
            lines = [line for line in infile if line[0] != "#"]
        finally:
            infile.close()

        if len(lines) == 0:
            continue

        if param_titles:
            # drop the bin column title, keep the value column titles
            h = lines[0][:-1].split("\t")[1:]
            del lines[0]

        if param_headers == "auto":
            headers.append(os.path.basename(filename))
        elif param_headers:
            headers.append(param_headers[x])
        elif param_titles:
            headers += h

        data = [list(map(float, line[:-1].split("\t"))) for line in lines]

        # add empty data point for empty histograms
        if len(data) == 0:
            data = [(0, 0)]

        histograms.append(data)

    # sort the whole thing:
    if param_sort:
        if param_sort == "numerical":
            # sorted() is stable: equal numbers keep their input order
            sort_order = sorted(headers[1:], key=int)
        elif param_sort in ("alphabetical", "alphabetic"):
            sort_order = sorted(headers[1:])
        else:
            sort_order = param_sort

        # map header to old position
        map_header2pos = {name: pos
                          for pos, name in enumerate(headers[1:], 1)}

        # headers not present in the data are ignored
        order = [map_header2pos[name]
                 for name in sort_order if name in map_header2pos]

        # histograms[p - 1] belongs to headers[p]
        new_headers = [headers[0]] + [headers[p] for p in order]
        new_histograms = [histograms[p - 1] for p in order]

        histograms = new_histograms
        headers = new_headers

    combined_histogram = Histogram.Combine(histograms, param_missing_value)

    if headers:
        print("\t".join(headers))

    if param_normalize:
        combined_histogram = Histogram.Normalize(combined_histogram)

    Histogram.Print(
        combined_histogram,
        format_bin=param_format_bin,
        format_value=param_format_value,
    )

    print(E.GetFooter())
def main(argv=None):
    """Compute histograms for columns of a tab-separated table on stdin.

    Two modes exist:

    * ``--on-the-fly``: bins are fixed up front from min-value, max-value
      and bin-size; the counting is delegated to
      ``Histogram.fillHistograms`` and one line per bin is written.
    * default (in-situ): all values are read into memory, a histogram is
      computed per column with ``Histogram.Calculate`` and the histograms
      are combined for output.

    Lines starting with ``#`` are skipped. With titles enabled (the
    default) the first data line supplies the column titles.

    Raises
    ------
    ValueError
        if on-the-fly computation is requested without min-value,
        max-value and bin-size, or without columns when no title row is
        available to derive them from.
    IndexError
        if a requested column is missing from the title row (in-situ).
    """

    if not argv:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: data2histogram.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-r", "--range", dest="range", type="string",
                      help="range to calculate histogram for.")
    parser.add_option("-b", "--bin-size", dest="bin_size", type="string",
                      help="bin size.")
    parser.add_option("-i", "--titles", dest="titles", action="store_true",
                      help="use supplied column titles.")
    parser.add_option("--no-null", dest="nonull", action="store_true",
                      help="do not output null values")
    parser.add_option("--no-titles", dest="titles", action="store_false",
                      help="no column titles given.")
    parser.add_option("-c", "--columns", dest="columns", type="string",
                      help="columns to take for calculating histograms.")
    parser.add_option(
        "--min-data", dest="min_data", type="int",
        help="minimum amount of data required, if less data, then the histogram will be empty [default=%default].")
    parser.add_option("--min-value", dest="min_value", type="float",
                      help="minimum value for histogram.")
    parser.add_option("--max-value", dest="max_value", type="float",
                      help="maximum value for histogram.")
    parser.add_option("--no-empty-bins", dest="no_empty_bins",
                      action="store_true",
                      help="do not display empty bins.")
    parser.add_option("--with-empty-bins", dest="no_empty_bins",
                      action="store_false",
                      help="display empty bins.")
    parser.add_option("--normalize", dest="normalize", action="store_true",
                      help="normalize histogram.")
    parser.add_option("--cumulative", dest="cumulative", action="store_true",
                      help="calculate cumulative histogram.")
    parser.add_option("--reverse-cumulative", dest="reverse_cumulative",
                      action="store_true",
                      help="calculate reverse cumulative histogram.")
    parser.add_option("--header-names", dest="headers", type="string",
                      help="use the following headers.")
    parser.add_option(
        "--ignore-out-of-range", dest="ignore_out_of_range",
        action="store_true",
        help="ignore values that are out of range (as opposed to truncating them to range border.")
    parser.add_option("--missing-value", dest="missing_value", type="string",
                      help="entry for missing values [%default].")
    parser.add_option("--use-dynamic-bins", dest="dynamic_bins",
                      action="store_true",
                      help="each value constitutes its own bin.")
    parser.add_option(
        "--on-the-fly", dest="on_the_fly", action="store_true",
        help="on the fly computation of histograms. Requires setting of min-value, max-value and bin_size.")

    parser.set_defaults(
        bin_size=None,
        range=None,
        titles=True,
        columns="all",
        append=(),
        no_empty_bins=True,
        min_value=None,
        max_value=None,
        normalize=False,
        cumulative=False,
        reverse_cumulative=False,
        nonull=None,
        ignore_out_of_range=False,
        min_data=1,
        headers=None,
        missing_value="na",
        dynamic_bins=False,
        on_the_fly=False,
        bin_format="%.2f",
        value_format="%6.4f",
    )

    (options, args) = E.Start(parser)

    # columns are 1-based on the command line, 0-based internally
    if options.columns != "all":
        options.columns = [int(x) - 1 for x in options.columns.split(",")]

    if options.range:
        options.min_value, options.max_value = list(
            map(float, options.range.split(",")))

    if options.headers:
        options.headers = options.headers.split(",")

    if options.on_the_fly:
        # without a title row there is nothing to derive "all" columns from
        columns_unknown = options.columns == "all" and not options.titles
        if options.min_value is None or options.max_value is None or \
                options.bin_size is None or columns_unknown:
            raise ValueError("please supply columns, min-value, max-value and "
                             "bin-size for on-the-fly computation.")

        # try to glean titles from table:
        if options.titles:
            data = None
            # readline() is used (not file iteration) so that the rest of
            # stdin can be handed to fillHistograms() afterwards
            while True:
                line = sys.stdin.readline()
                if not line:
                    break
                if line[0] == "#":
                    continue
                data = line[:-1].split("\t")
                break

            if data is None:
                # no title row: same handling as the in-situ branch
                if options.loglevel >= 1:
                    options.stdlog.write("# no data\n")
                E.Stop()
                sys.exit(0)

            if options.columns == "all":
                options.titles = data
                options.columns = list(range(len(data)))
            else:
                options.titles = [data[x] for x in options.columns]

        bins = numpy.arange(options.min_value, options.max_value,
                            float(options.bin_size))
        hh = Histogram.fillHistograms(
            sys.stdin, options.columns, [bins for x in options.columns])
        n = len(hh)

        # one title per column: explicit headers win over titles from the
        # table, which win over generated 1-based names
        titles = ['bin']
        for pos, column in enumerate(options.columns):
            if options.headers:
                titles.append(options.headers[pos])
            elif options.titles:
                titles.append(options.titles[pos])
            else:
                titles.append("col%i" % (column + 1))

        if len(titles) > 1:
            options.stdout.write("\t".join(titles) + "\n")

        for x in range(len(bins)):
            v = [options.bin_format % bins[x]]
            for c in range(n):
                v.append(options.value_format % hh[c][x])
            options.stdout.write("\t".join(v) + "\n")

    else:
        # in-situ computation of histograms
        # retrieve data
        first = True
        vals = []

        # parse data, convert to floats
        for l in options.stdin:

            if l[0] == "#":
                continue

            data = l[:-1].split("\t")

            if first:
                first = False
                ncols = len(data)
                if options.columns == "all":
                    options.columns = list(range(ncols))

                vals = [[] for x in options.columns]

                if options.titles:
                    try:
                        options.titles = [data[x] for x in options.columns]
                    except IndexError:
                        raise IndexError(
                            "not all columns %s found in data %s" %
                            (str(options.columns), str(data)))
                    # the title row carries no values
                    continue

            for x in range(len(options.columns)):
                try:
                    v = float(data[options.columns[x]])
                except IndexError:
                    print("# IndexError in line:", l[:-1])
                    continue
                except ValueError:
                    # non-numeric fields are skipped
                    continue

                vals[x].append(v)

        hists = []
        titles = []

        if not vals:
            if options.loglevel >= 1:
                options.stdlog.write("# no data\n")
            E.Stop()
            sys.exit(0)

        for x in range(len(options.columns)):
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# column=%i, num_values=%i\n" %
                    (options.columns[x], len(vals[x])))

            if len(vals[x]) < options.min_data:
                continue

            h = Histogram.Calculate(
                vals[x],
                no_empty_bins=options.no_empty_bins,
                increment=options.bin_size,
                min_value=options.min_value,
                max_value=options.max_value,
                dynamic_bins=options.dynamic_bins,
                ignore_out_of_range=options.ignore_out_of_range)

            if options.normalize:
                h = Histogram.Normalize(h)
            if options.cumulative:
                h = Histogram.Cumulate(h)
            if options.reverse_cumulative:
                h = Histogram.Cumulate(h, direction=0)

            hists.append(h)

            for m in options.append:
                if m == "normalize":
                    hists.append(Histogram.Normalize(h))

            if options.headers:
                titles.append(options.headers[x])
            elif options.titles:
                titles.append(options.titles[x])
            else:
                titles.append("col%i" % options.columns[x])

        if titles:
            options.stdout.write("bin\t" + "\t".join(titles) + "\n")

        if len(hists) == 1:
            Histogram.Print(hists[0],
                            nonull=options.nonull,
                            format_bin=options.bin_format)
        else:
            combined_histogram = Histogram.Combine(
                hists, missing_value=options.missing_value)
            Histogram.Print(combined_histogram,
                            nonull=options.nonull,
                            format_bin=options.bin_format)

    E.Stop()
def main(argv=None):
    """Compute histograms for columns of a tab-separated table on stdin.

    Lines starting with ``#`` are ignored. With ``--titles`` the first
    line supplies the column titles. Values are optionally scaled and
    clamped to the lower/upper limit before binning with
    ``Histogram.Calculate``; the resulting histograms are printed, either
    singly or combined with ``Histogram.Combine``.

    ``-c/--column`` takes a comma-separated list of 1-based column
    numbers, or ``all``; the default is the first column.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-n", "--nonull", dest="nonull", action="store_true",
                      help="no null [default=%default]")

    parser.add_option("-e", "--show-empty", dest="empty_bins",
                      action="store_true",
                      help="show empty bins [default=%default]")

    parser.add_option("-o", "--normalize", dest="normalize",
                      action="store_true",
                      help="normalize histogram [default=%default]")

    parser.add_option("-i", "--titles", dest="titles", action="store_true",
                      help="use titles supplied in ... [default=%default]")

    parser.add_option("--cumulative", dest="cumulative", action="store_true",
                      help="compute cumulative histogram [default=%default]")

    parser.add_option(
        "--reverse-cumulative", dest="reverse_cumulative",
        action="store_true",
        help="compute reverse cumulative histogram [default=%default]")

    # stored in ``columns`` (the attribute read below); a string so that
    # lists such as "1,3" and "all" can be given
    parser.add_option("-c", "--column", dest="columns", type="string",
                      help="columns to take [default=%default]")

    parser.add_option("-b", "--bin-size", dest="bin_size", type="float",
                      help="bin size to use [default=%default]")

    parser.add_option("-u", "--upper", dest="upper_limit", type="float",
                      help="upper limit to use [default=%default]")

    parser.add_option("-l", "--lower", dest="lower_limit", type="float",
                      help="lower limit to use [default=%default]")

    parser.add_option("-s", "--scale", dest="scale", type="float",
                      help="scale to use [default=%default]")

    parser.add_option("-a", "--append", dest="append", type="choice",
                      action="append", choices=("normalize", ),
                      help="append columns [default=%default]")

    # empty_bins defaults to False so that, without -e, empty bins stay
    # hidden exactly as before, while -e now actually shows them
    parser.set_defaults(nonull=None,
                        columns="1",
                        empty_bins=False,
                        titles=False,
                        lower_limit=None,
                        upper_limit=None,
                        bin_size=None,
                        scale=None,
                        normalize=None,
                        append=[],
                        cumulative=False,
                        reverse_cumulative=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # columns are 1-based on the command line, 0-based internally
    if options.columns:
        if options.columns != "all":
            options.columns = [int(x) - 1
                               for x in options.columns.split(",")]
    else:
        options.columns = [0]

    # retrieve histogram
    lines = [l for l in sys.stdin.readlines() if l[0] != "#"]

    if not lines:
        # nothing to bin
        E.info("no data")
        E.Stop()
        return

    ncols = len(lines[0][:-1].split("\t"))
    if options.columns == "all":
        options.columns = list(range(ncols))

    # one list of values per selected column
    vals = [[] for x in options.columns]

    if options.titles:
        data = lines[0][:-1].split("\t")
        del lines[0]
        options.titles = [data[x] for x in options.columns]

    for l in lines:
        data = l[:-1].split("\t")

        for x in range(len(options.columns)):
            try:
                v = float(data[options.columns[x]])
            except IndexError:
                print("# IndexError in line: %s" % l[:-1])
                continue
            except ValueError:
                # non-numeric fields are skipped
                continue

            if options.scale:
                v *= options.scale

            # clamp to the requested limits
            if options.upper_limit is not None and v > options.upper_limit:
                v = options.upper_limit

            if options.lower_limit is not None and v < options.lower_limit:
                v = options.lower_limit

            vals[x].append(v)

    lines = None  # release the raw input

    hists = []
    titles = []

    for x in range(len(options.columns)):
        E.info("column=%i, num_values=%i" %
               (options.columns[x], len(vals[x])))

        if len(vals[x]) == 0:
            continue

        h = Histogram.Calculate(vals[x],
                                no_empty_bins=not options.empty_bins,
                                increment=options.bin_size)

        # undo the scaling applied to the values above
        if options.scale:
            h = Histogram.Scale(h, 1.0 / options.scale)

        if options.normalize:
            h = Histogram.Normalize(h)
        if options.cumulative:
            h = Histogram.Cumulate(h)
        if options.reverse_cumulative:
            h = Histogram.Cumulate(h, direction=0)

        hists.append(h)

        for m in options.append:
            if m == "normalize":
                hists.append(Histogram.Normalize(h))

        if options.titles:
            titles.append(options.titles[x])

    if titles:
        options.stdout.write("bin\t" + "\t".join(titles) + "\n")

    if len(hists) == 1:
        Histogram.Print(hists[0], nonull=options.nonull)
    else:
        combined_histogram = Histogram.Combine(hists)
        Histogram.Print(combined_histogram, nonull=options.nonull)

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads intervals in gff/gtf or bed format from ``options.stdin`` and
    collects three lists: interval sizes (``end - start``), distances
    between consecutive non-overlapping intervals on the same contig and
    overlap lengths between consecutive intervals. Depending on
    ``--method`` these are written as histograms, summary statistics,
    raw values and/or a list of overlapping interval pairs.

    Raises ValueError if no ``--method`` was given.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option("-b", "--bin-size", dest="bin_size", type="string",
                      help="bin size.")
    parser.add_option("--min-value", dest="min_value", type="float",
                      help="minimum value for histogram.")
    parser.add_option(
        "--max-value", dest="max_value", type="float",
        help="maximum value for histogram.")
    parser.add_option(
        "--no-empty-bins", dest="no_empty_bins", action="store_true",
        help="do not display empty bins.")
    parser.add_option(
        "--with-empty-bins", dest="no_empty_bins", action="store_false",
        help="display empty bins.")
    parser.add_option(
        "--ignore-out-of-range", dest="ignore_out_of_range",
        action="store_true",
        help="ignore values that are out of range (as opposed to truncating "
        "them to range border.")
    parser.add_option("--missing-value", dest="missing_value", type="string",
                      help="entry for missing values [%default].")
    parser.add_option("--use-dynamic-bins", dest="dynamic_bins",
                      action="store_true",
                      help="each value constitutes its own bin.")
    parser.add_option("--format", dest="format", type="choice",
                      choices=("gff", "gtf", "bed"),
                      help="input file format [%default].")
    parser.add_option("--method", dest="methods", type="choice",
                      action="append",
                      choices=("all", "hist", "stats", "overlaps", "values"),
                      help="methods to apply [%default].")
    parser.add_option("--output-section", dest="output_section",
                      type="choice",
                      choices=("all", "size", "distance"),
                      help="data to compute [%default].")

    parser.set_defaults(
        no_empty_bins=True,
        bin_size=None,
        dynamic_bins=False,
        ignore_out_of_range=False,
        min_value=None,
        max_value=None,
        nonull=None,
        missing_value="na",
        output_filename_pattern="%s",
        methods=[],
        output_section="all",
        format="gff",
    )

    # pass argv through so that main(argv=...) actually honours its argument
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    if "all" in options.methods:
        options.methods = ("hist", "stats", "overlaps")

    if not options.output_filename_pattern:
        options.output_filename_pattern = "%s"

    if len(options.methods) == 0:
        raise ValueError(
            "please provide counting method using --method option")

    if options.format in ("gff", "gtf"):
        gffs = GTF.iterator(options.stdin)
    elif options.format == "bed":
        gffs = Bed.iterator(options.stdin)

    values_between = []
    values_within = []
    values_overlaps = []

    if "overlaps" in options.methods:
        outfile_overlaps = E.openOutputFile("overlaps")
    else:
        outfile_overlaps = None

    # ``last`` is the interval that consecutive intervals are compared to
    last = None
    ninput, noverlaps = 0, 0
    for this in gffs:
        ninput += 1
        values_within.append(this.end - this.start)

        if last and last.contig == this.contig:
            if this.start < last.end:
                noverlaps += 1
                if outfile_overlaps:
                    outfile_overlaps.write(
                        "%s\t%s\n" % (str(last), str(this)))
                values_overlaps.append(
                    min(this.end, last.end) - max(last.start, this.start))
                # keep whichever interval reaches furthest as reference
                if this.end > last.end:
                    last = this
                continue
            else:
                values_between.append(this.start - last.end)
                values_overlaps.append(0)

        last = this

    # release the pair listing before any later section writes output
    if outfile_overlaps:
        outfile_overlaps.close()

    if "hist" in options.methods:
        outfile = E.openOutputFile("hist")
        h_within = Histogram.Calculate(
            values_within,
            no_empty_bins=options.no_empty_bins,
            increment=options.bin_size,
            min_value=options.min_value,
            max_value=options.max_value,
            dynamic_bins=options.dynamic_bins,
            ignore_out_of_range=options.ignore_out_of_range)

        h_between = Histogram.Calculate(
            values_between,
            no_empty_bins=options.no_empty_bins,
            increment=options.bin_size,
            min_value=options.min_value,
            max_value=options.max_value,
            dynamic_bins=options.dynamic_bins,
            ignore_out_of_range=options.ignore_out_of_range)

        if "all" == options.output_section:
            outfile.write("residues\tsize\tdistance\n")
            combined_histogram = Histogram.Combine(
                [h_within, h_between], missing_value=options.missing_value)
            Histogram.Write(outfile, combined_histogram,
                            nonull=options.nonull)
        elif options.output_section == "size":
            outfile.write("residues\tsize\n")
            Histogram.Write(outfile, h_within, nonull=options.nonull)
        elif options.output_section == "distance":
            outfile.write("residues\tdistance\n")
            Histogram.Write(outfile, h_between, nonull=options.nonull)

        outfile.close()

    if "stats" in options.methods:
        outfile = E.openOutputFile("stats")
        outfile.write("data\t%s\n" % Stats.Summary().getHeader())
        if options.output_section in ("size", "all"):
            outfile.write("size\t%s\n" % str(Stats.Summary(values_within)))
        if options.output_section in ("distance", "all"):
            outfile.write("distance\t%s\n" %
                          str(Stats.Summary(values_between)))
        outfile.close()

    if "values" in options.methods:
        outfile = E.openOutputFile("distances")
        outfile.write("distance\n%s\n" % "\n".join(map(str, values_between)))
        outfile.close()
        outfile = E.openOutputFile("sizes")
        outfile.write("size\n%s\n" % "\n".join(map(str, values_within)))
        outfile.close()
        outfile = E.openOutputFile("overlaps")
        outfile.write("overlap\n%s\n" % "\n".join(map(str, values_overlaps)))
        outfile.close()

    E.info("ninput=%i, ndistance=%i, nsize=%i, noverlap=%i" %
           (ninput,
            len(values_between),
            len(values_within),
            noverlaps))

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads tab-separated edges from stdin, skipping lines that start with
    ``#``. With ``--method=histograms`` each line supplies
    ``node1, node2, weight``; the weights of every node pair are turned
    into a histogram, and for output format ``semi`` one file per first
    node is written, named by ``--output-pattern``. With
    ``--method=counts`` the number of lines per node pair is written to
    ``options.stdout``.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: graph2stats.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option("-r", "--range", dest="range", type="string",
                      help="range to calculate histogram for.")
    parser.add_option("-b", "--bin-size", dest="bin_size", type="string",
                      help="bin size.")
    parser.add_option("-i", "--titles", dest="titles", action="store_true",
                      help="use supplied column titles.")
    parser.add_option("-s", "--make-symmetric", dest="make_symmetric",
                      action="store_true",
                      help="symmetrize graph.")
    parser.add_option("-c", "--columns", dest="columns", type="string",
                      help="columns to take for calculating histograms.")
    parser.add_option("-p", "--output-pattern", dest="output_pattern",
                      type="string",
                      help="pattern for output files.")
    parser.add_option("-m", "--method", dest="method", type="string",
                      help="method.")
    parser.add_option("-o", "--output-format", dest="output_format",
                      type="string",
                      help="output format.")
    parser.add_option("--min-value", dest="min_value", type="float",
                      help="minimum value for histogram.")
    parser.add_option("--max-value", dest="max_value", type="float",
                      help="maximum value for histogram.")

    parser.set_defaults(bin_size=None,
                        range=None,
                        titles=False,
                        columns="all",
                        append=(),
                        empty_bins=False,
                        min_value=None,
                        max_value=None,
                        normalize=False,
                        cumulative=False,
                        reverse_cumulative=False,
                        nonull=None,
                        make_symmetric=False,
                        output_pattern="%s.hist",
                        method="histograms",
                        output_format="semi")

    # pass argv through so that main(argv=...) honours its argument
    (options, args) = E.Start(parser, argv=argv)

    if options.columns != "all":
        options.columns = [int(x) - 1 for x in options.columns.split(",")]

    if options.range:
        # "min,max" overrides --min-value / --max-value
        options.min_value, options.max_value = map(
            float, options.range.split(","))

    # retrieve data, skipping comment lines
    lines = [x for x in sys.stdin.readlines() if not x.startswith("#")]

    vals = {}

    if options.method == "histograms":

        # read data; lines with too few fields or a non-numeric weight
        # are counted and skipped
        nerrors = 0
        for line in lines:
            try:
                v1, v2, w = line.rstrip("\n").split("\t")[:3]
                w = float(w)
            except ValueError:
                nerrors += 1
                continue

            vals.setdefault(v1, {}).setdefault(v2, []).append(w)

            if options.make_symmetric:
                vals.setdefault(v2, {}).setdefault(v1, []).append(w)

        E.info("nerrors=%i" % nerrors)

        # convert to histograms, replacing each weight list in place
        for k1, vv in vals.items():
            for k2 in list(vv):
                if len(vv[k2]) == 0:
                    continue

                h = Histogram.Calculate(vv[k2],
                                        no_empty_bins=options.empty_bins,
                                        increment=options.bin_size,
                                        min_value=options.min_value,
                                        max_value=options.max_value)

                if options.normalize:
                    h = Histogram.Normalize(h)
                if options.cumulative:
                    h = Histogram.Cumulate(h)
                if options.reverse_cumulative:
                    h = Histogram.Cumulate(h, direction=0)

                vv[k2] = h

        # write output: one file per first node, columns sorted by second
        # node
        if options.output_format == "semi":
            for k1, vv in vals.items():
                outfile = open(options.output_pattern % k1, "w")
                try:
                    kk2 = sorted(vv)
                    hists = [vv[k2] for k2 in kk2]
                    # titles are passed as a tuple because the helper
                    # concatenates them onto a tuple
                    PrintHistograms(outfile, tuple(kk2), hists, options)
                finally:
                    outfile.close()

    elif options.method == "counts":

        # read data
        for line in lines:
            v1, v2 = line.rstrip("\n").split("\t")[:2]

            counts = vals.setdefault(v1, {})
            counts[v2] = counts.get(v2, 0) + 1

            if options.make_symmetric:
                counts = vals.setdefault(v2, {})
                counts[v1] = counts.get(v1, 0) + 1

        # write counts per node pair
        for k1, vv in vals.items():
            for k2 in vv:
                options.stdout.write("%s\t%s\t%i\n" % (k1, k2, vv[k2]))

    E.Stop()
def PrintHistograms(outfile, titles, histograms, options):
    """Write several histograms as one tab-separated table to *outfile*.

    outfile
        open, writable file-like object.
    titles
        sequence of column titles, one per histogram; a ``bin`` column
        title is prepended.
    histograms
        histograms to merge with ``Histogram.Combine``.
    options
        object whose ``nonull`` attribute is forwarded to the histogram
        writer.
    """
    # The original referenced an undefined name ``hists`` here.
    combined_histogram = Histogram.Combine(histograms)

    # tuple() lets callers pass a list as well as a tuple; the newline
    # keeps the header off the first data row.
    outfile.write("\t".join(("bin", ) + tuple(titles)) + "\n")

    # Write into *outfile* rather than printing, so the header and the
    # table end up in the same place.
    Histogram.Write(outfile, combined_histogram, nonull=options.nonull)