示例#1
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads a tab-separated table from stdin (lines starting with ``#``
    are ignored, the first remaining line is the header row), converts
    the columns selected with ``--columns`` (1-based, comma-separated)
    to floats and applies every ``--method`` to them.  The only method
    implemented is ``normalize``, which divides each value by the total
    of its column.  The resulting table is printed tab-separated to
    stdout.

    With ``--echo`` the columns that were not selected are output first;
    with ``--replace`` the original values of the selected columns are
    omitted and only the computed columns are output.

    NOTE(review): ``parser`` is not created inside this function;
    presumably it is defined at module level -- confirm.  *argv* is
    defaulted here but is not passed on to ``E.start``.

    Raises ValueError if stdin contains no header row or no data rows,
    and ZeroDivisionError if a column to normalize sums to zero.
    """

    if argv is None:
        argv = sys.argv

    parser.add_option("-c",
                      "--columns",
                      dest="columns",
                      type="string",
                      help="columns to take from table.")

    parser.add_option("-m",
                      "--method",
                      dest="methods",
                      type="string",
                      help="methods to apply to columns.",
                      action="append")

    parser.add_option("-e",
                      "--echo",
                      dest="echo",
                      action="store_true",
                      help="echo columns not taken.")

    parser.add_option("-r",
                      "--replace",
                      dest="replace",
                      action="store_true",
                      help="replace orginial values.")

    parser.set_defaults(columns="1",
                        echo=False,
                        replace=False,
                        format="%5.2f",
                        methods=[])

    (options, args) = E.start(parser)

    # the user supplies 1-based column numbers; convert to 0-based indices
    options.columns = [int(x) - 1 for x in options.columns.split(",")]

    print(E.GetHeader())
    print(E.GetParams())

    # read the table, dropping comment lines
    lines = [x for x in sys.stdin.readlines() if not x.startswith("#")]

    if not lines:
        raise ValueError("no data found")

    # rstrip instead of [:-1] so that a final line without a trailing
    # newline does not lose its last character
    headers = lines[0].rstrip("\n").split("\t")
    rows = lines[1:]

    if not rows:
        raise ValueError("no data found")

    selected = set(options.columns)
    notcolumns = [x for x in range(len(headers)) if x not in selected]

    # data is stored column-wise: selected columns as floats, all other
    # columns as the unmodified strings
    data = [[] for _ in headers]

    for line in rows:
        fields = line.rstrip("\n").split("\t")
        for c in options.columns:
            data[c].append(float(fields[c]))
        for c in notcolumns:
            data[c].append(fields[c])

    totals = [0] * len(headers)

    for c in options.columns:
        totals[c] = sum(data[c])

    new_columns = []
    new_headers = []

    if options.echo:
        for c in notcolumns:
            new_headers.append(headers[c])
            new_columns.append(data[c])

    for c in options.columns:
        if not options.replace:
            new_columns.append(data[c])
            new_headers.append(headers[c])

        for method in options.methods:
            if method == "normalize":
                new_columns.append([d / totals[c] for d in data[c]])
                new_headers.append("normalized")

    print("\t".join(new_headers))

    # transpose the column-wise storage back into rows for output
    for d in zip(*new_columns):
        print("\t".join(map(str, d)))

    E.stop()
示例#2
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Every positional argument is the name of a histogram file.  Each
    file that exists is read as a tab-separated table of numbers
    (lines starting with ``#`` are ignored; the first line is treated as
    a title row unless ``--no-titles`` is given).  The histograms are
    merged with ``Histogram.Combine``, optionally normalized with
    ``Histogram.Normalize`` and printed with ``Histogram.Print``,
    preceded by a header line.

    Column headers are taken from ``--header-names`` if given (the first
    name labels the bin column), from the file base names if the header
    setting is the string ``"auto"``, and otherwise from the title rows
    of the files.  The sort option accepts ``numerical``,
    ``alphabetical`` (or ``alphabetic``), or an explicit comma-separated
    list of headers.

    Exits with status 1 on an option error or if no file is given.
    """

    # These option variables are assigned in this function.  Without the
    # ``global`` declaration Python treats them as locals, and reading
    # any option that was not given on the command line raises
    # UnboundLocalError.
    # NOTE(review): assumes the defaults are defined at module level,
    # next to param_short_options / param_long_options -- confirm.
    global param_headers, param_normalize, param_missing_value
    global param_titles, param_format, param_format_value
    global param_format_bin, param_sort

    if argv is None:
        argv = sys.argv

    try:
        # parse *argv* (not sys.argv) so that callers can supply options
        optlist, args = getopt.getopt(argv[1:], param_short_options,
                                      param_long_options)

    except getopt.error as msg:
        print(globals()["__doc__"], msg)
        sys.exit(1)

    for o, a in optlist:
        if o in ("--help", ):
            print(globals()["__doc__"])
            sys.exit(0)
        elif o in ("--version", ):
            print("version=")
            sys.exit(0)
        elif o in ("-h", "--header-names"):
            param_headers = a.split(",")
        elif o in ("-n", "--normalize"):
            param_normalize = 1
        elif o in ("-m", "--missing-value"):
            param_missing_value = a
        elif o == "--no-titles":
            param_titles = False
        elif o in ("-f", "--format"):
            param_format = a
        elif o == "--format-value":
            param_format_value = a
        elif o == "--bin-format":
            param_format_bin = a
        # NOTE(review): the long option name below looks garbled; getopt
        # reports a single option name, so only "-s" can match here.
        # Check it against param_long_options.
        elif o in ("-s", "--method=sort --sort-order"):
            if a in ("numerical", "alphabetic", "alphabetical"):
                param_sort = a
            else:
                param_sort = a.split(",")

    if len(args) < 1:
        print(globals()["__doc__"], "please specify at one histogram.")
        sys.exit(1)

    param_filenames = args

    print(E.GetHeader())
    print(E.GetParams())

    histograms = []

    # the first header labels the bin column
    headers = [
        'bin',
    ]
    if param_headers and param_headers != "auto":
        headers = [
            param_headers[0],
        ]
        # the remaining names line up with the files by position
        del param_headers[0]

    for index, filename in enumerate(param_filenames):

        if not os.path.exists(filename):
            print("# skipped because file not present: %s" % filename)
            continue

        infile = IOTools.open_file(filename, "r")
        try:
            lines = [line for line in infile if not line.startswith("#")]
        finally:
            infile.close()

        if len(lines) == 0:
            continue

        if param_titles:
            # titles of the value columns; the bin column title is dropped
            h = lines[0].rstrip("\n").split("\t")[1:]
            del lines[0]

        if param_headers == "auto":
            headers.append(os.path.basename(filename))
        elif param_headers:
            headers.append(param_headers[index])
        elif param_titles:
            headers += h

        # rstrip instead of [:-1] so that a final line without a trailing
        # newline does not lose its last character
        data = [list(map(float, line.rstrip("\n").split("\t")))
                for line in lines]

        # add empty data point for empty histograms
        if len(data) == 0:
            data = [(0, 0)]

        histograms.append(data)

    # sort the whole thing:
    if param_sort:

        if param_sort == "numerical":
            ranked = sorted((int(title), pos)
                            for pos, title in enumerate(headers[1:], 1))
            sort_order = [headers[pos] for _, pos in ranked]

        elif param_sort in ("alphabetic", "alphabetical"):
            ranked = sorted((title, pos)
                            for pos, title in enumerate(headers[1:], 1))
            sort_order = [headers[pos] for _, pos in ranked]

        else:
            # explicit list of headers in the requested order
            sort_order = param_sort

        # map header to old position
        map_header2pos = {}
        for pos in range(1, len(headers)):
            map_header2pos[headers[pos]] = pos

        # headers in sort_order that are not present are silently ignored
        order = [map_header2pos[title] for title in sort_order
                 if title in map_header2pos]

        new_headers = [headers[0]]
        new_histograms = []

        for pos in order:
            new_headers.append(headers[pos])
            # histograms has no entry for the bin column, hence the offset
            new_histograms.append(histograms[pos - 1])

        histograms = new_histograms
        headers = new_headers

    combined_histogram = Histogram.Combine(histograms, param_missing_value)

    if headers:
        print("\t".join(headers))

    if param_normalize:
        combined_histogram = Histogram.Normalize(combined_histogram)

    Histogram.Print(
        combined_histogram,
        format_bin=param_format_bin,
        format_value=param_format_value,
    )

    print(E.GetFooter())
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Splits the lines read from stdin into several output files whose
    names are built by substituting ``%s`` in ``--pattern-output``.
    Two modes exist:

    * column mode (``--column`` given): each line is written to the file
      named after the value in that (1-based) column.  The key can be
      reduced with ``--pattern-identifier`` (first regex group) and
      translated with ``--map`` (two-column file; lines whose key is not
      in the map are dropped).  ``--remove-key`` removes the key column
      from the output.  With ``--header`` the first line is kept as a
      header and handed to ``CreateOpen``.

    * regex mode (otherwise): output files are numbered from 0 and a new
      file is started once ``--chunk-size`` lines matching
      ``--split-regex`` have been seen.  With ``--after`` the matching
      line ends the current file; with ``--skip`` it is not written.

    A summary line is written to stdout if the log level is at least 1.

    NOTE(review): ``CreateOpen`` and ``USAGE`` are defined outside this
    function; what ``CreateOpen`` does with its *header* argument is not
    visible here.

    Exits with status 1 on an option error.
    """

    if argv is None:
        argv = sys.argv

    param_long_options = [
        "verbose=", "help", "split-regex=", "after", "pattern-output=", "skip",
        "column=", "map=", "dry-run",
        "header", "remove-key", "append", "pattern-identifier=", "version",
        "chunk-size="]

    # "m:" added: -m is handled below but was missing from this string
    param_short_options = "v:hr:ap:sc:dekm:"

    param_loglevel = 1
    param_split_at_regex = None
    param_after = None
    param_skip = None
    param_pattern_output = "%s.chunk"
    param_split_column = None
    param_filename_map = None
    param_dry_run = False
    param_header = False
    param_remove_key = False
    param_append = "w"
    param_pattern_identifier = None
    param_chunk_size = 1

    try:
        # parse *argv* (not sys.argv) so that callers can supply options
        optlist, args = getopt.getopt(argv[1:],
                                      param_short_options,
                                      param_long_options)

    except getopt.error as msg:
        print(USAGE, msg)
        sys.exit(1)

    for o, a in optlist:
        if o in ("-v", "--verbose"):
            param_loglevel = int(a)
        elif o in ("--version", ):
            print("version=")
            sys.exit(0)
        elif o in ("-h", "--help"):
            print(USAGE)
            sys.exit(0)
        elif o in ("-r", "--split-regex"):
            param_split_at_regex = re.compile(a)
        elif o in ("-a", "--after"):
            param_after = 1
        elif o in ("-s", "--skip"):
            param_skip = 1
        elif o in ("-p", "--pattern-output"):
            param_pattern_output = a
        elif o in ("-c", "--column"):
            # the user supplies a 1-based column number
            param_split_column = int(a) - 1
        elif o in ("-m", "--map"):
            param_filename_map = a
        elif o in ("-d", "--dry-run"):
            param_dry_run = True
        # "--header" is the name declared in param_long_options
        elif o in ("-e", "--header", "--header-names"):
            param_header = True
        # "-k" is the short option declared for this flag; "-r" belongs
        # to --split-regex
        elif o in ("-k", "--remove-key"):
            param_remove_key = True
        elif o == "--append":
            param_append = "a"
        elif o == "--pattern-identifier":
            param_pattern_identifier = re.compile(a)
        elif o == "--chunk-size":
            param_chunk_size = int(a)

    print(E.GetHeader())
    print(E.GetParams())

    # optional translation of keys: first column -> second column
    mymap = {}
    if param_filename_map:
        infile = IOTools.open_file(param_filename_map, "r")
        try:
            for line in infile:
                if line.startswith("#"):
                    continue
                data = line.rstrip("\n").split("\t")[:2]
                mymap[data[0]] = data[1]
        finally:
            infile.close()

    filenames = set()
    found = set()
    ninput, noutput = 0, 0

    if param_split_column is not None:

        header = None
        files = {}
        try:
            for line in sys.stdin:

                if line.startswith("#"):
                    continue

                ninput += 1

                # the first non-comment line is the header, not data
                if param_header and not header:
                    header = line.rstrip("\n")
                    continue

                data = line.rstrip("\n").split("\t")

                try:
                    key = data[param_split_column]
                except IndexError:
                    # line has too few columns: skip it
                    continue

                if param_pattern_identifier:
                    key = param_pattern_identifier.search(key).groups()[0]

                if mymap:
                    if key in mymap:
                        key = mymap[key]
                    else:
                        continue

                found.add(key)

                # plain string replacement: the key must not be
                # interpreted as a regex replacement template
                filename = param_pattern_output.replace("%s", key)
                filenames.add(filename)

                if filename not in files:

                    # reset if too many files are open
                    if len(files) > 1000:
                        if param_loglevel >= 1:
                            print("# resetting all files.")
                            sys.stdout.flush()

                        for f in files.values():
                            f.close()
                        files = {}

                    # always append: a file may be re-opened after a reset
                    files[filename] = CreateOpen(
                        filename, "a", param_dry_run, header)

                if param_remove_key:
                    del data[param_split_column]
                    files[filename].write("\t".join(data) + "\n")
                else:
                    files[filename].write(line)

                noutput += 1
        finally:
            for f in files.values():
                f.close()

    else:
        file_id = 0

        filename = param_pattern_output.replace("%s", str(file_id))
        outfile = CreateOpen(filename, param_append, param_dry_run)
        # count the first file as well so that nfiles is correct
        filenames.add(filename)
        nlines = 0

        header = param_header
        split = 0

        try:
            for line in sys.stdin:

                if param_split_at_regex and \
                        param_split_at_regex.search(line.rstrip("\n")):
                    split += 1

                if split == param_chunk_size:
                    if param_after:
                        nlines += 1
                        outfile.write(line)
                    if nlines > 0:
                        outfile.close()
                        file_id += 1
                        filename = param_pattern_output.replace(
                            "%s", str(file_id))
                        outfile = CreateOpen(
                            filename, param_append, param_dry_run, header)
                        filenames.add(filename)

                    # Reset the counter even if no new file was started
                    # (current file still empty).  Otherwise the split
                    # condition stays true for every following line.
                    split = 0
                    nlines = 0
                    if param_after or param_skip:
                        continue

                outfile.write(line)
                nlines += 1
        finally:
            outfile.close()

    if param_loglevel >= 1:
        sys.stdout.write(
            "# ninput=%i, noutput=%i, nfound=%i, nnotfound=%i, nfiles=%i\n" % (
                ninput,
                noutput,
                len(found),
                len(set(mymap).difference(found)),
                len(filenames)))

    print(E.GetFooter())