Example #1
def read_and_collapse_table(infile, options, missing_value=""):
    '''collapse a table.

    Collapse a table of two columns with row names in the first
    column. Outputs a table with multiple columns for each row name.
    '''

    fields, table = CSV.readTable(infile,
                                  with_header=options.has_headers,
                                  as_rows=True)

    if len(fields) != 2:
        raise NotImplementedError("can only work on tables with two columns")

    values = collections.defaultdict(list)

    # the first row name marks the start of each new block of values
    separator = table[0][0]
    row_names = set([x[0] for x in table])

    row_name, value = table[0]

    values[row_name].append(value)
    added = set([row_name])
    for row_name, value in table[1:]:
        if row_name == separator:
            for r in row_names:
                if r not in added:
                    values[r].append(missing_value)
            added = set()

        values[row_name].append(value)
        added.add(row_name)

    for r in row_names:
        if r not in added:
            values[r].append(missing_value)

    sizes = set(len(x) for x in values.values())
    assert len(sizes) == 1, "unequal number of values per row name"
    size = sizes.pop()

    options.stdout.write("row\t%s\n" %
                         ("\t".join(["column_%i" % x for x in range(size)])))

    for key, row in list(values.items()):
        options.stdout.write("%s\t%s\n" % (key, "\t".join(row)))
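For reference, a minimal, self-contained sketch of the same collapse idea, with an in-memory list of (row_name, value) pairs standing in for CSV.readTable (data and names are illustrative):

import collections

def collapse_pairs(pairs, missing_value=""):
    # group values by row name; the first row name marks the start
    # of each new block of values
    values = collections.defaultdict(list)
    separator = pairs[0][0]
    row_names = set(name for name, _ in pairs)
    added = set()
    for name, value in pairs:
        if name == separator and added:
            # pad rows that were missing from the previous block
            for r in row_names - added:
                values[r].append(missing_value)
            added = set()
        values[name].append(value)
        added.add(name)
    for r in row_names - added:
        values[r].append(missing_value)
    return dict(values)

# two blocks, each introduced by "geneA"; "geneC" is absent from the first
pairs = [("geneA", "1"), ("geneB", "2"),
         ("geneA", "3"), ("geneB", "4"), ("geneC", "5")]
print(collapse_pairs(pairs))
# {'geneA': ['1', '3'], 'geneB': ['2', '4'], 'geneC': ['', '5']}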
Example #2
def getGODescriptions(infile):
    '''build dictionary mapping GOids to types and descriptions.

    Arguments
    ---------
    infile : string
        Filename of table with GO assignments

    Returns
    -------
    mapping : dict
        Dictionary mapping GOid to GOtype and GOdescription.
    '''

    with IOTools.open_file(infile) as inf:
        fields, table = CSV.readTable(inf, as_rows=False)

    return {
        go_id: (go_type, description)
        for go_type, go_id, description in zip(
            table[fields.index("go_type")],
            table[fields.index("go_id")],
            table[fields.index("description")])
    }
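The CSV and IOTools helpers are not shown in this listing; a hedged stand-alone sketch of the same go_id -> (go_type, description) mapping built with the standard csv module (the inline table is illustrative):

import csv
import io

# stand-in for a tab-separated GO assignment table
text = ("go_type\tgo_id\tdescription\n"
        "biol_process\tGO:0008150\tbiological process\n"
        "mol_function\tGO:0003674\tmolecular function\n")

reader = csv.DictReader(io.StringIO(text), dialect="excel-tab")
mapping = {row["go_id"]: (row["go_type"], row["description"])
           for row in reader}
print(mapping["GO:0008150"])  # ('biol_process', 'biological process')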
Example #3
def readAndExpandTable(infile, options):
    '''expand a table.

    Multi-valued cells (split on ``options.separator``) are expanded
    over several rows.
    '''

    fields, table = CSV.readTable(infile,
                                  with_header=options.has_headers,
                                  as_rows=True)

    options.stdout.write("\t".join(fields) + "\n")

    for row in table:

        data = []
        for x in range(len(fields)):
            data.append(row[x].split(options.separator))

        nrows = max([len(d) for d in data])

        for d in data:
            d += [""] * (nrows - len(d))

        for n in range(nrows):
            options.stdout.write("\t".join([d[n] for d in data]) + "\n")
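The core of the expansion in isolation, as a self-contained sketch (cell contents are illustrative):

def expand_row(row, separator=";"):
    # split multi-valued cells and pad the shorter ones with ""
    cells = [cell.split(separator) for cell in row]
    depth = max(len(c) for c in cells)
    for c in cells:
        c += [""] * (depth - len(c))
    # one output line per index across all cells
    return ["\t".join(c[n] for c in cells) for n in range(depth)]

print(expand_row(["gene1", "1;2;3", "a;b"]))
# ['gene1\t1\ta', '\t2\tb', '\t3\t']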
Example #4
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-m",
        "--method",
        dest="methods",
        type="choice",
        action="append",
        choices=("transpose", "normalize-by-max", "normalize-by-value",
                 "multiply-by-value", "percentile", "remove-header",
                 "normalize-by-table", "upper-bound", "lower-bound",
                 "kullback-leibler", "expand", "compress", "fdr", "grep"),
        help="""actions to perform on table.""")

    parser.add_option("-s",
                      "--scale",
                      dest="scale",
                      type="float",
                      help="factor to scale matrix by.")

    parser.add_option("-f",
                      "--format",
                      dest="format",
                      type="string",
                      help="output number format [default]")

    parser.add_option("-p",
                      "--parameters",
                      dest="parameters",
                      type="string",
                      help="Parameters for various functions.")

    parser.add_option("-t",
                      "--header-names",
                      dest="has_headers",
                      action="store_true",
                      help="matrix has row/column headers.")

    parser.add_option("--transpose",
                      dest="transpose",
                      action="store_true",
                      help="transpose table.")

    parser.add_option(
        "--set-transpose-field",
        dest="set_transpose_field",
        type="string",
        help="set first field (row 1 and col 1) to this value [%default].")

    parser.add_option("--transpose-format",
                      dest="transpose_format",
                      type="choice",
                      choices=(
                          "default",
                          "separated",
                      ),
                      help="input format of un-transposed table")

    parser.add_option(
        "--expand",
        dest="expand_table",
        action="store_true",
        help="expand table - multi-value cells with be expanded over "
        "several rows.")

    parser.add_option("--no-headers",
                      dest="has_headers",
                      action="store_false",
                      help="matrix has no row/column headers.")

    parser.add_option("--columns",
                      dest="columns",
                      type="string",
                      help="columns to use.")

    parser.add_option("--file",
                      dest="file",
                      type="string",
                      help="columns to test from table.",
                      metavar="FILE")

    parser.add_option("-d",
                      "--delimiter",
                      dest="delimiter",
                      type="string",
                      help="delimiter of columns.",
                      metavar="DELIM")

    parser.add_option("-V",
                      "--invert-match",
                      dest="invert_match",
                      action="store_true",
                      help="invert match.")

    parser.add_option("--sort-by-rows",
                      dest="sort_rows",
                      type="string",
                      help="output order for rows.")

    parser.add_option("-a",
                      "--value",
                      dest="value",
                      type="float",
                      help="value to use for various algorithms.")

    parser.add_option("--group",
                      dest="group_column",
                      type="int",
                      help="group values by column. Supply an integer column "
                      "[default=%default]")

    parser.add_option("--group-function",
                      dest="group_function",
                      type="choice",
                      choices=("min", "max", "sum", "mean", "stats", "cat",
                               "uniq"),
                      help="function to group values by.")

    parser.add_option("--join-table",
                      dest="join_column",
                      type="int",
                      help="join rows in a table by columns.")

    parser.add_option(
        "--collapse-table",
        dest="collapse_table",
        type="string",
        help="collapse a table. Value determines the missing variable "
        "[%default].")

    parser.add_option("--join-column-name",
                      dest="join_column_name",
                      type="int",
                      help="use this column as a prefix.")

    parser.add_option("--flatten-table",
                      dest="flatten_table",
                      action="store_true",
                      help="flatten a table [%default].")

    parser.add_option("--as-column",
                      dest="as_column",
                      action="store_true",
                      help="output table as a single column.")

    parser.add_option("--split-fields",
                      dest="split_fields",
                      action="store_true",
                      help="split fields.")

    parser.add_option(
        "--separator",
        dest="separator",
        type="string",
        help="separator for multi-valued fields [default=%default].")

    parser.add_option(
        "--fdr-method",
        dest="fdr_method",
        type="choice",
        choices=("BH", "bonferroni", "holm", "hommel", "hochberg", "BY"),
        help="method to perform multiple testing correction by controlling "
        "the fdr [default=%default].")

    parser.add_option(
        "--fdr-add-column",
        dest="fdr_add_column",
        type="string",
        help="add new column instead of replacing existing columns. "
        "The value of the option will be used as prefix if there are "
        "multiple columns [%default]")

    # IMS: add option to use a column as the row id in flatten
    parser.add_option(
        "--id-column",
        dest="id_column",
        type="string",
        help="list of column(s) to use as the row id when flattening "
        "the table. If None, then row number is used. [default=%default].")

    parser.add_option(
        "--variable-name",
        dest="variable_name",
        type="string",
        help="the column header for the 'variable' column when flattening "
        "[default=%default].")

    parser.add_option(
        "--value-name",
        dest="value_name",
        type="string",
        help="the column header for the 'value' column when flattening "
        "[default=%default].")

    parser.set_defaults(
        methods=[],
        scale=1.0,
        has_headers=True,
        format=None,
        value=0.0,
        parameters="",
        columns="all",
        transpose=False,
        set_transpose_field=None,
        transpose_format="default",
        group=False,
        group_column=0,
        group_function="mean",
        missing_value="na",
        sort_rows=None,
        flatten_table=False,
        collapse_table=None,
        separator=";",
        expand_table=False,
        join_column=None,
        join_column_name=None,
        compute_fdr=None,
        as_column=False,
        fdr_method="BH",
        fdr_add_column=None,
        id_column=None,
        variable_name="column",
        value_name="value",
        file=None,
        delimiter="\t",
        invert_match=False,
    )

    (options, args) = E.start(parser, add_pipe_options=True)

    options.parameters = options.parameters.split(",")

    if options.group_column:
        options.group = True
        options.group_column -= 1

    ######################################################################
    ######################################################################
    ######################################################################
    # if only to remove header, do this quickly
    if options.methods == ["remove-header"]:

        first = True
        for line in options.stdin:
            if line[0] == "#":
                continue
            if first:
                first = False
                continue
            options.stdout.write(line)

    elif options.transpose or "transpose" in options.methods:

        readAndTransposeTable(options.stdin, options)

    elif options.flatten_table:
        # IMS: bug fixed to make work. Also added options for keying
        # on a particular column and adding custom column headings

        fields, table = CSV.readTable(options.stdin,
                                      with_header=options.has_headers,
                                      as_rows=True)

        options.columns = getColumns(fields, options.columns)

        if options.id_column:
            id_columns = [int(x) - 1 for x in options.id_column.split(",")]
            id_header = "\t".join(
                [fields[id_column] for id_column in id_columns])
            options.columns = [
                x for x in options.columns if x not in id_columns
            ]
        else:
            id_header = "row"

        options.stdout.write(
            "%s\t%s\t%s\n" %
            (id_header, options.variable_name, options.value_name))

        for x, row in enumerate(table):

            if options.id_column:
                row_id = "\t".join(
                    [row[int(x) - 1] for x in options.id_column.split(",")])
            else:
                row_id = str(x)

            for y in options.columns:
                options.stdout.write("%s\t%s\t%s\n" %
                                     (row_id, fields[y], row[y]))

    elif options.as_column:

        fields, table = CSV.readTable(options.stdin,
                                      with_header=options.has_headers,
                                      as_rows=True)
        options.columns = getColumns(fields, options.columns)
        table = list(zip(*table))

        options.stdout.write("value\n")

        for column in options.columns:
            options.stdout.write("\n".join(table[column]) + "\n")

    elif options.split_fields:

        # split comma separated fields
        fields, table = CSV.readTable(options.stdin,
                                      with_header=options.has_headers,
                                      as_rows=True)

        options.stdout.write("%s\n" % ("\t".join(fields)))

        for row in table:
            row = [x.split(options.separator) for x in row]
            for d in itertools.product(*row):
                options.stdout.write("%s\n" % "\t".join(d))

    elif options.group:
        readAndGroupTable(options.stdin, options)

    elif options.join_column:
        readAndJoinTable(options.stdin, options)

    elif options.expand_table:
        readAndExpandTable(options.stdin, options)

    elif options.collapse_table is not None:
        readAndCollapseTable(options.stdin, options, options.collapse_table)

    elif "grep" in options.methods:

        options.columns = [int(x) - 1 for x in options.columns.split(",")]

        patterns = []

        if options.file:
            infile = IOTools.open_file(options.file, "r")
            for line in infile:
                if line[0] == "#":
                    continue
                patterns.append(line[:-1].split(options.delimiter)[0])
        else:
            patterns = args

        for line in options.stdin:

            data = line[:-1].split(options.delimiter)
            found = False

            for c in options.columns:

                if data[c] in patterns:
                    found = True
                    break

            if (not found and options.invert_match) or (
                    found and not options.invert_match):
                print(line[:-1])
    else:

        ######################################################################
        ######################################################################
        ######################################################################
        # Apply remainder of transformations
        fields, table = CSV.readTable(options.stdin,
                                      with_header=options.has_headers,
                                      as_rows=False)
        # convert columns to list
        table = [list(x) for x in table]

        ncols = len(fields)
        if len(table) == 0:
            raise ValueError("table is empty")

        nrows = len(table[0])

        E.info("processing table with %i rows and %i columns" % (nrows, ncols))

        options.columns = getColumns(fields, options.columns)

        # convert all values to float
        for c in options.columns:
            for r in range(nrows):
                try:
                    table[c][r] = float(table[c][r])
                except ValueError:
                    continue

        for method in options.methods:

            if method == "normalize-by-value":

                value = float(options.parameters[0])
                del options.parameters[0]

                for c in options.columns:
                    table[c] = [x / value for x in table[c]]

            elif method == "multiply-by-value":

                value = float(options.parameters[0])
                del options.parameters[0]

                for c in options.columns:
                    table[c] = [x * value for x in table[c]]

            elif method == "normalize-by-max":

                for c in options.columns:
                    m = max(table[c])
                    table[c] = [x / m for x in table[c]]

            elif method == "kullback-leibler":
                options.stdout.write("category1\tcategory2\tkl1\tkl2\tmean\n")
                format = options.format
                if format is None:
                    format = "%f"

                for x in range(0, len(options.columns) - 1):
                    for y in range(x + 1, len(options.columns)):
                        c1 = options.columns[x]
                        c2 = options.columns[y]
                        e1 = 0
                        e2 = 0
                        for z in range(nrows):
                            p = table[c1][z]
                            q = table[c2][z]
                            e1 += p * math.log(p / q)
                            e2 += q * math.log(q / p)

                        options.stdout.write(
                            "%s\t%s\t%s\t%s\t%s\n" %
                            (fields[c1], fields[c2], format % e1, format % e2,
                             format % ((e1 + e2) / 2)))
                E.stop()
                sys.exit(0)

            elif method == "rank":

                for c in options.columns:
                    tt = table[c]
                    t = list(zip(tt, list(range(nrows))))
                    t.sort()
                    for i, n in zip([x[1] for x in t], list(range(nrows))):
                        tt[i] = n

            elif method in ("lower-bound", "upper-bound"):

                boundary = float(options.parameters[0])
                del options.parameters[0]
                new_value = float(options.parameters[0])
                del options.parameters[0]

                if method == "upper-bound":
                    for c in options.columns:
                        for r in range(nrows):
                            if isinstance(table[c][r], float) and \
                                    table[c][r] > boundary:
                                table[c][r] = new_value
                else:
                    for c in options.columns:
                        for r in range(nrows):
                            if isinstance(table[c][r], float) and \
                                    table[c][r] < boundary:
                                table[c][r] = new_value

            elif method == "fdr":
                pvalues = []
                for c in options.columns:
                    pvalues.extend(table[c])

                assert max(pvalues) <= 1.0, "pvalues > 1 in table: max=%s" % \
                    str(max(pvalues))
                assert min(pvalues) >= 0, "pvalue < 0 in table: min=%s" % \
                    str(min(pvalues))

                # convert to str to avoid test for float downstream
                qvalues = list(
                    map(
                        str,
                        Stats.adjustPValues(pvalues,
                                            method=options.fdr_method)))

                if options.fdr_add_column is None:
                    x = 0
                    for c in options.columns:
                        table[c] = qvalues[x:x + nrows]
                        x += nrows
                else:
                    # add new column headers
                    if len(options.columns) == 1:
                        fields.append(options.fdr_add_column)
                    else:
                        for co in options.columns:
                            fields.append(options.fdr_add_column + fields[co])

                    x = 0
                    for c in options.columns:
                        # add a new column
                        table.append(qvalues[x:x + nrows])
                        x += nrows
                    ncols += len(options.columns)

            elif method == "normalize-by-table":

                other_table_name = options.parameters[0]
                del options.parameters[0]
                other_fields, other_table = CSV.readTable(
                    IOTools.open_file(other_table_name, "r"),
                    with_header=options.has_headers,
                    as_rows=False)

                # convert all values to float
                for c in options.columns:
                    for r in range(nrows):
                        try:
                            other_table[c][r] = float(other_table[c][r])
                        except ValueError:
                            continue

                # set 0s to 1 in the other matrix
                for c in options.columns:
                    for r in range(nrows):
                        if isinstance(table[c][r], float) and \
                                isinstance(other_table[c][r], float) and \
                                other_table[c][r] != 0:
                            table[c][r] /= other_table[c][r]
                        else:
                            table[c][r] = options.missing_value

        # convert back
        if options.format is not None:
            for c in options.columns:
                for r in range(nrows):
                    if isinstance(table[c][r], float):
                        table[c][r] = options.format % table[c][r]

        options.stdout.write("\t".join(fields) + "\n")
        if options.sort_rows:
            old2new = {}
            for r in range(nrows):
                old2new[table[0][r]] = r
            for x in options.sort_rows.split(","):
                if x not in old2new:
                    continue
                r = old2new[x]
                options.stdout.write(
                    "\t".join(map(str, [table[c][r]
                                        for c in range(ncols)])) + "\n")
        else:
            for r in range(nrows):
                options.stdout.write(
                    "\t".join(map(str, [table[c][r]
                                        for c in range(ncols)])) + "\n")

    E.stop()
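Stats.adjustPValues is not shown here; as a point of reference, a minimal sketch of a Benjamini-Hochberg adjustment of the kind the "fdr" method delegates to it with the default method "BH" (the helper name adjust_pvalues_bh is illustrative; semantics assumed to follow R's p.adjust):

def adjust_pvalues_bh(pvalues):
    # walk the p-values in decreasing order, computing p * n / rank
    # and taking a running minimum (the step-up procedure)
    n = len(pvalues)
    order = sorted(range(n), key=lambda i: pvalues[i], reverse=True)
    qvalues = [0.0] * n
    running_min = 1.0
    for k, i in enumerate(order):
        running_min = min(running_min, pvalues[i] * n / (n - k))
        qvalues[i] = running_min
    return qvalues

print(adjust_pvalues_bh([0.01, 0.04, 0.03, 0.005]))
# [0.02, 0.04, 0.04, 0.02]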
Example #5
def readAndGroupTable(infile, options):
    """read table from infile and group.
    """
    fields, table = CSV.readTable(infile,
                                  with_header=options.has_headers,
                                  as_rows=True)
    options.columns = getColumns(fields, options.columns)
    assert options.group_column not in options.columns

    converter = float
    new_fields = [fields[options.group_column]
                  ] + [fields[x] for x in options.columns]

    if options.group_function == "min":
        f = min
    elif options.group_function == "max":
        f = max
    elif options.group_function == "sum":
        f = sum
    elif options.group_function == "mean":
        f = numpy.mean
    elif options.group_function == "cat":
        f = lambda x: ";".join([y for y in x if y != ""])
        converter = str
    elif options.group_function == "uniq":
        f = lambda x: ";".join([y for y in set(x) if y != ""])
        converter = str
    elif options.group_function == "stats":
        f = lambda x: str(Stats.DistributionalParameters(x))
        # update headers
        new_fields = [fields[options.group_column]]
        for c in options.columns:
            new_fields += list([
                "%s_%s" % (fields[c], x)
                for x in Stats.DistributionalParameters().getHeaders()
            ])

    # convert values to floats (except for the group column).
    # Rows with unconvertible values in options.columns are skipped.
    new_table = []
    for row in table:
        skip = False
        new_row = [row[options.group_column]]

        for c in options.columns:
            if row[c] == options.missing_value:
                new_row.append(row[c])
            else:
                try:
                    new_row.append(converter(row[c]))
                except ValueError:
                    skip = True
                    break
        if not skip:
            new_table.append(new_row)
    table = new_table

    new_rows = CSV.groupTable(table, group_column=0, group_function=f)

    options.stdout.write("\t".join(new_fields) + "\n")
    for row in new_rows:
        options.stdout.write("\t".join(map(str, row)) + "\n")
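CSV.groupTable is likewise not shown; a stand-alone sketch of the group-then-aggregate step it presumably performs, assuming rows are keyed on column 0 (the helper name group_table is illustrative):

import itertools

def group_table(rows, group_function):
    # rows are [group_key, value1, value2, ...]; aggregate each value
    # column within a group with group_function
    out = []
    for key, group in itertools.groupby(
            sorted(rows, key=lambda r: r[0]), key=lambda r: r[0]):
        columns = list(zip(*group))[1:]
        out.append([key] + [group_function(c) for c in columns])
    return out

rows = [["geneA", 1.0, 2.0], ["geneB", 5.0, 6.0], ["geneA", 3.0, 4.0]]
print(group_table(rows, max))
# [['geneA', 3.0, 4.0], ['geneB', 5.0, 6.0]]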
Example #6
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--output-filename-pattern",
                      dest="output_filename_pattern",
                      type="string",
                      help="pattern for additional output files [%default].")

    parser.set_defaults(
        length=1000,
        minimum_coverage=0.90,
        maximum_reads=[1, 10, 20, 50, 100],
        output_filename_pattern="%s",
        normalize=True,
    )

    (options, args) = E.start(parser, add_csv_options=True)

    fields, table = CSV.readTable(sys.stdin, dictreader=CSV.DictReaderLarge)

    map_fields2column = {}
    for x in fields:
        map_fields2column[x] = len(map_fields2column)

    coverage_5prime = numpy.zeros(options.length, float)
    coverage_3prime = numpy.zeros(options.length, float)

    coverage_maxreads5prime = numpy.zeros(options.length, float)
    coverage_maxreads3prime = numpy.zeros(options.length, float)

    coverage_full5prime = numpy.zeros(options.length, float)
    coverage_full3prime = numpy.zeros(options.length, float)

    coverage_min5prime = numpy.zeros(options.length, float)
    coverage_min3prime = numpy.zeros(options.length, float)

    histograms = []
    for x in range(len(options.maximum_reads)):
        histograms.append([
            numpy.zeros(options.length, float),
            numpy.zeros(options.length, float), 0
        ])

    ninput, noutput, nfull, nmincov, nskipped, nlength, nmaxreads = 0, 0, 0, 0, 0, 0, 0
    for row in table:
        length, covered, meancov, data, nreads = (int(row["cov_nval"]),
                                                  float(row["cov_covered"]),
                                                  float(row["cov_mean"]),
                                                  row["cov_values"],
                                                  int(row["nover2"]))
        ninput += 1
        if length < options.length:
            nlength += 1
            continue

        if data == "na":
            nskipped += 1
            continue

        noutput += 1
        mincov = covered / length
        values = list(map(float, data.split(";")))
        m = max(values)
        values = [x / m for x in values]
        coverage_5prime += values[:options.length]
        coverage_3prime += values[-options.length:]

        if mincov >= 1.0:
            coverage_full5prime += values[:options.length]
            coverage_full3prime += values[-options.length:]
            nfull += 1

        if meancov >= options.minimum_coverage:
            coverage_min5prime += values[:options.length]
            coverage_min3prime += values[-options.length:]
            nmincov += 1

        for maxreads in range(len(options.maximum_reads)):
            if nreads <= options.maximum_reads[maxreads]:
                histograms[maxreads][0] += values[:options.length]
                histograms[maxreads][1] += values[-options.length:]
                histograms[maxreads][2] += 1

    if options.normalize:
        for x5, x3 in ((coverage_5prime, coverage_3prime),
                       (coverage_min5prime, coverage_min3prime),
                       (coverage_full5prime, coverage_full3prime)):
            m = max((max(x5), max(x3)))
            x3 /= m
            x5 /= m

        for x5, x3, c in histograms:
            m = max((max(x5), max(x3)))
            x5 /= m
            x3 /= m

    outfile = options.stdout
    outfile.write("\t".join(("distance", "minlen-5'", "minlen-3'", "mincov-5'",
                             "mincov-3'", "full-5'", "full-3'")) + "\n")

    for x in range(0, options.length):
        outfile.write("%i\t%s\n" % (x, "\t".join([
            "%6.4f" % v
            for v in (coverage_5prime[x], coverage_3prime[x],
                      coverage_min5prime[x], coverage_min3prime[x],
                      coverage_full5prime[x], coverage_full3prime[x])
        ])))

    outfile5 = IOTools.open_file(options.output_filename_pattern % "reads5",
                                 "w")
    outfile3 = IOTools.open_file(options.output_filename_pattern % "reads3",
                                 "w")

    outfile5.write("\t".join([
        "distance",
    ] + [
        "reads%i" % options.maximum_reads[y]
        for y in range(len(options.maximum_reads))
    ]) + "\n")
    outfile3.write("\t".join([
        "distance",
    ] + [
        "reads%i" % options.maximum_reads[y]
        for y in range(len(options.maximum_reads))
    ]) + "\n")
    for x in range(0, options.length):
        outfile5.write("%i\t%s\n" % (x, "\t".join([
            "%6.4f" % histograms[y][0][x]
            for y in range(len(options.maximum_reads))
        ])))
        outfile3.write("%i\t%s\n" % (x, "\t".join([
            "%6.4f" % histograms[y][1][x]
            for y in range(len(options.maximum_reads))
        ])))

    E.info(
        "ninput=%i, noutput=%i, nmaxreads=%i, nfull=%i, nmincov=%i, nskipped=%i, nlength=%i"
        % (ninput, noutput, nmaxreads, nfull, nmincov, nskipped, nlength))

    E.stop()
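The accumulate-then-normalize pattern above in isolation, as a short numpy sketch (array length and values are illustrative):

import numpy

length = 5
profile_5prime = numpy.zeros(length, float)
profile_3prime = numpy.zeros(length, float)

for values in ([1.0, 2.0, 4.0, 2.0, 1.0],
               [2.0, 4.0, 8.0, 4.0, 2.0]):
    values = numpy.array(values)
    values /= values.max()            # per-entry normalization
    profile_5prime += values[:length]
    profile_3prime += values[-length:]

# joint normalization so both ends share a single scale
m = max(profile_5prime.max(), profile_3prime.max())
profile_5prime /= m
profile_3prime /= m
print(profile_5prime)                 # [0.25 0.5  1.   0.5  0.25]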
Example #7
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id: data2bins.py 2782 2009-09-10 11:40:29Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("--column", dest="column", type="int",
                      help="column to split on.")

    parser.add_option("--num-bins", dest="num_bins", type="int",
                      help="number of bins to create.")

    parser.add_option("--method", dest="method", type="choice",
                      choices=("equal-sized-bins",),
                      help="method to use to bin data.")

    parser.add_option("--no-headers", dest="has_headers", action="store_false",
                      help="matrix has no row/column headers.")

    parser.add_option("-p", "--output-filename-pattern", dest="output_filename_pattern", type="string",
                      help="OUTPUT filename with histogram information on aggregate coverages [%default].")

    parser.set_defaults(
        has_headers=True,
        method="equal-sized-bins",
        column=1,
        num_bins=4,
        output_filename_pattern="bin%i",
    )

    (options, args) = E.start(parser)
    options.column -= 1

    if args:
        if args[0] == "-":
            infile = sys.stdin
        else:
            infile = IOTools.open_file(args[0], "r")
    else:
        infile = sys.stdin

    fields, data = CSV.readTable(infile)

    c = options.column
    values = [float(x[c]) for x in data]

    bins = []

    if options.method == "equal-sized-bins":
        increment = int(math.floor(float(len(values)) / options.num_bins))
        indices = list(range(0, len(values)))
        indices.sort(key=lambda x: values[x])
        for x in range(len(values)):
            values[indices[x]] = x
        bins = [increment * x for x in range(options.num_bins)]

    elif options.method == "pass":
        pass

    E.debug("bins=%s" % str(bins))

    outputters = []
    for x in range(0, len(bins)):
        outputters.append(
            Outputter(options.output_filename_pattern % x, fields))

    # output tables
    for x in range(0, len(data)):
        bin = bisect.bisect(bins, values[x]) - 1
        outputters[bin].write(data[x])

    # stats
    if options.loglevel >= 1:
        options.stdlog.write("# bin\tstart\tcounts\tfilename\n")
        for x in range(0, len(bins)):
            options.stdlog.write("# %i\t%f\t%i\t%s\n" % (
                x, bins[x], outputters[x].mCounts, outputters[x].mFilename))

    E.info("ninput=%i, noutput=%i" %
           (len(data), sum((x.mCounts for x in outputters))))

    E.stop()
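The rank-and-bisect binning step on its own, as a self-contained sketch (the helper name equal_sized_bins is illustrative):

import bisect

def equal_sized_bins(values, num_bins):
    # replace each value by its rank, then cut the rank range into
    # num_bins equally populated intervals
    increment = len(values) // num_bins
    ranks = [0] * len(values)
    order = sorted(range(len(values)), key=lambda i: values[i])
    for rank, i in enumerate(order):
        ranks[i] = rank
    bins = [increment * b for b in range(num_bins)]
    return [bisect.bisect(bins, r) - 1 for r in ranks]

print(equal_sized_bins([0.1, 9.0, 0.5, 3.0], num_bins=2))
# [0, 1, 0, 1]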
Example #8
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: csv_set.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-u",
                      "--unique",
                      dest="unique",
                      action="store_true",
                      help="output rows are uniq.")

    parser.add_option("-1",
                      "--join-fields1",
                      dest="join_fields1",
                      type="string",
                      help="join fields in first table.")
    parser.add_option("-2",
                      "--join-fields2",
                      dest="join_fields2",
                      type="string",
                      help="join fields in second table.")
    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      help="set operation to perform.",
                      choices=("intersection", "rest", "union"))

    parser.set_defaults(
        remove=False,
        unique=False,
        join_fields1=None,
        join_fields2=None,
        method="intersection",
    )

    (options, args) = E.start(parser, add_csv_options=True)

    if len(args) != 2:
        raise ValueError("please specify two files to join")

    if not options.join_fields1 or not options.join_fields2:
        raise ValueError("please specify at least one join field per table")

    options.join_fields1 = options.join_fields1.split(",")
    options.join_fields2 = options.join_fields2.split(",")

    options.filename1, options.filename2 = args

    fields1, table1 = CSV.readTable(open(options.filename1, "r"))
    fields2, table2 = CSV.readTable(open(options.filename2, "r"))

    if options.unique:
        outfile = UniqueBuffer(sys.stdout)
    else:
        outfile = options.stdout

    nfields1 = []
    for x in range(len(fields1)):
        if fields1[x] in options.join_fields1:
            nfields1.append(x)
    nfields2 = []
    for x in range(len(fields2)):
        if fields2[x] in options.join_fields2:
            nfields2.append(x)

    # calculate row indices: duplicate keys are not handled here
    keys = {}
    for row1 in table1:
        v = [row1[x] for x in nfields1]
        # hashlib requires bytes in Python 3
        key = hashlib.md5("".join(v).encode("utf-8")).digest()
        keys[key] = row1

    if options.method == "intersection":
        # build new field list
        take = list(range(len(fields1)))
        c = len(take)
        for x in fields2:
            if x not in options.join_fields2:
                take.append(c)
            c += 1

        t = fields1 + fields2

        new_fields = [t[x] for x in take]

        print("\t".join(new_fields))

        for row2 in table2:
            v = [row2[x] for x in nfields2]
            key = hashlib.md5("".join(v)).digest()
            if key in keys:
                new_row = keys[key] + row2
                outfile.write("\t".join([new_row[x] for x in take]) + "\n")

    elif options.method == "rest":

        new_fields = fields2
        print("\t".join(new_fields))

        for row2 in table2:
            v = [row2[x] for x in nfields2]
            key = hashlib.md5("".join(v)).digest()
            if key not in keys:
                outfile.write("\t".join(row2) + "\n")

    E.stop()
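A note on the keying step: tuples of the join fields are hashable, so the md5 digest can be avoided entirely; a sketch (data is illustrative):

rows1 = [["GO:0008150", "biol_process"], ["GO:0003674", "mol_function"]]
rows2 = [["GO:0008150", "extra"], ["GO:0099999", "other"]]
join_fields = [0]

# tuples of the join fields are hashable and need no md5 digest
keys = {tuple(row[i] for i in join_fields): row for row in rows1}

for row in rows2:
    key = tuple(row[i] for i in join_fields)
    if key in keys:  # intersection
        print("\t".join(keys[key] + row[1:]))
# GO:0008150	biol_process	extra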
Example #9
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: csv_intersection.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-u", "--unique", dest="unique", action="store_true",
                      help="output rows are unique.")

    parser.set_defaults(
        remove=False,
        unique=False,
    )

    (options, args) = E.start(parser, add_csv_options=True)

    if len(args) != 2:
        raise ValueError("please specify two files to join")

    options.filename1, options.filename2 = args

    fields1, table1 = CSV.readTable(IOTools.open_file(options.filename1, "r"))
    fields2, table2 = CSV.readTable(IOTools.open_file(options.filename2, "r"))

    if options.unique:
        outfile = UniqueBuffer(sys.stdout)
    else:
        outfile = options.stdout

    # intersect on complete rows: output rows of the first table
    # that also occur in the second table
    rows2 = set(tuple(row) for row in table2)

    outfile.write("\t".join(fields1) + "\n")
    for row1 in table1:
        if tuple(row1) in rows2:
            outfile.write("\t".join(row1) + "\n")

    E.stop()
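The dialect-driven reader/writer pattern used throughout these csv_* scripts, reduced to a self-contained csv.DictReader/DictWriter round trip (the inline table is illustrative):

import csv
import io
import sys

text = "gene\tscore\ngeneA\t1.0\ngeneB\t2.5\n"

reader = csv.DictReader(io.StringIO(text), dialect="excel-tab")
writer = csv.DictWriter(sys.stdout, ["gene", "score"],
                        dialect="excel-tab",
                        lineterminator="\n",
                        extrasaction="ignore")
writer.writeheader()
for row in reader:
    writer.writerow(row)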