Exemplo n.º 1
0
def main(argv=None):
    """Script entry point: apply a series of transformations to matrices.

    Parses command line options in sys.argv, unless *argv* is given.

    Matrices are read from stdin. Lines starting with ``#`` are dropped and
    lines starting with ``>`` separate consecutive matrices. Every method
    given via ``--method`` is applied in order to each matrix; the result is
    written to ``options.stdout``.

    NOTE(review): *argv* is accepted but not forwarded to ``E.Start`` - confirm
    whether ``E.Start`` takes an ``argv`` argument before wiring it through.

    Raises:
        IOError: if stdin contains no data lines.
        ValueError: if a method is applied to a matrix that does not satisfy
            its requirements (e.g. a non-square matrix for the diagonal based
            normalizations) or a mask method is used without its file option.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: matrix2matrix.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    # NOTE(review): the processing loop below also knows "normalize-by-row",
    # "minmax", "mul", "angle", "euclid", "sub" and "add-matrix", none of
    # which is listed here - presumably they cannot be selected from the
    # command line; confirm whether that is intended.
    parser.add_option("-m",
                      "--method",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=(
                          "normalize-by-min-diagonal",
                          "normalize-by-column",
                          "log",
                          "ln",
                          "negzero2value",
                          "set-diagonal",
                          "subtract-matrix",
                          "mix-matrix",
                          "normalize-by-matrix",
                          "normalize-by-column-max",
                          "normalize-by-row-max",
                          "normalize-by-column-min",
                          "normalize-by-row-min",
                          "normalize-by-column-median",
                          "normalize-by-row-median",
                          "normalize-by-column-mean",
                          "normalize-by-row-mean",
                          "normalize-by-column-total",
                          "normalize-by-row-total",
                          "correspondence-analysis",
                          "normalize-by-value",
                          "add-value",
                          "sort-rows",
                          "sort-columns",
                          "transpose",
                          "upper-bound",
                          "lower-bound",
                          "subtract-first-col",
                          "multiply-by-value",
                          "divide-by-value",
                          "mask-rows",
                          "mask-columns",
                          "mask-rows-and-columns",
                          "symmetrize-mean",
                          "symmetrize-max",
                          "symmetrize-min",
                      ),
                      help="""method to use [default=%default]""")

    parser.add_option("-s",
                      "--scale",
                      dest="scale",
                      type="float",
                      help="factor to scale matrix by [default=%default].")

    parser.add_option("-f",
                      "--format",
                      dest="format",
                      type="string",
                      help="output number format [default=%default].")

    parser.add_option("--filename-rows",
                      dest="filename_rows",
                      type="string",
                      help="filename with rows to mask [default=%default].")

    parser.add_option("--filename-columns",
                      dest="filename_columns",
                      type="string",
                      help="filename with columns to mask [default=%default].")

    parser.add_option("-p",
                      "--parameters",
                      dest="parameters",
                      type="string",
                      help="Parameters for various functions.")

    parser.add_option("-t",
                      "--headers",
                      dest="headers",
                      action="store_true",
                      help="matrix has row/column headers.")

    parser.add_option("--no-headers",
                      dest="headers",
                      action="store_false",
                      help="matrix has no row/column headers.")

    parser.add_option("-a",
                      "--value",
                      dest="value",
                      type="float",
                      help="value to use for various algorithms.")

    parser.add_option("-i",
                      "--input-format",
                      dest="input_format",
                      type="choice",
                      choices=("full", "sparse", "phylip"),
                      help="""input format for matrix.""")

    parser.add_option("-o",
                      "--output-format",
                      dest="output_format",
                      type="choice",
                      choices=("full", "sparse", "phylip"),
                      help="""output format for matrix.""")

    parser.add_option(
        "--missing",
        dest="missing",
        type="float",
        help=
        "value to use for missing values. If not set, missing values will cause the script to fail [default=%default]."
    )

    parser.set_defaults(
        methods=[],
        scale=1.0,
        headers=True,
        format="%6.4f",
        output_format="full",
        input_format="full",
        value=0.0,
        parameters="",
        write_separators=True,
        filename_rows=None,
        filename_columns=None,
        missing=None,
    )

    (options, args) = E.Start(parser)

    # positional parameters consumed one by one by the methods that need them
    options.parameters = options.parameters.split(",")

    # Build real lists: len(), indexing and append() are needed below, which
    # a lazy filter object does not provide.
    lines = [
        line for line in sys.stdin.readlines() if not line.startswith("#")
    ]

    if not lines:
        raise IOError("no input")

    # indices of the separator lines introducing each matrix
    chunks = [i for i, line in enumerate(lines) if line.startswith(">")]

    if not chunks:
        # a single matrix without separator: start reading at line 0
        options.write_separators = False
        chunks = [-1]

    chunks.append(len(lines))

    # names to mask; stay None unless the corresponding file was supplied
    row_names = None
    column_names = None
    if options.filename_rows:
        with open(options.filename_rows, "r") as infile:
            row_names, n = IOTools.ReadList(infile)
    if options.filename_columns:
        with open(options.filename_columns, "r") as infile:
            column_names, n = IOTools.ReadList(infile)

    # reducers selected by the suffix of normalize-by-{column,row}-<suffix>
    reducers = {
        "max": max,
        "min": min,
        "median": numpy.median,
        "mean": numpy.mean,
        "total": sum,
    }

    for start, end in zip(chunks[:-1], chunks[1:]):

        try:
            raw_matrix, row_headers, col_headers = MatlabTools.readMatrix(
                StringIO.StringIO("".join(lines[start + 1:end])),
                format=options.input_format,
                headers=options.headers,
                missing=options.missing)
        except ValueError as msg:
            E.warn("matrix could not be read: %s" % msg)
            continue

        nrows, ncols = raw_matrix.shape

        E.debug("read matrix: %i x %i, %i row titles, %i colum titles" %
                (nrows, ncols, len(row_headers), len(col_headers)))

        # index of the next unused entry in options.parameters
        parameter = 0

        for method in options.methods:

            # work on a copy; raw_matrix keeps the state before this method
            matrix = numpy.reshape(numpy.array(raw_matrix), raw_matrix.shape)

            # Methods combining two matrices first load the second matrix.
            # This must not be part of the if/elif chain below, otherwise
            # the actual operation would never be applied.
            if method in ("normalize-by-matrix", "subtract-matrix",
                          "mix-matrix", "add-matrix"):

                with open(options.parameters[parameter], "r") as infile:
                    other_matrix, other_row_headers, other_col_headers = MatlabTools.ReadMatrix(
                        infile, headers=options.headers)

                other_nrows, other_ncols = other_matrix.shape

                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# read second matrix from %s: %i x %i, %i row titles, %i colum titles.\n"
                        % (options.parameters[parameter], other_nrows,
                           other_ncols, len(other_row_headers),
                           len(other_col_headers)))

                parameter += 1

            if method == "normalize-by-min-diagonal":
                # divide each cell by the smaller of its two diagonal values
                for x in range(nrows):
                    for y in range(ncols):
                        m = min(raw_matrix[x, x], raw_matrix[y, y])
                        if m > 0:
                            matrix[x, y] = raw_matrix[x, y] / m

            elif method == "normalize-by-column":
                if nrows != ncols:
                    raise ValueError("only supported for symmeric matrices.")

                # divide each cell by the diagonal value of its column
                for x in range(nrows):
                    for y in range(ncols):
                        if raw_matrix[y, y] > 0:
                            matrix[x, y] = raw_matrix[x, y] / raw_matrix[y, y]

            elif method == "normalize-by-value":
                matrix = raw_matrix / float(options.parameters[parameter])
                parameter += 1

            elif method == "normalize-by-row":
                if nrows != ncols:
                    raise ValueError("only supported for symmeric matrices.")

                # divide each cell by the diagonal value of its row; the
                # guard has to test the same value that is used as divisor
                for x in range(nrows):
                    for y in range(ncols):
                        if raw_matrix[x, x] > 0:
                            matrix[x, y] = raw_matrix[x, y] / raw_matrix[x, x]

            elif method == "subtract-first-col":
                for x in range(nrows):
                    for y in range(ncols):
                        matrix[x, y] -= raw_matrix[x, 0]

            elif method.startswith("normalize-by-column"):
                f = reducers[method.rsplit("-", 1)[1]]

                for y in range(ncols):
                    m = f(matrix[:, y])
                    if m != 0:
                        for x in range(nrows):
                            matrix[x, y] = matrix[x, y] / m

            elif method.startswith("normalize-by-row"):
                f = reducers[method.rsplit("-", 1)[1]]

                for x in range(nrows):
                    m = f(matrix[x, :])
                    if m != 0:
                        for y in range(ncols):
                            matrix[x, y] = raw_matrix[x, y] / m

            elif method == "negzero2value":
                # set zero/negative values to a value
                for x in range(nrows):
                    for y in range(ncols):
                        if matrix[x, y] <= 0:
                            matrix[x, y] = options.value

            elif method == "minmax":
                # order each pair of mirrored cells (x, y) / (y, x)
                for x in range(nrows):
                    for y in range(ncols):
                        matrix[x, y], matrix[y, x] = \
                            min(matrix[x, y], matrix[y, x]), \
                            max(matrix[x, y], matrix[y, x])

            elif method == "log":
                # apply log10 to all positive values.
                for x in range(nrows):
                    for y in range(ncols):
                        if matrix[x, y] > 0:
                            matrix[x, y] = math.log10(matrix[x, y])

            elif method == "ln":
                # apply the natural logarithm to all positive values.
                for x in range(nrows):
                    for y in range(ncols):
                        if matrix[x, y] > 0:
                            matrix[x, y] = math.log(matrix[x, y])

            elif method == "transpose":
                matrix = numpy.transpose(matrix)
                row_headers, col_headers = col_headers, row_headers
                nrows, ncols = ncols, nrows

            elif method == "mul":
                # matrix times its transpose: result is nrows x nrows
                matrix = numpy.dot(matrix, numpy.transpose(matrix))
                col_headers = row_headers
                ncols = nrows

            elif method == "multiply-by-value":
                matrix *= options.value

            elif method == "divide-by-value":
                matrix /= options.value

            elif method == "add-value":
                matrix += options.value

            elif method == "angle":
                # write angles between col vectors
                v1 = numpy.sqrt(numpy.sum(numpy.power(matrix, 2), 0))
                matrix = numpy.dot(numpy.transpose(matrix), matrix)
                row_headers = col_headers
                nrows = ncols
                for x in range(nrows):
                    for y in range(ncols):
                        matrix[x, y] /= v1[x] * v1[y]

            elif method == "euclid":
                # convert to euclidean distance matrix between columns
                matrix = numpy.zeros((ncols, ncols), float)
                for c1 in range(0, ncols - 1):
                    for c2 in range(c1 + 1, ncols):
                        for r in range(0, nrows):
                            d = raw_matrix[r][c1] - raw_matrix[r][c2]
                            matrix[c1, c2] += (d * d)
                        matrix[c2, c1] = matrix[c1, c2]
                matrix = numpy.sqrt(matrix)
                row_headers = col_headers
                nrows = ncols

            elif method.startswith("symmetrize"):
                f = method.split("-")[1]
                if f == "max":
                    f = max
                elif f == "min":
                    f = min
                elif f == "mean":
                    f = lambda x, y: float(x + y) / 2

                if nrows != ncols:
                    raise ValueError(
                        "symmetrize only available for symmetric matrices")
                if row_headers != col_headers:
                    raise ValueError(
                        "symmetrize not available for permuted matrices")
                for x in range(nrows):
                    for y in range(ncols):
                        combined = f(matrix[x, y], matrix[y, x])
                        matrix[x, y] = matrix[y, x] = combined

            elif method == "sub":
                matrix = options.value - matrix

            elif method in ("lower-bound", "upper-bound"):
                # two parameters: the boundary and the replacement value
                boundary = float(options.parameters[parameter])
                new_value = float(options.parameters[parameter + 1])
                parameter += 2
                if method == "upper-bound":
                    for x in range(nrows):
                        for y in range(ncols):
                            if matrix[x, y] > boundary:
                                matrix[x, y] = new_value
                else:
                    for x in range(nrows):
                        for y in range(ncols):
                            if matrix[x, y] < boundary:
                                matrix[x, y] = new_value

            elif method == "subtract-matrix":
                matrix = matrix - other_matrix

            elif method == "add-matrix":
                matrix = matrix + other_matrix

            elif method == "normalize-by-matrix":

                # set 0s to 1 in the other matrix to avoid division by zero
                for x in range(nrows):
                    for y in range(ncols):
                        if other_matrix[x, y] == 0:
                            other_matrix[x, y] = 1.0

                matrix = matrix / other_matrix

            elif method == "mix-matrix":
                # take the cells above the diagonal from the other matrix
                for x in range(len(other_row_headers) - 1):
                    for y in range(x + 1, len(other_col_headers)):
                        matrix[x, y] = other_matrix[x, y]

            elif method == "set-diagonal":
                value = float(options.parameters[parameter])
                for x in range(min(nrows, ncols)):
                    matrix[x, x] = value
                parameter += 1

            elif method == "correspondence-analysis":
                row_indices, col_indices = CorrespondenceAnalysis.GetIndices(
                    raw_matrix)
                map_row_new2old = numpy.argsort(row_indices)
                map_col_new2old = numpy.argsort(col_indices)

                matrix, row_headers, col_headers = CorrespondenceAnalysis.GetPermutatedMatrix(
                    raw_matrix,
                    map_row_new2old,
                    map_col_new2old,
                    row_headers=row_headers,
                    col_headers=col_headers)

            elif method == "mask-rows":
                if row_names is None:
                    raise ValueError("mask-rows requires --filename-rows")
                r = set(row_names)
                for x in range(len(row_headers)):
                    if row_headers[x] in r:
                        matrix[x, :] = options.value

            elif method == "mask-columns":
                if column_names is None:
                    raise ValueError(
                        "mask-columns requires --filename-columns")
                r = set(column_names)
                for x in range(len(col_headers)):
                    if col_headers[x] in r:
                        matrix[:, x] = options.value

            elif method == "mask-rows-and-columns":
                if row_names is None or column_names is None:
                    raise ValueError(
                        "mask-rows-and-columns requires --filename-rows "
                        "and --filename-columns")
                r = set(row_names)
                c = set(column_names)
                for x in range(len(row_headers)):
                    for y in range(len(col_headers)):
                        if row_headers[x] in r and col_headers[y] in c:
                            matrix[x, y] = options.value

            # the result becomes the input of the next method
            raw_matrix = numpy.reshape(numpy.array(matrix), matrix.shape)

        # raw_matrix holds the final state; this also covers simple
        # re-formatting jobs where no method was given.
        matrix = raw_matrix

        if options.write_separators:
            options.stdout.write(lines[start])

        # write to the same stream as the separator so that both stay together
        MatlabTools.writeMatrix(options.stdout,
                                matrix,
                                value_format=options.format,
                                format=options.output_format,
                                row_headers=row_headers,
                                col_headers=col_headers)

    # counterpart of E.Start
    E.Stop()
Exemplo n.º 2
0
def main(argv=None):
    """Script entry point: plot values collected from several input files.

    Parses command line options in sys.argv, unless *argv* is given.

    Each positional argument yields one column of the plot. The values of
    every column are read with ``Collect``, rows are the union of all
    annotations seen, and the resulting ``GoPlot`` is written to
    ``options.stdout``.

    NOTE(review): *argv* is accepted but not forwarded to ``E.Start``, and
    the ``--fdr`` option is parsed but not used in this function - confirm
    whether either is intended.

    Raises:
        IOError: if no input file is supplied or no values could be read
            from any of the files.
        ValueError: if the number of column titles does not match the number
            of input files.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id")

    parser.add_option("-e",
                      "--header-names",
                      dest="headers",
                      action="store_true",
                      help="first row is a header [ignored].")
    parser.add_option("-t",
                      "--title",
                      dest="title",
                      type="string",
                      help="page title.")
    parser.add_option("-f",
                      "--footer",
                      dest="footer",
                      type="string",
                      help="page footer.")
    parser.add_option("--maxP",
                      dest="max_pvalue",
                      type="float",
                      help="maximum P-value displayed [default=%default].")
    parser.add_option(
        "--maxQ",
        dest="max_qvalue",
        type="float",
        help="maximum Q-value for controlling for FDR [default=%default].")
    parser.add_option(
        "-c",
        "--column-titles",
        dest="col_names",
        type="string",
        help="comma separated list of column titles [default: use filenames].")
    parser.add_option("-p",
                      "--pattern-filename",
                      dest="pattern_filename",
                      type="string",
                      help="pattern to map columns to filename.")
    parser.add_option("-A",
                      "--Annotator",
                      dest="annotator",
                      action="store_true",
                      help="use Annotator-style input files.")
    parser.add_option(
        "--annotator-fdr",
        dest="annotator_fdr",
        action="store_true",
        help="use fdr computed from annotator [default=%default].")
    parser.add_option("-T",
                      "--thresholds",
                      dest="thresholds",
                      type="string",
                      help="7 comma-separated fold-change threshold values")
    parser.add_option("-P",
                      "--pvalues",
                      dest="pvalues",
                      type="string",
                      help="6 comma-separated p value threshold values")
    parser.add_option("-C",
                      "--altcolours",
                      dest="altcolours",
                      action="store_true",
                      help="Use alternative colour palette")
    parser.add_option("-X",
                      "--delimiters",
                      dest="delims",
                      type="string",
                      help="Delimiter characters for annotation label")
    parser.add_option("-Z",
                      "--ignore",
                      dest="ignore",
                      type="string",
                      help="Ignored characters in annotation label")
    parser.add_option(
        "--fdr",
        dest="fdr",
        type="float",
        help=
        "filter output by FDR (requires annotator output). [default=%default]")
    parser.add_option("-a",
                      "--template",
                      dest="template",
                      type="choice",
                      choices=("screen", "publication"),
                      help="layout template to choose - affects colours.")
    parser.add_option(
        "--sort-columns",
        dest="sort_columns",
        type="choice",
        choices=(
            "unsorted",
            "similarity",
            "alphabetical",
        ),
        help=
        "sort columns. The default, unsorted, list columns in the order that they are supplied on the command line [default=%default]"
    )

    parser.set_defaults(
        sortAlphabetically=True,
        headers=False,
        col_names="",
        pattern_filename=None,
        title="",
        footer="",
        max_pvalue=None,
        max_qvalue=None,
        annotator=False,
        thresholds="0.25,0.33,0.5,1.0,2.0,3.0,4.0",
        pvalues="0.00001,0.0001,0.001,0.01,0.1",
        altcolours=False,
        delims="",
        ignore="",
        template="screen",
        annotator_fdr=False,
        fdr=None,
        sort_columns="unsorted",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) == 0:
        raise IOError("Please supply at least one input file.")

    if options.pattern_filename:
        # positional arguments are column names; derive the filenames
        col_names = list(args)
        filenames = [options.pattern_filename % x for x in col_names]
    else:
        filenames = list(args)

        if options.col_names:
            col_names = options.col_names.split(",")
            if len(col_names) != len(filenames):
                raise ValueError(
                    "Number of col_names and files different: %i != %i" %
                    (len(col_names), len(filenames)))
        else:
            col_names = list(filenames)

    E.info("reading data for %i columns" % len(filenames))

    columns = []
    # one flag per column, as returned by Collect (third return value)
    errors = []

    for col_name, filename in zip(col_names, filenames):

        E.debug("reading data for column %s from %s " % (col_name, filename))
        # collect all columns
        try:
            infile = IOTools.openFile(filename, "r")
            try:
                values, nremoved, no_fdr = Collect(
                    infile,
                    with_headers=options.headers,
                    annotator_format=options.annotator,
                    delims=options.delims,
                    ignore=options.ignore,
                    use_annotator_fdr=options.annotator_fdr,
                    max_pvalue=options.max_pvalue,
                    max_qvalue=options.max_qvalue)
            finally:
                infile.close()
        except IOError:
            E.warn("no data from %s" % filename)
            values = []
            no_fdr = False
            nremoved = 0

        E.info("read %i values from %s: %i significant, %i removed" %
               (len(values) + nremoved, filename, len(values), nremoved))
        columns.append((col_name, values))
        errors.append(no_fdr)

    # count the values per column, not the (name, values) tuples themselves
    if sum(len(values) for _, values in columns) == 0:
        raise IOError("no data read - please check supplied files.")

    # collect all annotations
    annotations = set()
    for col_name, column in columns:
        for d in column:
            annotations.add(d.mAnnotation)

    E.info("There are %i rows" % len(annotations))

    # sort and filter annotations
    # (Code removed which did some filtering; the annotations data is not used)
    # By removing labels from annlist you can select the annotations you want
    # to display
    row_names = list(annotations)
    if options.sortAlphabetically:
        row_names.sort()

    # Whenever the columns are reordered, the per-column flags in `errors`
    # are permuted in the same way.
    # NOTE(review): assumes GoPlot reads mark_columns positionally, aligned
    # with col_names - confirm against GoPlot.
    col_order = None
    if options.sort_columns == "unsorted":
        pass
    elif options.sort_columns == "alphabetical":
        # sorted() is stable, giving the same order as list.sort()
        col_order = sorted(range(len(col_names)),
                           key=lambda i: col_names[i])
    elif options.sort_columns == "similarity":
        if len(row_names) * len(col_names) > 10000:
            E.info("no sorting as matrix too large")
        else:
            matrix = numpy.ones((len(row_names), len(col_names)), float)
            map_rows = dict((name, i) for i, name in enumerate(row_names))
            for x, (col_name, column) in enumerate(columns):
                for d in column:
                    matrix[map_rows[d.mAnnotation], x] = d.mFoldChange
            row_indices, col_indices = CorrespondenceAnalysis.GetIndices(
                matrix)
            map_row_new2old = numpy.argsort(row_indices)
            map_col_new2old = numpy.argsort(col_indices)
            row_names = [
                row_names[map_row_new2old[x]] for x in range(len(row_names))
            ]
            col_order = [
                map_col_new2old[x] for x in range(len(col_names))
            ]

    if col_order is not None:
        col_names = [col_names[i] for i in col_order]
        errors = [errors[i] for i in col_order]

    E.info("columns have been sorted")

    plot = GoPlot(row_names,
                  col_names,
                  thresholds_size=tuple(map(float,
                                            options.pvalues.split(','))),
                  thresholds_colour=tuple(
                      map(float, options.thresholds.split(','))),
                  template=options.template,
                  alt_colours=options.altcolours,
                  max_pvalue=options.max_pvalue,
                  max_qvalue=options.max_qvalue,
                  mark_columns=errors)

    if options.title:
        plot.setTitle(options.title)
    if options.footer:
        plot.setFooter(options.footer)

    plot.initializePlot()

    for col_name, column in columns:
        for d in column:
            plot.addValue(d.mAnnotation, col_name, d.mPValue, d.mFoldChange)

    plot.writeToFile(options.stdout)

    E.Stop()