Пример #1
0
def merge_two_files(A_file, B_file, handle):
    """Merge two matrix files side by side and write the result to handle.

    Both files are read with arrayio and their rows are aligned with
    matrixlib.align_rows.  Each output row is the row from A_file
    followed by the matching row from B_file.  Only column annotations
    that occur in both matrices are kept; a column name from B_file that
    already occurs in A_file under the same annotation gets a '_2'
    suffix.
    """
    import arrayio
    from genomicode import Matrix
    from genomicode import matrixlib

    matrix_a = arrayio.read(A_file)
    matrix_b = arrayio.read(B_file)
    assert arrayio.tab_delimited_format.is_matrix(matrix_a)
    assert arrayio.tab_delimited_format.is_matrix(matrix_b)
    matrix_a, matrix_b = matrixlib.align_rows(matrix_a, matrix_b)
    assert matrix_a.nrow() > 0, 'there is no common genes between two files'

    # Concatenate each aligned pair of data rows.
    merged_rows = [
        matrix_a._X[i] + matrix_b._X[i] for i in range(matrix_a.dim()[0])
    ]

    merged_col_names = {}
    for header in matrix_a._col_names:
        if header not in matrix_b._col_names:
            continue
        names_a = matrix_a._col_names[header]
        renamed_b = []
        for sample in matrix_b._col_names[header]:
            if sample in names_a:
                # Name clash with the first file; mark the second copy.
                renamed_b.append(sample + '_2')
            else:
                renamed_b.append(sample)
        merged_col_names[header] = names_a + renamed_b

    merged = Matrix.InMemoryMatrix(
        merged_rows, matrix_a._row_names, merged_col_names,
        matrix_a._row_order)
    arrayio.tab_delimited_format.write(merged, handle)
Пример #2
0
def summarize_predictions(file_layout):
    """Write the predictions as a subgroup x sample table.

    Reads the dataset (file_layout.DATASET) for the sample names, the
    prediction matrix (file_layout.SELAP_PREDICT) for the probabilities
    and clust.txt inside file_layout.SMODEL_ZIP for the subgroup names,
    then writes a tab-delimited table to file_layout.PREDICTIONS_PCL
    with one row per subgroup and one column per sample.

    Raises AssertionError if the number of prediction rows differs from
    the number of samples, if clust.txt is missing from the model, or
    if the number of names differs from the number of subgroups.
    """
    import zipfile
    import arrayio
    from genomicode import archive

    # Load the original dataset.  Should be pathway x sample.
    M_data = arrayio.read(file_layout.DATASET)
    sample_names = M_data.col_names(arrayio.COL_ID)

    # Read the predictions.  Will be a sample x probability matrix.
    M_predict = arrayio.read(file_layout.SELAP_PREDICT)
    assert M_predict.nrow() == len(sample_names)
    num_subgroups = M_predict.ncol()

    # Read the cluster names from the model.
    s2f = archive.unzip_dict(file_layout.SMODEL_ZIP)
    assert "clust.txt" in s2f
    zfile = zipfile.ZipFile(file_layout.SMODEL_ZIP)
    try:
        # NOTE(review): the names keep their line endings here;
        # presumably _clean_many below normalises them -- confirm.
        clust_names = zfile.open(s2f["clust.txt"]).readlines()
    finally:
        # Always release the archive, even if clust.txt cannot be read.
        zfile.close()
    assert len(clust_names) == num_subgroups, \
        "I have %d subgroups but %d names." % (
            num_subgroups, len(clust_names))

    # Save a subgroup x sample matrix.
    clean_many = arrayio.tab_delimited_format._clean_many
    handle = open(file_layout.PREDICTIONS_PCL, 'w')
    try:
        header = clean_many(["Subgroup"] + sample_names)
        handle.write("\t".join(header) + "\n")
        for i in range(num_subgroups):
            probs = M_predict.value(None, i)
            row = clean_many(map(str, [clust_names[i]] + probs))
            handle.write("\t".join(row) + "\n")
    finally:
        # Close the output even when a row fails to format.
        handle.close()
Пример #3
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Write a two-class CLS file for a pair of signal files.

        antecedents holds two signal nodes.  The samples of the first
        are labelled 0 and the samples of the second are labelled 1.
        Raises AssertionError if a sample name occurs more than once
        across the two files, or if the nodes' "contents" attributes
        are not "class0" and "class1" respectively.
        """
        import arrayio
        from genomicode import arraysetlib

        node_class0, node_class1 = antecedents
        matrix0 = arrayio.read(node_class0.identifier)
        matrix1 = arrayio.read(node_class1.identifier)
        names0 = matrix0.col_names(arrayio.COL_ID)
        names1 = matrix1.col_names(arrayio.COL_ID)

        # Make sure no duplicate sample names.
        observed = set()
        repeated = set()
        for name in names0 + names1:
            if name in observed:
                repeated.add(name)
            observed.add(name)
        repeated = sorted(repeated)
        assert not repeated, \
            "Duplicate sample names: %s" % ", ".join(repeated)

        assert node_class0.data.attributes["contents"] == "class0"
        assert node_class1.data.attributes["contents"] == "class1"
        classes = [0] * len(names0) + [1] * len(names1)
        arraysetlib.write_cls_file(outfile, "class0", "class1", classes)
Пример #4
0
def make_model(selap_path, penalty, file_layout, matlab):
    """Run SELAP with the given penalty and package the result as a model.

    The mu/sig/prob files produced by SELAP are moved into a directory
    named after file_layout.SMODEL_ZIP (with ".zip" removed), var.txt
    and clust.txt are generated next to them, the directory is zipped
    with archive.zip_path and the resulting archive is passed to
    check_model.
    """
    import arrayio
    from genomicode import parselib
    from genomicode import archive
    from genomicode import selap

    print("Generating subgroups with penalty %d." % penalty)
    selap_output = selap.selap_make_raw(
        file_layout.SELAP_DATASET,
        penalty,
        matlab_bin=matlab,
        selap_path=selap_path,
        outpath=file_layout.SELAP)
    print(selap_output)

    # (file written by SELAP, name it gets inside the model directory)
    selap_files = [
        (file_layout.SELAP_MU, "mu.txt"),
        (file_layout.SELAP_SIG, "sig.txt"),
        (file_layout.SELAP_PROB, "prob.txt"),
    ]

    # Make sure SELAP ran correctly.
    msg = "Missing file.  SELAPver3 did not run correctly."
    for produced, _ in selap_files:
        assert os.path.exists(produced), msg

    # Figure out the number of variables and the number of subgroups.
    mu_matrix = arrayio.read(file_layout.SELAP_MU)
    num_vars, num_subgroups = mu_matrix.dim()

    # Make the model directory.
    model_dir = file_layout.SMODEL_ZIP.replace(".zip", "")
    if not os.path.exists(model_dir):
        os.mkdir(model_dir)

    # Move over the files generated by SELAP.
    for produced, model_name in selap_files:
        os.rename(produced, os.path.join(model_dir, model_name))

    # Generate the var.txt file: one variable name per line.
    dataset = arrayio.read(file_layout.DATASET)
    assert dataset.nrow() == num_vars
    var_names = dataset.row_names(arrayio.ROW_ID)
    assert len(var_names) == num_vars
    handle = open(os.path.join(model_dir, "var.txt"), 'w')
    for var_name in var_names:
        handle.write("%s\n" % (var_name,))
    handle.close()

    # Generate the clust.txt file.
    # Set the names of the subgroups to a reasonable default.
    group_names = [
        "GROUP%s" % label
        for label in parselib.pretty_range(0, num_subgroups)
    ]
    handle = open(os.path.join(model_dir, "clust.txt"), 'w')
    for group_name in group_names:
        handle.write("%s\n" % (group_name,))
    handle.close()

    archive.zip_path(model_dir, noclobber=False)
    assert os.path.exists(file_layout.SMODEL_ZIP)
    check_model(file_layout.SMODEL_ZIP)
Пример #5
0
def correlation_for_file(data_file, label_file, gene_num=True):
    """Return gene names ordered by decreasing correlation value.

    data_file is read once with arrayio to compute the correlation of
    every gene against the class labels from label_file, and once as
    plain text to pick up the gene names (first tab-separated field of
    every line after the first two lines, i.e. PCL layout).

    gene_num is the number of top-ranked genes to return.  The default,
    True, means "return every gene".
    """
    # obtain the class label
    # Only the label line is needed.  Indexing instead of unpacking keeps
    # this working whether read() returns two values or three (other
    # callers of read_label_file.read unpack three).
    label_line = read_label_file.read(label_file)[1]
    # read the data_file and calculate the correlation
    M = arrayio.read(data_file)
    p = correlation(M, label_line)
    # sort the correlation value and obtain the
    # list of the gene index after sorting
    c = sorted(p, reverse=True)
    sortlist = find_sorted_index(p, c)
    # obtain the gene name in the data_file
    with open(data_file) as f:
        lines = f.read().split('\n')
    index = 0  # for pcl file
    startrows = 2  # for pcl file
    genelist = [line.split('\t')[index] for line in lines[startrows:]]
    # get a list of selected gene name
    if gene_num is not True:
        sortlist = sortlist[0:gene_num]
    return [genelist[i] for i in sortlist]
Пример #6
0
def summarize_subgroups(outpath, num_analyses, penalties):
    """Write a table with the number of subgroups found for each penalty.

    For every penalty the number of rows of GLOBAL_PREDICTIONS_PCL is
    taken as the number of subgroups.  The table is written to the
    SUMMARY file of the layout belonging to the largest penalty, with
    penalties sorted from big to small.  Does nothing when penalties
    is empty.
    """
    import arrayio

    if not penalties:
        return

    penalty2subgroups = {}
    for penalty in penalties:
        layout = make_file_layout(outpath, num_analyses, penalty)
        predictions = arrayio.read(layout.GLOBAL_PREDICTIONS_PCL)
        penalty2subgroups[penalty] = predictions.nrow()

    # Write output, with penalties sorted from big to small.
    ordered = sorted(penalty2subgroups, reverse=True)
    layout = make_file_layout(outpath, num_analyses, ordered[0])
    handle = open(layout.SUMMARY, 'w')
    handle.write("\t".join(["Penalty", "Num Subgroups"]) + "\n")
    for penalty in ordered:
        row = (penalty, penalty2subgroups[penalty])
        handle.write("\t".join(map(str, row)) + "\n")
    handle.close()
Пример #7
0
def score_many(jobs, lock=None):
    """Score every job and return the merged results.

    Each job is a tuple (gs_name, pos_genes, neg_genes, matrix_name,
    matrix_file, any_matching_gene_sets).  Matrices are read once per
    file and reused.  A job with positive or negative genes is scored
    with score_gene_set; a job where both are None is scored with
    score_gene.

    Returns a dict of (matrix_name, gs_name, index, sample) ->
    GeneSetScore or GeneScore.
    """
    import arrayio

    matrices = {}  # matrix_file -> matrix, to avoid re-reading
    results = {}
    for job in jobs:
        (gs_name, pos_genes, neg_genes, matrix_name, matrix_file,
         any_matching_gene_sets) = job
        if matrix_file not in matrices:
            matrices[matrix_file] = arrayio.read(matrix_file)
        MATRIX = matrices[matrix_file]
        assert not has_missing_values(MATRIX), \
               "Matrix %s has missing values." % matrix_name
        if not pos_genes and not neg_genes:
            # No genes at all: only valid when both are None, in which
            # case the job scores a single gene.
            assert pos_genes != [] or neg_genes != [], \
                   "Empty gene set: %s" % gs_name
            assert pos_genes is None, "Has pos genes: %s" % gs_name
            assert neg_genes is None, "Has neg genes: %s" % gs_name
            scores = score_gene(gs_name, matrix_name, MATRIX)
        else:
            scores = score_gene_set(
                gs_name, pos_genes, neg_genes, matrix_name, MATRIX,
                any_matching_gene_sets, lock=lock)
        # TODO: should make sure we don't overwrite previous results.
        results.update(scores)
    return results
Пример #8
0
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            outfile):
        """Log2-transform the signal file and write it as tab-delimited text.

        The matrix in in_data.identifier must not already be logged
        (checked with binreg.is_logged_array_data).  Missing values
        (None) are left untouched; every other value is converted to
        float, raised to 1 if it is smaller, and replaced by its base-2
        logarithm.  The result is written to outfile.
        """
        import math
        import arrayio
        from genomicode import filelib
        from genomicode import binreg

        signal_file = in_data.identifier
        filelib.assert_exists_nz(signal_file)

        M = arrayio.read(signal_file)
        assert not binreg.is_logged_array_data(M), 'the file is logged'
        # Change the matrix in place.
        for row in M._X:
            for j, value in enumerate(row):
                if value is None:
                    continue
                value = float(value)
                if value < 1:
                    # Floor at 1 so the logarithm is defined and >= 0.
                    value = 1
                row[j] = math.log(value, 2)

        M_c = arrayio.convert(M, to_format=arrayio.tab_delimited_format)

        # Close the output explicitly so it is flushed before returning.
        with open(outfile, 'w') as handle:
            arrayio.tab_delimited_format.write(M_c, handle)
Пример #9
0
def read_matrices(filenames, cache=None):
    """Read a list of matrices and align them.  filenames is a list of
    the matrix files to read.  Returns a tuple where the first element
    is a list of the matrices read, and the second is the aligned
    matrix.

    cache is an optional dictionary of filename to matrix.  This can
    be used to prevent re-reading of matrices.

    """
    import copy
    import arrayio
    import filelib

    # Fail early, before reading anything, if any of the files is missing.
    for filename in filenames:
        assert filelib.exists(filename), "File not found: %s" % filename

    # Load the files.
    DATA = []
    for filename in filenames:
        if cache is not None and filename in cache:
            # Cache hit: use a deep copy of the cached matrix rather than
            # the cached object itself.
            x = copy.deepcopy(cache[filename])
        else:
            try:
                x = arrayio.read(filename)
            # These three are re-raised unchanged; every other exception
            # is replaced by one that names the offending file.
            except (SystemError, KeyboardInterrupt, MemoryError), x:
                raise
            except Exception, x:
                # Can diagnose which file failed here.
                # raise
                raise Exception, "Problem reading %s: %s" % (repr(filename),
                                                             str(x))
            # The freshly read matrix object itself (not a copy) is stored.
            if cache is not None:
                cache[filename] = x
    # NOTE(review): the visible code stops here -- DATA is never appended
    # to and nothing is returned, although the docstring promises a
    # tuple.  Presumably the rest of the function is missing from this
    # excerpt; confirm against the full source.
Пример #10
0
def read_gene_expression(filename):
    """Return the matrix that arrayio reads from filename.

    The file must exist; this is enforced with an assertion.
    """
    import os
    import arrayio

    assert os.path.exists(filename)
    return arrayio.read(filename)
Пример #11
0
def format_firehose_rsem(filename, output):
    """Rewrite a Firehose RSEM matrix with separate gene ID / symbol columns.

    The input must have "Hybridization REF" as its only row annotation,
    and every entry there must have the form "<symbol>|<id>".  A symbol
    of "?" is written as an empty string.  The output is a tab-delimited
    file whose header is "Gene ID", "Gene Symbol" followed by the sample
    names.
    """
    import arrayio

    HYB_REF = "Hybridization REF"
    GENE_ID = "gene_id"
    DATA = arrayio.read(filename)
    assert DATA._row_order == [HYB_REF]
    assert DATA._col_order == ["_SAMPLE_NAME", GENE_ID]

    # Split every "<symbol>|<id>" annotation into its two parts.
    gene_ids = []
    gene_symbols = []
    for gene in DATA.row_names(HYB_REF):
        parts = gene.split("|")
        assert len(parts) == 2
        gene_symbol, gene_id = parts
        if gene_symbol == "?":
            gene_symbol = ""
        gene_ids.append(gene_id)
        gene_symbols.append(gene_symbol)

    out = open(output, 'w')
    header = ["Gene ID", "Gene Symbol"] + DATA.col_names("_SAMPLE_NAME")
    out.write("\t".join(header) + '\n')
    for i in range(DATA.nrow()):
        fields = [gene_ids[i], gene_symbols[i]] + DATA._X[i]
        assert len(fields) == len(header)
        out.write("\t".join(map(str, fields)) + '\n')
    out.close()
Пример #12
0
 def run(
     self, network, antecedents, out_attributes, user_options, num_cores,
     outfile):
     """Drop genes with too many missing values and write the rest.

     user_options['filter_value'] is a percentage.  A gene (row) is
     kept when the fraction of its values that are None or 'NA' is
     strictly smaller than that percentage.  The filtered matrix is
     written to outfile in tab-delimited format.
     """
     from genomicode import filelib
     import arrayio

     in_data = antecedents
     M = arrayio.read(in_data.identifier)
     # get the percentage of gene filter
     percent = float(user_options['filter_value']) / 100
     # The dimensions do not change while filtering; look them up once.
     nrow, ncol = M.dim()
     I_good = []
     for i in range(nrow):
         row = M._X[i]
         missing_count = 0
         for j in range(ncol):
             if row[j] in (None, 'NA'):
                 missing_count = missing_count + 1
         if float(missing_count) / ncol < percent:
             I_good.append(i)

     M_c = M.matrix(I_good, None)
     # Open the output only once there is something to write, so a
     # failure above does not leave an empty, truncated outfile behind.
     with open(outfile, 'w') as f_out:
         arrayio.tab_delimited_format.write(M_c, f_out)
     assert filelib.exists_nz(outfile), (
         'the output file %s for gene_filter fails' % outfile
     )
Пример #13
0
def plot_line_keywd(filename, keyword, outfile):
    """Plot one line per row whose second annotation equals keyword.

    The matrix in filename is read with arrayio.  Rows are selected by
    comparing the second row-annotation column with keyword; each
    selected row becomes one line, labelled
    "<keyword> (<first annotation>)".  The figure is saved to outfile.

    Raises AssertionError if no row matches or the figure is not
    written.
    """
    import arrayio
    from genomicode import mplgraph
    from genomicode import filelib

    M = arrayio.read(filename)
    header = M.row_names()
    label = M._col_names['_SAMPLE_NAME']

    # Fetch the annotation columns and the data once instead of once
    # per row.
    ids = M.row_names(header[0])
    row_keywords = M.row_names(header[1])
    X = M.slice()

    data = []
    legend_name = []
    for i in range(M.dim()[0]):
        if row_keywords[i] == keyword:
            data.append(X[i])
            legend_name.append("%s (%s)" % (keyword, ids[i]))
    assert len(data) > 0, 'cannot find the keyword %s in the file %s' % (
        keyword, filename)

    # Each line is a list of (sample index, value) points.
    lines = [[(j, row[j]) for j in range(len(row))] for row in data]
    params = {
        "box_label": label,
        "legend": legend_name,
        "ylim_min": 0,
        "ylabel": "Signal",
        "left": 0.1,
    }
    fig = mplgraph.lineplot(*lines, **params)
    fig.savefig(outfile)
    assert filelib.exists_nz(outfile), 'the plot_line_keywd fails'
Пример #14
0
def plot_hyb_bar(filename, outfile):
    """Bar-plot the mean signal of the high/med/low cy3_hyb control probes.

    Only rows whose second annotation is 'cy3_hyb' are considered.
    Their values are pooled into three groups according to the probe ID
    in the first annotation, and the mean and standard deviation of
    each group are plotted.  Raises AssertionError if any group mean is
    NaN.
    """
    from genomicode import mplgraph
    from genomicode import filelib
    import math
    import numpy

    high = ['ILMN_2038770', 'ILMN_2038769']
    med = ['ILMN_2038768', 'ILMN_2038771']
    low = ['ILMN_1343050', 'ILMN_1343052']
    high_data = []
    med_data = []
    low_data = []
    # Probe ID list paired with the pool its values go into.
    groups = ((high, high_data), (med, med_data), (low, low_data))

    import arrayio
    M = arrayio.read(filename)
    header = M.row_names()
    for i in range(M.dim()[0]):
        if M.row_names(header[1])[i] != 'cy3_hyb':
            continue
        probe_id = M.row_names(header[0])[i]
        for probe_ids, pool in groups:
            if probe_id in probe_ids:
                pool.extend(M.slice()[i])

    pools = [high_data, med_data, low_data]
    mean = [numpy.mean(pool) for pool in pools]
    flag = [math.isnan(value) for value in mean]
    assert True not in flag, 'input is not a control file'
    std = [numpy.std(pool) for pool in pools]
    fig = mplgraph.barplot(mean,
                           std,
                           ylabel='Signal',
                           box_label=['high', 'med', 'low'])
    fig.savefig(outfile)
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Keep the genes whose two class means differ by at least log2(fc).

        antecedents is (data_node, cls_node).  The class file must
        describe exactly two classes.  fc is taken from
        user_options['group_fc_num'] when present and defaults to 1.
        The selected rows are written to outfile in tab-delimited
        format; AssertionError is raised when no gene passes.
        """
        import math
        from Betsy import read_label_file
        from genomicode import jmath
        import arrayio
        data_node, cls_node = antecedents
        # obtain the class label
        label, label_line, second_line = read_label_file.read(
            cls_node.identifier)
        assert len(label) == 2, 'the number of class is not 2'
        if 'group_fc_num' in user_options:
            fc = int(user_options['group_fc_num'])
        else:
            fc = 1

        M = arrayio.read(data_node.identifier)
        first = M.slice(None, label[0][0])
        second = M.slice(None, label[1][0])
        I_good = [
            i for i in range(M.nrow())
            if abs(jmath.mean(first[i]) - jmath.mean(second[i]))
            >= math.log(fc, 2)
        ]

        assert I_good, 'there is no gene is significant in fold change with 2'
        out = open(outfile, 'w')
        M_c = M.matrix(I_good, None)
        arrayio.tab_delimited_format.write(M_c, out)
        out.close()
Пример #16
0
def t_test_for_file(data_file, label_file, gene_num=True):
    """Return gene names ordered by increasing t-test p-value.

    label_file must describe exactly two classes.  data_file is read
    once with arrayio to run the t-test between the two classes, and
    once as plain text to pick up the gene names (first tab-separated
    field of every line after the first two lines, i.e. PCL layout).

    gene_num is the number of top-ranked genes to return.  The default,
    True, means "return every gene".
    """
    # obtain the class label
    label, label_line, second_line = read_label_file.read(label_file)
    class_num = len(label)
    assert class_num == 2, 'the number of class is not 2'
    # read the data_file and calculate the t-test
    M = arrayio.read(data_file)
    first = M.slice(None, label[0][0])
    second = M.slice(None, label[1][0])
    t, p = t_test(first, second)
    # sort the p value and obtain the list of the gene index after sorting
    c = sorted(p)
    sortlist = find_sorted_index(p, c)
    # obtain the gene name in the data_file
    with open(data_file) as f:
        lines = f.read().split('\n')
    index = 0  # for pcl file
    startrows = 2  # for pcl file
    genelist = [line.split('\t')[index] for line in lines[startrows:]]
    # get a list of selected gene name
    if gene_num is not True:
        sortlist = sortlist[0:gene_num]
    return [genelist[i] for i in sortlist]
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Shift-scale normalize the first class against the second class.

        antecedents is (data_node, cls_node); the class file must
        describe exactly two classes.  The columns of the first class
        are normalized with shiftscalenorm.normalize and written back
        into the full matrix, which is then saved to outfile.  Nothing
        is done when either node is false.  Always returns False.
        """
        from genomicode import shiftscalenorm
        import arrayio
        from Betsy import read_label_file
        from genomicode import filelib
        data_node, cls_node = antecedents
        if not (data_node and cls_node):
            return False

        result, label_line, second_line = read_label_file.read(
            cls_node.identifier)
        assert len(
            result) == 2, 'for shiftscale,there should be only 2 classes'
        M = arrayio.read(data_node.identifier)
        index1 = result[0][0]
        index2 = result[1][0]
        M_1 = M.matrix(None, index1)
        M_2 = M.matrix(None, index2)
        M_y = shiftscalenorm.normalize(M_1, M_2)

        # Replace NaN results with the value from the first column of the
        # second class in the same row.
        for r in range(M_y.dim()[0]):
            normalized_row = M_y._X[r]
            for c in range(M_y.dim()[1]):
                if str(normalized_row[c]) == 'nan':
                    normalized_row[c] = M_2._X[r][0]

        # Copy the normalized values back into the first-class columns.
        for r in range(M.nrow()):
            for k, column in enumerate(index1):
                M._X[r][column] = M_y._X[r][k]

        out = open(outfile, 'w')
        arrayio.tab_delimited_format.write(M, out)
        out.close()
        assert filelib.exists_nz(outfile), (
            'the output file %s for shiftscale fails' % outfile)

        return False
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Run the external BFRM normalization script and save its result.

        The script configured as config.bfrmnorm is run with python on
        the input file, writing into the directory 'tmp_dir'.  The
        number of factors defaults to 1 and can be set with
        user_options['num_factors'] (between 1 and the number of
        columns).  Any output on stderr is raised as ValueError.  The
        resulting normalized.gct is converted to PCL format and written
        to outfile.
        """
        import os
        import subprocess
        import arrayio
        from Betsy import module_utils
        from genomicode import filelib
        from genomicode import config
        in_data = antecedents
        bfrm_path = config.bfrmnorm
        bfrm_BIN = module_utils.which(bfrm_path)
        assert bfrm_BIN, 'cannot find the %s' % bfrm_path

        num_factor = 1
        if 'num_factors' in user_options:
            num_factor = int(user_options['num_factors'])
            assert num_factor >= 1, 'the num_factor should be >=1'
            matrix = arrayio.read(in_data.identifier)
            col_num = matrix.ncol()
            assert num_factor <= col_num, (
                'the num_factor should be less than %d' % col_num)

        tmp = 'tmp_dir'
        command = ['python', bfrm_BIN, in_data.identifier]
        command += ['-f', str(num_factor)]
        command += ['-o', tmp]
        process = subprocess.Popen(command,
                                   shell=False,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
        _, error_message = process.communicate()
        if error_message:
            raise ValueError(error_message)

        normalized_gct = os.path.join(tmp, 'normalized.gct')
        assert filelib.exists_nz(tmp), (
            'the output dir %s for bfrm_normalize fails' % tmp)
        assert filelib.exists_nz(normalized_gct), (
            'the output gct file for bfrm_normalize fails')
        M = arrayio.read(normalized_gct)
        M_new = arrayio.convert(M, to_format=arrayio.pcl_format)
        out = open(outfile, 'w')
        arrayio.tab_delimited_format.write(M_new, out)
        out.close()
Пример #19
0
def run_bfrm_project(file_layout, bfrm_path, matlab_bin):
    """Project the latent factors of a BFRM model onto the data set.

    Reads the model from file_layout.BFRM_MODEL and the data set from
    file_layout.DATASET, writes the input files BFRM_project needs,
    checks that the model and the data set share at least one probe ID
    (compared case-insensitively) and runs a MATLAB script that saves
    the 'af' and 'Y' results to file_layout.BFRM_AF and
    file_layout.BFRM_Y.

    Raises AssertionError if the model has no latent factors, if
    BFRM_project cannot be found, or if no probe is shared.
    """
    import arrayio
    from genomicode import bfrm
    from genomicode import matlab

    param_file = "parameters.txt"
    model = bfrm.read_clean_model(file_layout.BFRM_MODEL,
                                  param_file=param_file)
    num_factors = len(model["FACTOR_O"])
    assert num_factors, "No latent factors in the BFRM model."
    x = "Projecting %d latent factors onto data set." % num_factors
    if num_factors == 1:
        x = x.replace("factors", "factor")
    print(x)

    DATA = arrayio.read(file_layout.DATASET)

    bfrm_path = bfrm.find_bfrm_project(bfrm_path)
    assert bfrm_path is not None, "I could not find BFRM_project."
    bfrm_path = os.path.realpath(bfrm_path)

    # Write out the dataset and probe IDs.
    write_bfrm_dataset(file_layout.BFRM_DATASET, DATA)
    write_sample_probe_ids(file_layout.BFRM_SPROBE_IDS, DATA)

    # Write the BFRM model files.
    write_bfrm_files(file_layout.BFRM, file_layout.BFRM_MODEL)

    # Make sure some of the probes are the same.
    with open(file_layout.BFRM_PROBE_IDS) as handle:
        pid = [line.strip() for line in handle]
    pid = [pid[i].lower() for i in model["VariablesIn"]]
    with open(file_layout.BFRM_SPROBE_IDS) as handle:
        # A set makes each membership test below constant time.
        spid = set(line.strip().lower() for line in handle)
    intersect = [x for x in pid if x in spid]
    assert intersect, "No common probes between model and data set."
    if len(intersect) < len(pid):
        x = "Warning: model contains %d probe IDs, but only matched " + \
            "%d in data set."
        print(x % (len(pid), len(intersect)))

    # Run the matlab script.
    lines = []
    w = lines.append
    w("addpath '%s';\n" % bfrm_path)
    w("addpath '%s/bfrm';\n" % bfrm_path)
    w("y = load('%s');\n" % file_layout.BFRM_DATASET)
    w("probeidsSmp = readWordlist('%s');\n" % file_layout.BFRM_SPROBE_IDS)
    w("[af Y sampleids] = getFacScores('%s/', y, probeidsSmp);" %
      file_layout.BFRM)
    w("save('%s', 'af', '-ASCII', '-TABS');\n" % file_layout.BFRM_AF)
    w("save('%s', 'Y', '-ASCII', '-TABS');\n" % file_layout.BFRM_Y)
    script = "".join(lines)
    x = matlab.run(script,
                   matlab_bin=matlab_bin,
                   working_path=file_layout.OUTPATH)
    print(x)
    sys.stdout.flush()
Пример #20
0
def summarize_heatmap(python, arrayplot, cluster, file_layout, libpath=None):
    """Plot the predictions matrix as a heatmap.

    Reads file_layout.PREDICTIONS_PCL, sizes the plot with
    graphlib.find_wide_heatmap_size and draws it to
    file_layout.PREDICTIONS_PNG with graphlib.plot_heatmap.  The
    PREDICTIONS_CDT / PREDICTIONS_ATR files, if present afterwards, are
    moved into file_layout.ATTIC.

    libpath is passed through to graphlib.plot_heatmap; when omitted a
    fresh empty list is used.

    Raises AssertionError if the PNG was not generated.
    """
    import arrayio
    from genomicode import graphlib

    # Avoid a shared mutable default: build a new list on every call.
    if libpath is None:
        libpath = []

    M_predict = arrayio.read(file_layout.PREDICTIONS_PCL)
    nrow, ncol = M_predict.dim()

    # Set the size of the plot.
    xpix, ypix = graphlib.find_wide_heatmap_size(
        nrow,
        ncol,
        min_box_width=12,
        min_box_height=12,
        height_width_ratio=nrow * 1.618 / ncol)

    x = graphlib.plot_heatmap(file_layout.PREDICTIONS_PCL,
                              file_layout.PREDICTIONS_PNG,
                              xpix,
                              ypix,
                              color="bild",
                              show_colorbar=True,
                              show_grid=True,
                              scale=-0.5,
                              gain=1.5,
                              no_autoscale=True,
                              gene_label=True,
                              array_label=True,
                              cluster_arrays=True,
                              python=python,
                              arrayplot=arrayplot,
                              cluster=cluster,
                              libpath=libpath)
    print(x)

    # Clean up some of the cluster files.  They are moved to the attic
    # rather than deleted because they might be required if people want
    # to plot the data themselves with other plotting software.
    for src in (file_layout.PREDICTIONS_CDT, file_layout.PREDICTIONS_ATR):
        if os.path.exists(src):
            dst = os.path.join(file_layout.ATTIC, os.path.split(src)[1])
            os.rename(src, dst)

    # Make sure the signature was generated correctly.  An error could
    # mean that arrayplot.py or cluster is missing.
    assert os.path.exists(file_layout.PREDICTIONS_PNG), \
           "Failed to make predictions heatmap."
Пример #21
0
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            outfile):
        """Convert the input matrix file to tab-delimited format.

        The file named by in_data.identifier is read with arrayio,
        converted to arrayio.tdf and written to outfile.
        """
        import arrayio

        # NOTE: this method used to carry disabled (commented-out) code
        # that gunzipped the input and converted xls/xlsx workbooks to
        # text with an external xls2txt script.  None of it was executed;
        # it has been removed and remains available in version control.
        in_filename = in_data.identifier

        to_format = arrayio.tdf
        MATRIX = arrayio.read(in_filename)
        MATRIX_c = arrayio.convert(MATRIX, to_format=to_format)
        # Close the output explicitly so it is flushed before returning.
        with open(outfile, 'w') as handle:
            to_format.write(MATRIX_c, handle)
Пример #22
0
def plot_line_keywds(filename, keywords, outfile):
    """Draw one line plot per keyword and stack them into a single image.

    For every keyword, the rows whose second annotation equals that
    keyword are plotted (one line per row, labelled with the first
    annotation) and saved as '<keyword>.png' in the current directory.
    The individual images are then pasted top to bottom onto a white
    background, which is saved to outfile.

    Raises AssertionError if a keyword matches no row or if outfile is
    not written.
    """
    import arrayio
    from genomicode import mplgraph
    from genomicode import filelib

    M = arrayio.read(filename)
    header = M.row_names()
    label = M._col_names['_SAMPLE_NAME']

    # Fetch the annotation columns and the data once instead of once
    # per row and keyword.
    ids = M.row_names(header[0])
    row_keywords = M.row_names(header[1])
    X = M.slice()
    nrow = M.dim()[0]

    outfiles = []
    for keyword in keywords:
        out = keyword + '.png'
        matches = [i for i in range(nrow) if row_keywords[i] == keyword]
        # Report the keyword that is actually missing, not the whole list.
        assert matches, 'cannot find the keyword %s in the file %s' % (
            keyword, filename)
        # Each line is a list of (sample index, value) points.
        lines = [
            [(j, X[i][j]) for j in range(len(X[i]))] for i in matches
        ]
        params = {
            "box_label": label,
            "legend": [ids[i] for i in matches],
            "ylim_min": 0,
            "ylabel": keyword,
            "left": 0.1,
        }
        fig = mplgraph.lineplot(*lines, **params)
        fig.savefig(out)
        outfiles.append(out)

    import Image
    imgs = [Image.open(name, 'r') for name in outfiles]
    widths = [img.size[0] for img in imgs]
    heights = [img.size[1] for img in imgs]
    total_w = max(widths) + 30
    total_h = sum(heights) + 10
    background = Image.new('RGBA', (total_w, total_h), (255, 255, 255, 255))
    bg_w, bg_h = background.size
    # Floor division keeps the pixel offset an integer.
    offset_w = (bg_w - max(widths)) // 2
    for i, img in enumerate(imgs):
        # Image i starts where the combined height of images i..end
        # fits exactly above the bottom edge.
        offset_h = bg_h - sum(heights[i:])
        background.paste(img, (offset_w, offset_h))
    background.save(outfile)
    assert filelib.exists_nz(outfile), 'the plot_line_keywds fails'
Пример #23
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Plot the mean signal of the AFFX- control probes per sample.

        The platform of the input matrix is identified first.  For the
        platforms listed below a placeholder figure titled
        'no AFFX plot can be generated' is saved instead.  Otherwise the
        rows whose ID starts with 'AFFX-' are averaged with
        jmath.mean_matrix and drawn as a single line.  Nothing is
        written when no platform name is identified.
        """
        from genomicode import mplgraph
        import arrayio
        from genomicode import jmath
        from genomicode import arrayplatformlib
        from genomicode import filelib
        in_data = antecedents
        M = arrayio.read(in_data.identifier)
        platforms = arrayplatformlib.identify_all_platforms_of_matrix(M)
        id_, platform = platforms[0][0], platforms[0][1]
        if not platform:
            return

        platforms_without_affx = (
            'HumanHT_12', 'MouseRef_8', 'HumanHT_12_control',
            'MouseRef_8_control', 'entrez_ID_human', 'entrez_ID_mouse',
            'entrez_symbol_human', 'entrez_symbol_mouse',
        )
        if platform in platforms_without_affx:
            import matplotlib.pyplot as plt
            plt.clf()
            plt.plot([0, 0, 0, 0])
            plt.title('no AFFX plot can be generated')
            plt.savefig(outfile)
        else:
            M = arrayio.read(in_data.identifier)
            label = M._col_names['_SAMPLE_NAME']
            affx_rows = [
                i for i, name in enumerate(M._row_names[id_])
                if name.startswith('AFFX-')
            ]
            affx_values = M.matrix(affx_rows).slice()
            means = jmath.mean_matrix(affx_values, byrow=None)
            line = [(i, means[i]) for i in range(len(means))]
            figure = mplgraph.lineplot(line,
                                       ylim_min=0,
                                       ylabel='Gene Expression Value',
                                       box_label=label)
            figure.savefig(outfile)
        assert filelib.exists_nz(outfile), (
            'the output file %s for plot_affy_affx_line fails' % outfile)
Пример #24
0
def convert_to_same_platform(filename1, filename2, platform=None):
    """Return a pair of filenames whose matrices share one platform.

    Both files are read and their platforms identified.  If the platforms
    already agree, the two input filenames are returned unchanged.
    Otherwise the file that is NOT on ``platform`` is run through the
    ``config.annotate_matrix`` script, which writes its result to a file
    named 'tmp' in the current working directory; that name replaces the
    converted file in the returned pair.

    Args:
        filename1: path of the first matrix file.
        filename2: path of the second matrix file.
        platform: target platform name.  Required when the two files are on
            different platforms, and must equal the platform of one of them.

    Returns:
        A ``(newfilename1, newfilename2)`` tuple.

    Raises:
        ValueError: if the platforms differ and ``platform`` is missing or
            matches neither file, or if the annotation script writes
            anything to stderr.
        AssertionError: if the annotation script cannot be located or the
            'tmp' output file is missing or empty.
    """
    import arrayio
    import subprocess
    from genomicode import config
    from genomicode import arrayplatformlib
    from genomicode import filelib

    M1 = arrayio.read(filename1)
    platform1 = arrayplatformlib.identify_platform_of_matrix(M1)
    M2 = arrayio.read(filename2)
    platform2 = arrayplatformlib.identify_platform_of_matrix(M2)
    if platform1 == platform2:
        return filename1, filename2

    # Previously these cases fell through with unbound locals (or returned
    # 'tmp' without ever creating it); fail with an explicit message.
    if not platform:
        raise ValueError(
            'files are on different platforms (%s and %s) and no target '
            'platform was given' % (platform1, platform2))
    if platform1 == platform:
        filename = filename2
        newfilename1 = filename1
        newfilename2 = 'tmp'
    elif platform2 == platform:
        filename = filename1
        newfilename1 = 'tmp'
        newfilename2 = filename2
    else:
        raise ValueError(
            'target platform %s matches neither file (%s and %s)' %
            (platform, platform1, platform2))

    Annot_path = config.annotate_matrix
    Annot_BIN = filelib.which(Annot_path)
    assert Annot_BIN, 'cannot find the %s' % Annot_path

    command = [
        'python', Annot_BIN, '-f', filename, '-o', 'tmp', "--platform",
        platform
    ]
    process = subprocess.Popen(command,
                               shell=False,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    # Any stderr output at all is treated as a failure of the conversion.
    error_message = process.communicate()[1]
    if error_message:
        raise ValueError(error_message)
    assert filelib.exists_nz('tmp'), 'the platform conversion fails'
    return newfilename1, newfilename2
Пример #25
0
    def set_out_attributes(self, antecedents, out_attributes):
        """Return a copy of ``out_attributes`` with its 'logged' entry set.

        The matrix at ``antecedents.identifier`` is read and passed to
        ``binreg.is_logged_array_data``; 'logged' becomes 'yes' when that
        check is truthy and 'no' otherwise.  The input dict is not modified.
        """
        import arrayio
        from genomicode import binreg

        updated = out_attributes.copy()
        matrix = arrayio.read(antecedents.identifier)
        looks_logged = binreg.is_logged_array_data(matrix)
        updated['logged'] = 'yes' if looks_logged else 'no'
        return updated
Пример #26
0
def summarize_filtered_genes(file_layout):
    """Write GCT files restricted to the genes selected by variance.

    Reads ``file_layout.DS_PROC`` and ``file_layout.DS_FINAL``, checks that
    their rows are aligned, and selects row indices from the DS_PROC data
    with ``pcalib.select_genes_var`` using the module-level
    ``NUM_FILTERED_GENES``.  The same rows of both matrices are written, in
    GCT format, to ``file_layout.DS_PROC_FILTERED`` and
    ``file_layout.DS_FINAL_FILTERED``.

    Raises:
        AssertionError: if the rows of the two matrices are not aligned.
    """
    # Select the <NUM_FILTERED_GENES> genes that vary most by variance.
    import arrayio
    from genomicode import matrixlib
    from genomicode import pcalib

    DATA_orig = arrayio.read(file_layout.DS_PROC)
    DATA_final = arrayio.read(file_layout.DS_FINAL)
    if not matrixlib.are_rows_aligned(DATA_orig, DATA_final):
        # Raised explicitly (not via `assert`) so the check also runs
        # under `python -O`.
        raise AssertionError(
            matrixlib.describe_unaligned_rows(DATA_orig, DATA_final))

    # Select the genes with the greatest variance.
    I = pcalib.select_genes_var(DATA_orig._X, NUM_FILTERED_GENES)
    DATA_orig = DATA_orig.matrix(I, None)
    DATA_final = DATA_final.matrix(I, None)

    # Context managers make sure both outputs are flushed and closed.
    with open(file_layout.DS_PROC_FILTERED, 'w') as handle:
        arrayio.gct_format.write(DATA_orig, handle)
    with open(file_layout.DS_FINAL_FILTERED, 'w') as handle:
        arrayio.gct_format.write(DATA_final, handle)
Пример #27
0
 def set_out_attributes(self, antecedents, out_attributes):
     """Return a copy of ``out_attributes`` with 'gene_normalize' set.

     The matrix at ``antecedents.identifier`` is read and tested with
     ``is_gene_normalize_variance`` first, then ``is_gene_normalize_ss``;
     the value becomes 'variance', 'sum_of_squares', or 'no' if neither
     check passes.  The input dict is not modified.
     """
     import arrayio

     attrs = out_attributes.copy()
     matrix = arrayio.read(antecedents.identifier)
     if is_gene_normalize_variance(matrix):
         normalization = 'variance'
     elif is_gene_normalize_ss(matrix):
         normalization = 'sum_of_squares'
     else:
         normalization = 'no'
     attrs['gene_normalize'] = normalization
     return attrs
 def run(
     self, network, antecedents, out_attributes, user_options, num_cores,
     outfile):
     """Keep only the rows whose probe pair is a 'Best for Both' match.

     The mapping file ``config.HumanHT_12_to_HG_u133_Plus_2`` is read and
     every record with ``Distance <= 1000`` and ``Match == 'Best for Both'``
     contributes an (Affymetrix probe set ID, Illumina probe ID) pair.  The
     matrix at ``antecedents.identifier`` must carry both a HumanHT_12 and
     an HG_U133_Plus_2 annotation column; rows whose pair of IDs appears
     in the mapping are written to ``outfile`` in tab-delimited format.

     Returns:
         None.  Nothing is written when either annotation column is
         missing or when no row matches.

     Raises:
         AssertionError: if the mapping file does not exist, or the output
             file is missing or empty after writing.
     """
     from genomicode import filelib
     import os
     import arrayio
     from genomicode import config
     from genomicode import arrayplatformlib

     mapfile = config.HumanHT_12_to_HG_u133_Plus_2
     assert os.path.exists(mapfile), 'mapping file %s does not exist' % mapfile
     # A set gives constant-time membership tests in the per-row loop below.
     best_pairs = set()
     for d in filelib.read_row(mapfile, header=True):
         if int(d.Distance) <= 1000 and d.Match == 'Best for Both':
             best_pairs.add((d.Affymetrix_Probe_Set_ID, d.Illumina_Probe_ID))

     M = arrayio.read(antecedents.identifier)
     platform_list = arrayplatformlib.score_all_platforms_of_matrix(M)
     illu_id = None
     probe_id = None
     # NOTE(review): each entry is presumably a sequence whose first item is
     # the row-name header and which contains the platform name -- confirm
     # against score_all_platforms_of_matrix.
     for platform in platform_list:
         if 'HumanHT_12' in platform:
             illu_id = M._row_names[platform[0]]
         if 'HG_U133_Plus_2' in platform:
             probe_id = M._row_names[platform[0]]

     if not illu_id or not probe_id:
         return None

     index = [
         i for i in range(M.nrow())
         if (probe_id[i], illu_id[i]) in best_pairs
     ]
     if not index:
         return None

     M_new = M.matrix(index, None)
     # `open` in a `with` block replaces the Python-2-only `file()` builtin
     # and closes the handle even if the write raises.
     with open(outfile, 'w') as f:
         arrayio.tab_delimited_format.write(M_new, f)
     assert filelib.exists_nz(outfile), (
         'the output file %s for best_match_both fails' % outfile
     )
Пример #29
0
 def set_out_attributes(self, antecedents, out_attributes):
     """Return a copy of ``out_attributes`` with 'gene_center' set.

     The matrix at ``antecedents.identifier`` is read and tested with
     ``is_gene_center_mean`` first, then ``is_gene_center_median``; the
     value becomes 'mean', 'median', or 'no' if neither check passes.
     The input dict is not modified.
     """
     import arrayio

     attrs = out_attributes.copy()
     matrix = arrayio.read(antecedents.identifier)
     if is_gene_center_mean(matrix):
         centering = 'mean'
     elif is_gene_center_median(matrix):
         centering = 'median'
     else:
         centering = 'no'
     attrs['gene_center'] = centering
     return attrs
Пример #30
0
def is_missing(identifier):
    """Return True if any cell of the matrix read from identifier is None.

    Cells are visited in row-major order over the ranges given by
    ``M.dim()``, stopping at the first ``None`` found.
    """
    import arrayio

    matrix = arrayio.read(identifier)
    return any(
        matrix._X[row][col] is None
        for row in range(matrix.dim()[0])
        for col in range(matrix.dim()[1])
    )