示例#1
0
def center_scores(scores, batches, phenotypes, group1, group2):
    # Center the scores separately within each batch.
    #
    # For each batch, average the scores of the samples whose
    # phenotype is in group1, average the scores of the samples whose
    # phenotype is in group2, and subtract the midpoint of those two
    # averages from every score in the batch.
    #
    # scores      list of numbers, one per sample
    # batches     list of batch labels, parallel to scores
    # phenotypes  list of phenotype labels, parallel to scores
    # group1      container of the phenotypes in the first group
    # group2      container of the phenotypes in the second group
    #
    # Returns a new list with the centered scores, in the same order
    # as scores.
    #
    # Raises an AssertionError if the lists differ in length, if a
    # phenotype is in neither group, or if a batch has no samples from
    # one of the groups.

    # Raise explicitly rather than using the assert statement so the
    # checks survive "python -O".  The exception type is unchanged.
    if len(scores) != len(phenotypes):
        raise AssertionError(
            "scores (%d) and phenotypes (%d) differ in length" % (
                len(scores), len(phenotypes)))
    if len(batches) != len(phenotypes):
        raise AssertionError(
            "batches (%d) and phenotypes (%d) differ in length" % (
                len(batches), len(phenotypes)))

    # Collect the sample indexes of every batch in a single pass.
    batch2indexes = {}
    for i, batch in enumerate(batches):
        batch2indexes.setdefault(batch, []).append(i)

    # Split each batch into the two groups.  Batches are visited in
    # sorted order so the first problem reported is deterministic.
    batch_data = []  # list of (indexes, group1 scores, group2 scores)
    for batch in sorted(batch2indexes):
        indexes = batch2indexes[batch]
        scores1, scores2 = [], []
        for i in indexes:
            pheno = phenotypes[i]
            # A phenotype listed in both groups counts as group1.
            if pheno in group1:
                scores1.append(scores[i])
            elif pheno in group2:
                scores2.append(scores[i])
            else:
                raise AssertionError("%s not in groups" % pheno)
        if not scores1:
            raise AssertionError(
                "No samples from group1 in batch %s" % batch)
        if not scores2:
            raise AssertionError(
                "No samples from group2 in batch %s" % batch)
        batch_data.append((indexes, scores1, scores2))

    # Import only once the input is known to be valid.
    from genomicode import jmath

    scores_c = [None] * len(scores)
    for indexes, scores1, scores2 in batch_data:
        midpoint = (jmath.mean(scores1) + jmath.mean(scores2)) / 2.0
        for i in indexes:
            scores_c[i] = scores[i] - midpoint
    return scores_c
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        # Keep the rows of a two-class data matrix whose class means
        # differ by at least a cutoff, and write them to outfile in
        # tab-delimited format.
        #
        # antecedents   pair of (data node, class label node); the
        #               identifier of each is read as a file.
        # user_options  dict; the optional 'group_fc_num' entry is the
        #               fold change cutoff (default 1).
        # outfile       path of the file to write.
        #
        # network, out_attributes, and num_cores are not used here.
        #
        # The cutoff applied to the difference of the class means is
        # log2(fold change).
        # NOTE(review): this presumably expects log2-scale data --
        # confirm against the upstream module.
        #
        # Raises an AssertionError if there are not exactly two
        # classes or if no row passes the cutoff.  math.log raises a
        # ValueError if the fold change is not positive.
        import math
        from Betsy import read_label_file
        from genomicode import jmath
        import arrayio

        data_node, cls_node = antecedents
        # Obtain the class labels.
        label, label_line, second_line = read_label_file.read(
            cls_node.identifier)
        assert len(label) == 2, 'the number of class is not 2'

        fc = 1
        if 'group_fc_num' in user_options:
            fc = int(user_options['group_fc_num'])
        # The cutoff does not depend on the row, so compute it once.
        min_difference = math.log(fc, 2)

        M = arrayio.read(data_node.identifier)
        first = M.slice(None, label[0][0])
        second = M.slice(None, label[1][0])
        I_good = [
            i for i in range(M.nrow())
            if abs(jmath.mean(first[i]) - jmath.mean(second[i])) >=
            min_difference]

        assert I_good, 'there is no gene is significant in fold change with 2'
        # Build the sub-matrix before creating the output file so that
        # a failure does not leave an empty file behind.
        M_c = M.matrix(I_good, None)
        # The with statement closes the file even if the write fails.
        with open(outfile, 'w') as handle:
            arrayio.tab_delimited_format.write(M_c, handle)
示例#3
0
def create_control_vars(MATRIX, num_control_vars):
    # Derive control factors from the Affymetrix control probes.
    #
    # The rows of MATRIX whose ID satisfies bfrm.is_affx are selected,
    # each selected row is mean-centered, and the SVD of the centered
    # rows is calculated.  The first num_control_vars rows of V are
    # the control factors.
    #
    # MATRIX            matrix object with row annotations
    # num_control_vars  number of control factors to return.  May be
    #                   at most min(number of control probes, number
    #                   of columns).
    #
    # Returns a list of num_control_vars lists, each with one value
    # per column of MATRIX.  Returns an empty list if num_control_vars
    # is 0.
    #
    # Raises an AssertionError if there are no control probes or if
    # num_control_vars is out of range.
    import numpy
    from genomicode import jmath
    from genomicode import bfrm

    # Look for the annotations that resemble affymetrix probe set IDs.
    affx_name = bfrm.get_affy_row_name(MATRIX)

    # Select the affymetrix control variables.
    ids = MATRIX.row_names(affx_name)
    I = [i for (i, id_) in enumerate(ids) if bfrm.is_affx(id_)]
    assert I, "No Affymetrix control probes found."
    AFFX = MATRIX.matrix(I, None)
    max_control_vars = min(AFFX.nrow(), AFFX.ncol())
    assert num_control_vars >= 0, \
           "Negative number of control variables (%d)." % num_control_vars
    assert num_control_vars <= max_control_vars, \
           "Too many control variables (%d).  Maximum is %d." % (
               num_control_vars, max_control_vars)

    # Subtract the means from each gene.  Build a separate list
    # instead of overwriting AFFX._X, so the storage of the matrix
    # object is never modified.
    centered = []
    for row in AFFX._X:
        m = jmath.mean(row)
        centered.append([value - m for value in row])

    # Calculate the SVD of the control probes.
    U, s, V = numpy.linalg.svd(centered, full_matrices=False)
    # Each row of V is a control factor.
    CONTROL = V.tolist()[:num_control_vars]
    assert len(CONTROL) == num_control_vars
    # There is no first row to check when no factors were requested.
    if CONTROL:
        assert len(CONTROL[0]) == AFFX.ncol()

    return CONTROL
示例#4
0
def calc_association(phenotypes, scores, ignore_insufficient_groups):
    # Test whether the scores differ between the phenotype groups by
    # fitting aov(y~x) in R, where x holds one indicator column per
    # group.
    #
    # Samples without a phenotype (any false value) or without a score
    # ("" or None) are dropped first.
    #
    # Return a dictionary with keys:
    # n                    Number of samples.
    # m                    Number of groups.
    # scores               n-list of <float>
    # delta                None or <float>
    # phenotypes           n-list of <string>
    # groups               n-list of <int>  [0, length(group_names)-1]
    # group_names          m-list of <string>  (unique list of pheno)
    # num_samples          dict of <group (int)> : <int>
    # mean_score           dict of <group (int)> : <float>
    # p_value              <float>
    # relationship         <string>
    #
    # May return None if there is only 1 group, and
    # ignore_insufficient_groups is a true value.
    #
    # Raises an AssertionError if no sample is usable, or if there are
    # fewer than 2 groups and ignore_insufficient_groups is false.
    from genomicode import jmath
    from genomicode import sortlib

    # Select only the samples with phenotype and score information.
    has_pheno = set(i for (i, x) in enumerate(phenotypes) if x)
    has_score = set(
        i for (i, x) in enumerate(scores) if x is not None and x != "")
    I = sorted(has_pheno & has_score)
    assert I, "No valid samples."

    phenotypes = [phenotypes[i] for i in I]
    scores = [float(scores[i]) for i in I]

    # Figure out the groupings.
    group_names = sortlib.sort_natural({}.fromkeys(phenotypes))
    if len(group_names) < 2 and ignore_insufficient_groups:
        return None
    assert len(group_names) >= 2, "Need at least 2 groups (%s)." % \
           str(group_names)
    # Map each name to its index once, rather than searching the list
    # of names for every sample.  setdefault keeps the first index,
    # which is what list.index would return.
    name2group = {}
    for i, name in enumerate(group_names):
        name2group.setdefault(name, i)
    groups = [name2group[x] for x in phenotypes]

    group2scores = {}  # group -> list of scores
    for group, score in zip(groups, scores):
        group2scores.setdefault(group, []).append(score)

    # Calculate the association.  Each row of x has a 1 in the column
    # of the group that the sample belongs to.
    y = scores
    x = [[0] * len(group_names) for _ in y]
    for i, group in enumerate(groups):
        x[i][group] = 1
    jmath.start_R()
    jmath.R_equals(x, "x")
    jmath.R_equals(y, "y")
    jmath.R("m <- aov(y~x)")
    p_value = jmath.R('summary(m)[[1]][["Pr(>F)"]][1]')[0]

    # Count other things.
    num_samples = dict(
        (n, len(values)) for (n, values) in group2scores.items())
    mean_score = dict(
        (n, jmath.mean(values)) for (n, values) in group2scores.items())

    # If there are exactly 2 groups, then find the difference between
    # the two groups.
    delta = None
    if len(group_names) == 2:
        delta = mean_score[1] - mean_score[0]

    # Figure out the relationship.  Visit the groups in index order so
    # that a tie always resolves to the lowest group, independent of
    # the iteration order of the dictionary.
    comparative = "Higher"
    if len(group_names) > 2:
        comparative = "Highest"
    relationship = ""
    high_score = None
    for n in sorted(mean_score):
        score = mean_score[n]
        if high_score is not None and score <= high_score:
            continue
        high_score = score
        relationship = "%s in %s" % (comparative, group_names[n])

    SCORE = {}
    SCORE["n"] = len(scores)
    SCORE["m"] = len(group_names)
    SCORE["scores"] = scores
    SCORE["phenotypes"] = phenotypes
    SCORE["groups"] = groups
    SCORE["group_names"] = group_names
    SCORE["num_samples"] = num_samples
    SCORE["mean_score"] = mean_score
    SCORE["delta"] = delta
    SCORE["p_value"] = p_value
    SCORE["relationship"] = relationship
    return SCORE
示例#5
0
def calc_association(survival, dead, scores, rank_cutoffs, zscore_cutoffs,
                     best_cutoff, expression_or_score,
                     ignore_unscored_genesets):
    # Split the samples into groups by score, fit a model with
    # calc_km, and describe how the lowest and highest groups compare.
    #
    # The dictionary returned by calc_km is extended and returned.
    # The keys set or completed here are:
    # survival             list of <float>
    # dead                 list of <int>
    # scores               list of scores, as given
    # groups               list of <int>  [0, length(group_names)-1]
    # group_names          list of <string>
    # num_samples          dict of <group> : <int>
    # mean_score           dict of <group> : <float> or None
    # surv50               dict of <group> : <float> or None
    # surv90               dict of <group> : <float> or None
    # hi_score_short_surv  <boolean> or None (no difference in surv)
    # relationship         <string>
    #
    # Returns None if no sample is usable and ignore_unscored_genesets
    # is true, or if fewer than two distinct groups result.
    from genomicode import jmath

    # Keep the samples that have a true value for all three of
    # survival, dead, and score.
    # NOTE(review): this is a truthiness test, so a numeric 0 (e.g. a
    # score of 0.0) is dropped together with "" and None -- confirm
    # this is intended for the values the callers pass.
    usable = set(i for (i, value) in enumerate(survival) if value)
    usable = usable.intersection(
        set(i for (i, value) in enumerate(dead) if value),
        set(i for (i, value) in enumerate(scores) if value))
    kept = sorted(usable)
    if not kept and ignore_unscored_genesets:
        return None
    assert kept, "No valid samples."

    survival = [float(survival[i]) for i in kept]
    dead = [int(float(dead[i])) for i in kept]  # might be 0.0, 1.0
    scores = [scores[i] for i in kept]

    # GraphPad Prism filters out the 0's.  Do the same thing here.
    positive = [i for (i, time) in enumerate(survival) if time > 0]
    survival = [survival[i] for i in positive]
    dead = [dead[i] for i in positive]
    scores = [scores[i] for i in positive]

    # Assign every sample to a group, either by searching for the best
    # discretization or by applying the given cutoffs.
    if best_cutoff:
        group_names, groups = find_best_groups(scores, survival, dead)
    else:
        group_names, groups = discretize_scores(
            scores, rank_cutoffs, zscore_cutoffs, expression_or_score)

    # May not have two groups, e.g. if there are no outliers.  There
    # is nothing to compare in that case.
    if len(set(groups)) < 2:
        return None

    # Calculate the KM model.
    surv = calc_km(survival, dead, groups)

    # Groups without samples may be missing from the per-group
    # dictionaries.  Give them default values.
    defaults = (("num_samples", 0), ("surv50", None), ("surv90", None))
    for group in range(len(group_names)):
        for key, default in defaults:
            if group not in surv[key]:
                surv[key][group] = default

    # Add the data that was analyzed to the survival dictionary.
    surv["survival"] = survival
    surv["dead"] = dead
    surv["scores"] = scores
    surv["groups"] = groups
    surv["group_names"] = group_names

    # Calculate the mean score of each group.  The mean of an empty
    # group is None.
    group2scores = {}
    for score, group in zip(scores, groups):
        group2scores.setdefault(group, []).append(score)
    mean_score = {}
    for group in range(len(group_names)):
        members = group2scores.get(group)
        mean_score[group] = jmath.mean(members) if members else None
    surv["mean_score"] = mean_score

    # Compare the time to 50% survival of the low and high scoring
    # groups.
    # ASSUMPTION: lowest group has low scores, while highest group has
    # high scores.
    MAX_SURV = 1E10
    low_group, high_group = min(groups), max(groups)
    surv_low = surv["surv50"][low_group]
    surv_high = surv["surv50"][high_group]
    # If neither group drops to 50% survival, fall back to the time to
    # 90% survival.
    if surv_low is None and surv_high is None:
        surv_low = surv["surv90"][low_group]
        surv_high = surv["surv90"][high_group]
    # A time that is still unknown is treated as very long.
    if surv_high is None:
        surv_high = MAX_SURV
    if surv_low is None:
        surv_low = MAX_SURV
    assert surv_low <= MAX_SURV and surv_high <= MAX_SURV

    if surv_high < surv_low:
        hi_score_short_surv = True
    elif surv_high > surv_low:
        hi_score_short_surv = False
    else:
        hi_score_short_surv = None
    surv["hi_score_short_surv"] = hi_score_short_surv

    # Describe the relationship.  It stays empty when the two groups
    # do not differ.
    if hi_score_short_surv is None:
        relationship = ""
    elif hi_score_short_surv:
        relationship = "High %s has shorter time to outcome." % \
                       expression_or_score.lower()
    else:
        relationship = "Low %s has shorter time to outcome." % \
                       expression_or_score.lower()
    surv["relationship"] = relationship

    return surv