def center_scores(scores, batches, phenotypes, group1, group2):
    """Return a new list of scores centered separately within each batch.

    For every distinct batch, the value subtracted from each of its
    scores is the midpoint between the mean score of the batch's
    group1 samples and the mean score of its group2 samples.

    scores      list of numeric scores, one per sample.
    batches     list of batch labels, parallel to scores.
    phenotypes  list of phenotype labels, parallel to scores.
    group1      container of phenotypes (tested with "in") for group 1.
    group2      container of phenotypes (tested with "in") for group 2.

    Raises AssertionError if the three lists differ in length, if a
    phenotype belongs to neither group, or if some batch has no
    samples from one of the groups.
    """
    from genomicode import jmath

    num_samples = len(phenotypes)
    assert len(scores) == num_samples
    assert len(batches) == num_samples

    centered = [None] * len(scores)
    for batch in sorted(set(batches)):
        # Indexes of the samples that belong to this batch.
        members = [i for (i, b) in enumerate(batches) if b == batch]

        in_group1 = []
        in_group2 = []
        for i in members:
            pheno = phenotypes[i]
            if pheno in group1:
                in_group1.append(scores[i])
                continue
            if pheno in group2:
                in_group2.append(scores[i])
                continue
            raise AssertionError("%s not in groups" % pheno)
        assert in_group1, "No samples from group1 in batch %s" % batch
        assert in_group2, "No samples from group2 in batch %s" % batch

        # Center on the midpoint of the two group means, so that an
        # unbalanced batch is not pulled toward its larger group.
        midpoint = (jmath.mean(in_group1) + jmath.mean(in_group2)) / 2.0
        for i in members:
            centered[i] = scores[i] - midpoint

    # Every sample belongs to exactly one batch, so nothing is left unset.
    assert None not in centered
    return centered
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Write the rows of a matrix that pass a two-class fold change filter.

    A row is kept when the absolute difference between its mean over
    the first class and its mean over the second class is at least
    log2(fc).  fc is user_options['group_fc_num'] (converted with
    int) or 1 when that option is absent.
    NOTE(review): comparing a difference of means against log2(fc)
    presumably assumes the matrix holds log2 values -- confirm.

    antecedents   pair (data_node, cls_node); their .identifier
                  attributes are passed to arrayio.read and
                  read_label_file.read respectively.
    user_options  mapping that may contain 'group_fc_num'.
    outfile       path of the tab-delimited file to write.
    network, out_attributes, num_cores are not used here.

    Raises AssertionError if the label file does not describe exactly
    two classes or if no row passes the filter, and ValueError if the
    fold change is not positive.
    """
    import math
    from Betsy import read_label_file
    from genomicode import jmath
    import arrayio

    data_node, cls_node = antecedents

    # Obtain the class labels.
    label, label_line, second_line = read_label_file.read(
        cls_node.identifier)
    assert len(label) == 2, 'the number of class is not 2'

    fc = 1
    if 'group_fc_num' in user_options:
        fc = int(user_options['group_fc_num'])
    if fc <= 0:
        # math.log would fail with an opaque "math domain error".
        raise ValueError(
            'group_fc_num must be a positive integer (got %r)' % fc)
    # The cutoff does not depend on the row; compute it once.
    min_diff = math.log(fc, 2)

    M = arrayio.read(data_node.identifier)
    first = M.slice(None, label[0][0])
    second = M.slice(None, label[1][0])

    I_good = [
        i for i in range(M.nrow())
        if abs(jmath.mean(first[i]) - jmath.mean(second[i])) >= min_diff]
    assert I_good, 'no gene passes the fold change cutoff of %s' % fc

    # Build the filtered matrix before touching the output file, and let
    # the with-block close the handle even if writing fails.
    M_c = M.matrix(I_good, None)
    with open(outfile, 'w') as f:
        arrayio.tab_delimited_format.write(M_c, f)
def create_control_vars(MATRIX, num_control_vars):
    """Build control factors from the Affymetrix control probes of MATRIX.

    The rows whose IDs bfrm.is_affx accepts are extracted, each row is
    centered on its own mean, and an SVD is taken of the result.  The
    first num_control_vars rows of V are returned as a list of lists,
    each with one value per column of the extracted matrix.

    Raises AssertionError if no control probe is found or if
    num_control_vars exceeds the number of rows or columns of the
    control-probe matrix.

    NOTE(review): the centered rows are written back into AFFX._X in
    place.  Whether that storage is shared with MATRIX depends on
    MATRIX.matrix(), which is not visible here.
    """
    import numpy
    from genomicode import jmath
    from genomicode import bfrm

    # Locate the annotation column that looks like Affymetrix probe set
    # IDs, then pick out the control probes.
    affx_header = bfrm.get_affy_row_name(MATRIX)
    probe_ids = MATRIX.row_names(affx_header)
    I = [i for (i, probe_id) in enumerate(probe_ids)
         if bfrm.is_affx(probe_id)]
    assert I
    AFFX = MATRIX.matrix(I, None)

    num_rows = AFFX.nrow()
    num_cols = AFFX.ncol()
    max_control_vars = min(num_rows, num_cols)
    assert num_control_vars <= max_control_vars, \
        "Too many control variables (%d). Maximum is %d." % (
            num_control_vars, max_control_vars)

    # Subtract each gene's mean from its own row before the SVD.
    X = AFFX._X
    for i, row in enumerate(X):
        center = jmath.mean(row)
        X[i] = [value - center for value in row]

    # Reduced SVD; each row of Vt is one control factor.
    _u, _s, Vt = numpy.linalg.svd(X, full_matrices=False)
    factors = Vt.tolist()
    CONTROL = factors[:num_control_vars]
    assert len(CONTROL) == num_control_vars
    assert len(CONTROL[0]) == num_cols
    return CONTROL
def calc_association(phenotypes, scores, ignore_insufficient_groups):
    """Test whether scores differ between phenotype groups using R's aov.

    Samples whose phenotype is false-valued or whose score equals ""
    are dropped before the analysis.

    Returns a dictionary with keys:
      n              Number of samples.
      m              Number of groups.
      scores         n-list of <float>
      delta          None or <float>  (mean of group 1 minus mean of
                     group 0; only set when there are exactly 2 groups)
      phenotypes     n-list of <string>
      groups         n-list of <int>  [0, length(group_names)-1]
      group_names    m-list of <string>  (unique list of pheno)
      num_samples    dict of <group (int)> : <int>
      mean_score     dict of <group (int)> : <float>
      p_value        <float>
      relationship   <string>

    Returns None if there are fewer than 2 groups and
    ignore_insufficient_groups is a true value.  Raises
    AssertionError if no sample is usable, or if there are fewer than
    2 groups and ignore_insufficient_groups is false.
    """
    from genomicode import jmath
    from genomicode import sortlib

    # Select only the samples with phenotype and score information.
    I = [i for (i, (pheno, score)) in enumerate(zip(phenotypes, scores))
         if pheno and score != ""]
    assert I, "No valid samples."
    phenotypes = [phenotypes[i] for i in I]
    scores = [float(scores[i]) for i in I]

    # Figure out the groupings.
    group_names = sortlib.sort_natural({}.fromkeys(phenotypes))
    if len(group_names) < 2 and ignore_insufficient_groups:
        return None
    assert len(group_names) >= 2, "Need at least 2 groups (%s)." % \
        str(group_names)

    # Look the group index up in a dict instead of calling
    # group_names.index() once per sample.  setdefault keeps the first
    # position of a name, which is what list.index() would return.
    name2group = {}
    for j, name in enumerate(group_names):
        name2group.setdefault(name, j)
    groups = [name2group[pheno] for pheno in phenotypes]

    # Calculate the association.
    group2scores = {}  # group -> list of scores
    for group, score in zip(groups, scores):
        group2scores.setdefault(group, []).append(score)

    # x is an indicator matrix with one column per group; y is the
    # response.
    y = scores
    x = [[0] * len(group_names) for _ in groups]
    for row, group in zip(x, groups):
        row[group] = 1

    jmath.start_R()
    jmath.R_equals(x, "x")
    jmath.R_equals(y, "y")
    jmath.R("m <- aov(y~x)")
    p_value = jmath.R('summary(m)[[1]][["Pr(>F)"]][1]')[0]

    # Count other things.
    num_samples = {}
    mean_score = {}
    for group, values in group2scores.items():
        num_samples[group] = len(values)
        mean_score[group] = jmath.mean(values)

    # If there are exactly 2 groups, then find the difference between
    # the two groups.
    delta = None
    if len(group_names) == 2:
        delta = mean_score[1] - mean_score[0]

    # Figure out the relationship: name the group with the highest
    # mean score.
    relationship = ""
    assert len(group_names) >= 2
    x1 = "Higher"
    if len(group_names) > 2:
        x1 = "Highest"
    high_score = None
    for group, score in mean_score.items():
        if high_score is not None and score <= high_score:
            continue
        high_score = score
        relationship = "%s in %s" % (x1, group_names[group])

    SCORE = {
        "n": len(scores),
        "m": len(group_names),
        "scores": scores,
        "phenotypes": phenotypes,
        "groups": groups,
        "group_names": group_names,
        "num_samples": num_samples,
        "mean_score": mean_score,
        "delta": delta,
        "p_value": p_value,
        "relationship": relationship,
    }
    return SCORE
def calc_association(survival, dead, scores, rank_cutoffs, zscore_cutoffs,
                     best_cutoff, expression_or_score,
                     ignore_unscored_genesets):
    """Relate discretized scores to survival with a Kaplan-Meier model.

    Returns the dictionary produced by calc_km, extended so that it
    has at least these keys:
      survival             list of <float>
      dead                 list of <int>
      scores               list of <float>
      groups               list of <int>  [0, length(group_names)-1]
      group_names          list of <string>
      p_value              <float>
      num_samples          dict of <group> : <int>
      mean_score           dict of <group> : <float> or None
      surv50               dict of <group> : <float> or None
      surv90               dict of <group> : <float> or None
      hi_score_short_surv  <boolean> or None (no difference in surv)
      relationship         <string>

    Returns None if the results can't be calculated: when no sample
    is usable and ignore_unscored_genesets is true, or when the
    samples fall into fewer than two groups.  Raises AssertionError
    if no sample is usable and ignore_unscored_genesets is false.
    """
    from genomicode import jmath

    def has_value(x):
        # A value is missing only if it is None or the empty string.  A
        # plain truth test would also discard a numeric 0, i.e. a
        # censored sample (dead == 0) or a score of exactly 0.
        return x is not None and x != ""

    # Select only the samples with survival, dead, and score
    # information.
    I = [i for (i, (t, d, s)) in enumerate(zip(survival, dead, scores))
         if has_value(t) and has_value(d) and has_value(s)]
    if ignore_unscored_genesets and not I:
        return None
    assert I, "No valid samples."
    survival = [float(survival[i]) for i in I]
    dead = [int(float(dead[i])) for i in I]   # might be 0.0, 1.0
    scores = [scores[i] for i in I]

    # GraphPad Prism filters out the 0's.  Do the same thing here.
    I = [i for (i, t) in enumerate(survival) if t > 0]
    survival = [survival[i] for i in I]
    dead = [dead[i] for i in I]
    scores = [scores[i] for i in I]

    # Figure out the groupings.
    if best_cutoff:
        # Figure out the best way to discretize the scores.
        group_names, groups = find_best_groups(scores, survival, dead)
    else:
        group_names, groups = discretize_scores(
            scores, rank_cutoffs, zscore_cutoffs, expression_or_score)

    # May not have two groups, e.g. if there are no outliers.  If this
    # happens, then return None.
    if len(set(groups)) < 2:
        return None

    # Calculate the KM model.
    surv = calc_km(survival, dead, groups)

    # Clean up the surv dictionary.  If some groups are missing, some
    # of the members will be missing values.  Fix this.
    for i in range(len(group_names)):
        surv["num_samples"].setdefault(i, 0)
        surv["surv50"].setdefault(i, None)
        surv["surv90"].setdefault(i, None)

    # Add extra data to the survival dictionary.
    surv["survival"] = survival
    surv["dead"] = dead
    surv["scores"] = scores
    surv["groups"] = groups
    surv["group_names"] = group_names

    # Calculate the mean scores for each group.  If a group is empty,
    # then the mean score is None.
    mean_score = {}
    for group in range(len(group_names)):
        members = [s for (s, g) in zip(scores, groups) if g == group]
        mean_score[group] = jmath.mean(members) if members else None
    surv["mean_score"] = mean_score

    # Figure out relationship.
    MAX_SURV = 1E10
    # Compare the time to 50% survival for the low and high scoring
    # groups.
    # ASSUMPTION: lowest group has low scores, while highest group has
    # high scores.
    low_group = min(groups)
    high_group = max(groups)
    surv_low = surv["surv50"][low_group]
    surv_high = surv["surv50"][high_group]
    # If neither groups drop to 50% survival, compare the time to 90%
    # survival.
    if surv_low is None and surv_high is None:
        surv_low = surv["surv90"][low_group]
        surv_high = surv["surv90"][high_group]
    # A group that never reaches the threshold is treated as having
    # the longest possible time.
    if surv_high is None:
        surv_high = MAX_SURV
    if surv_low is None:
        surv_low = MAX_SURV
    assert surv_low <= MAX_SURV and surv_high <= MAX_SURV

    hi_score_short_surv = None
    if surv_high < surv_low:
        hi_score_short_surv = True
    elif surv_high > surv_low:
        hi_score_short_surv = False
    surv["hi_score_short_surv"] = hi_score_short_surv

    relationship = ""
    if hi_score_short_surv:
        relationship = "High %s has shorter time to outcome." % \
            expression_or_score.lower()
    elif hi_score_short_surv is not None:
        relationship = "Low %s has shorter time to outcome." % \
            expression_or_score.lower()
    surv["relationship"] = relationship
    return surv