def train_MLP(min_size=5, max_size=100):
    model = MLPClassifier(max_iter=100)
    # candidate architectures: 1-4 hidden layers with randomly drawn widths
    param_grid = {
        'hidden_layer_sizes': [
            (sp_randint.rvs(min_size, max_size), sp_randint.rvs(min_size, max_size),
             sp_randint.rvs(min_size, max_size), sp_randint.rvs(min_size, max_size)),
            (sp_randint.rvs(min_size, max_size), sp_randint.rvs(min_size, max_size),
             sp_randint.rvs(min_size, max_size)),
            (sp_randint.rvs(min_size, max_size), sp_randint.rvs(min_size, max_size)),
            (sp_randint.rvs(min_size, max_size),),
        ],
        'activation': ['tanh', 'relu'],
        'solver': ['sgd', 'adam', 'lbfgs'],
        'alpha': sp_expon(scale=.01),
        'learning_rate': ['constant', 'adaptive'],
        'learning_rate_init': sp_expon(scale=.001),
    }
    return model, param_grid
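# A minimal usage sketch (illustrative only): the estimator and distributions returned by
# train_MLP above plug straight into scikit-learn's RandomizedSearchCV. The imports and the
# X_train / y_train arrays are assumptions for illustration, not part of the original snippet.
from scipy.stats import expon as sp_expon, randint as sp_randint  # names used inside train_MLP
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPClassifier

model, param_grid = train_MLP(min_size=5, max_size=100)
search = RandomizedSearchCV(model, param_distributions=param_grid,
                            n_iter=20, cv=3, n_jobs=-1)
# search.fit(X_train, y_train)     # X_train / y_train: placeholder training data
# print(search.best_params_)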
def die_roll(loot_count):
    # roll the dice
    roll = randint.rvs(1, 4) + randint.rvs(1, 4)
    # reduce the roll by the amount of loot carried, but not below zero
    roll = roll - loot_count
    if roll < 0:
        roll = 0
    # return the modified roll
    return roll
def draw_generation(self):
    self.draws = []
    for i in range(self.times):
        draw_dict = {}
        draw_dict['beta_draw'] = uniform.rvs(0, 1, 500)
        draw_dict['bernoulli_draw'] = uniform.rvs(0, 1, [500, 1000])
        draw_dict['uniform_draw'] = uniform.rvs(0, 1, 500)
        draw_dict['host_draw'] = uniform.rvs(0, 1, 1000)
        draw_dict['size_state'] = randint.rvs(0, 2000)
        draw_dict['initial_state'] = randint.rvs(0, 2000)
        self.draws.append(draw_dict)
def r_funct(current_key, str_values):
    df_train = pd.DataFrame.from_records(str_values, columns=df_columns)
    '''
    #0. convert to proper dtypes
    for col, coltype in data_type_dict.iteritems():
        if coltype == 'int64':
            df_train[col] = df_train[col].astype(int)
        if coltype == 'float64':
            df_train[col] = df_train[col].astype(float)
    '''
    # 1. remove constant columns
    remove = []
    for col in df_train.columns:
        if df_train[col].std() == 0:
            remove.append(col)
    df_train = df_train.drop(remove, axis=1)
    # 2. remove duplicated columns
    remove = []
    c = df_train.columns
    for i in range(len(c) - 1):
        v = df_train[c[i]].values
        for j in range(i + 1, len(c)):
            if np.array_equal(v, df_train[c[j]].values):
                remove.append(c[j])
    df_train = df_train.drop(remove, axis=1)
    # REMOVE UNWANTED COLUMNS
    y_train = df_train['TARGET'].values
    X_train = df_train.drop(['ID', 'TARGET'], axis=1).values
    # randomly drawn hyperparameters for this gradient boosting run
    len_train = len(X_train)
    learning_rate = random.choice([1, .5, .3, .2, .1, .03, .05, .01, .005, .001, .0005, .0001, .00001])
    n_estimators = sp_randint.rvs(100, 5000)
    subsample = random.choice([1, .95, .85, .90, .8])
    min_samples_split = sp_randint.rvs(2, 11)
    min_samples_leaf = sp_randint.rvs(1, 11)
    max_depth = sp_randint.rvs(2, 20)
    min_weight_fraction_leaf = 0
    # k-fold cross validation of the train data using gradient boosting
    clf = GradientBoostingClassifier(learning_rate=learning_rate, n_estimators=n_estimators,
                                     subsample=subsample, min_samples_split=min_samples_split,
                                     min_samples_leaf=min_samples_leaf, max_depth=max_depth)
    k_fold = cross_validation.KFold(len_train, 5)
    auc_scores_list = []
    for k, (train, test) in enumerate(k_fold):
        clf.fit(X_train[train], y_train[train])
        auc_scr = auc_score(y_train[test], clf.predict_proba(X_train[test])[:, 1])
        auc_scores_list.append(auc_scr)
    mean = np.mean(auc_scores_list)
    std = np.std(auc_scores_list)
    print("GBT:learning_rate:%s,n_estimators:%s,subsample:%s,min_samples_split:%s,"
          "min_samples_leaf:%s,min_weight_fraction_leaf:%s,max_depth:%s,mean:%s,std:%s"
          % (learning_rate, n_estimators, subsample, min_samples_split, min_samples_leaf,
             min_weight_fraction_leaf, max_depth, mean, std))
def __benchmarkfun(self, talent, benchmarksize):
    R1 = randint.rvs(1, 20, size=benchmarksize)
    R2 = randint.rvs(1, 20, size=benchmarksize)
    R3 = randint.rvs(1, 20, size=benchmarksize)
    R = zip(R1, R2, R3)
    testcount = 1
    testresults = []
    for throw in R:
        testcount += 1
        testresults.append(self.test_silent(talent, throw[0], throw[1], throw[2]))
    return testresults
def supermarket_log(starting_time, finish_time, warehouse, file):  # one day of operation
    """
    Simulate one day of restocks and sales in a supermarket. Events follow an
    exponential distribution with an average time between events of 5 minutes.
    Each time an event occurs, the event type (restock or sale) is chosen with a
    binomial distribution: a sale has probability 0.65 and a restock 0.35.
    When a client buys a product, the product is selected uniformly at random and
    the quantity is drawn from a binomial with n = (max quantity of the chosen
    product) and p = 0.15. Restocks happen one at a time: the product is chosen
    uniformly at random and the quantity to restock is drawn from a binomial with
    n = (max quantity allowed on the shelves) and p = 0.65.

    Parameters
    ----------
    starting_time: supermarket opening time
    finish_time: supermarket closing time
    warehouse: Warehouse instance holding the product catalog; needed to know the
        products and their codes in our supermarket
    file: path of the file in which to save the daily log
    """
    log = []
    last_hour = starting_time
    while last_hour < finish_time:  # stop once the last transaction has passed finish_time
        if binom.rvs(1, 0.65):
            product_chosen = list(warehouse.products.keys())[randint.rvs(1, 19) - 1]
            last_hour += timedelta(minutes=float(expon.rvs(scale=5, size=1)))
            aux = ['venta', last_hour, product_chosen,
                   binom.rvs(n=warehouse[product_chosen][0], p=0.15, loc=1)]
            log.append(aux)
        else:
            last_hour += timedelta(minutes=float(expon.rvs(scale=5, size=1)))
            product_chosen = list(warehouse.products.keys())[randint.rvs(
                0, len(warehouse.products) - 1)]
            log.append(['repo', last_hour, product_chosen,
                        binom.rvs(n=warehouse[product_chosen][0], p=0.65, loc=1)])
    with open(file, 'w') as f:
        text = ""
        for el in log:
            text += el[0] + ' ' + format_date(el[1]) + " " + el[2] + " " + str(el[3]) + "\n"
        f.write(text)
def rvs(self):
    if not self.size:
        self.size = randint.rvs(low=self.min_size, high=self.max_size, size=1)
    if self.scale:
        return expon.rvs(loc=self.loc * 0.09, scale=self.scale, size=self.size)
    else:
        return expon.rvs(loc=self.loc * 0.09, scale=self.loc * 8.0, size=self.size)
def metadata_filename():
    columns = [
        "thermostat_id",
        "equipment_type",
        "zipcode",
        "utc_offset",
        "interval_data_filename",
    ]
    n_thermostats = 100
    thermostat_ids = [uuid4() for i in range(n_thermostats)]
    equipment_types = randint.rvs(0, 6, size=n_thermostats)
    zipcodes = [
        "70754", "70722", "70726", "70449", "70442",  # "722312" 50
        "70443", "70441", "70446", "70447", "70444",  # "722312"
        "70836", "70778", "70770", "70774", "70777",  # "722312"
        "70433", "70437", "70436", "70435", "70438",  # "722312"
        "70744", "70748", "70462", "70465", "70466",  # "722312"
        "70791", "70714", "70711", "70451", "70450",  # "722312"
        "70453", "70455", "70454", "70456", "70809",  # "722312"
        "70806", "70807", "70805", "70769", "70761",  # "722312"
        "70402", "70403", "70401", "70737", "70730",  # "722312"
        "70733", "70739", "70785", "70789", "70706",  # "722312"
        "45341", "45344", "45349", "45319", "45434",  # "745700" 55
        "60018", "60191", "60193", "60195", "60194",  # "725300" 60
        "97473", "97449", "97493", "97467", "97459",  # "726917" 65
        "60421", "60544", "60404", "60408", "60481",  # "725345" 70
        "36590", "36564", "36606", "36605", "36532",  # "722235" 75
        "36541", "36544", "36568", "36608", "36609",  # "722230" 80
        "23106", "23060", "23229", "23222", "23294",  # "724029" 85
        "13674", "13601", "13606", "13605", "13682",  # "726227" 90
        "12978", "12972", "12985", "12903", "12901",  # "726225" 95
        "61051",  # "725326" 96
        "76207",  # "722589" 97
        "36362",  # "722239" 98
        "57233",  # "726546" 99
        "56289",  # "726547" 100
    ]
    utc_offsets = [-7 for _ in range(n_thermostats)]
    interval_data_filenames = ["thermostat_{}.csv".format(i) for i in thermostat_ids]
    df = pd.DataFrame({
        "thermostat_id": thermostat_ids,
        "equipment_type": equipment_types,
        "zipcode": zipcodes,
        "utc_offset": utc_offsets,
        "interval_data_filename": interval_data_filenames,
    }, columns=columns)
    temp_dir = tempfile.mkdtemp()
    metadata_filename = os.path.join(temp_dir, "metadata.csv")
    df.to_csv(metadata_filename, index=False)
    for interval_data_filename in df.interval_data_filename:
        fname = os.path.join(temp_dir, interval_data_filename)
        with open(fname, 'w') as f:
            f.write("INTERVAL DATA FILE CONTENT")
    return metadata_filename
def draw(self, K=10, N=1 * 10**5, m=3, gaussian=False):
    if self.seed is not None:
        np.random.seed(self.seed)
    alphas = gamma.rvs(5, size=m)  # shape parameter
    # print(sum(alphas))  # equivalent sample size
    self.p = dirichlet.rvs(alpha=alphas, size=1)[0]
    self.phi_is = multinomial.rvs(1, self.p, size=N)  # draw from categorical p.m.f
    self.x_draws = np.zeros((N, K))
    self.hyper_loc, self.hyper_scale, self.thetas = dict(), dict(), dict()
    self.var, self.covs, self.rdraws = tuple(), tuple(), tuple()
    for i in range(m):
        self.hyper_loc["mean" + str(i + 1)] = norm.rvs(size=1, loc=0, scale=5)
        self.hyper_scale["scale" + str(i + 1)] = 1 / gamma.rvs(5, size=1)
        self.thetas["mean" + str(i + 1)] = norm.rvs(size=K,
                                                    loc=self.hyper_loc["mean" + str(i + 1)],
                                                    scale=self.hyper_scale["scale" + str(i + 1)])
        self.thetas["Sigma" + str(i + 1)] = np.eye(K) * (1 / gamma.rvs(5, size=K))
        self.thetas["nu" + str(i + 1)] = randint.rvs(K + 2, K + 10, size=1)[0]
        if gaussian:
            self.covs += (self.thetas['Sigma' + str(i + 1)],)
        else:
            self.covs += (wishart.rvs(df=self.thetas['nu' + str(i + 1)],
                                      scale=self.thetas['Sigma' + str(i + 1)], size=1),)
        self.var += (self.thetas["nu" + str(i + 1)] / (self.thetas["nu" + str(i + 1)] - 2)
                     * self.covs[i],)  # variance covariance matrix of first Student-t component
        self.rdraws += (np.random.multivariate_normal(self.thetas["mean" + str(i + 1)],
                                                      self.covs[i], N),)
        self.Phi = np.tile(self.phi_is[:, i], K).reshape(K, N).T  # repeat phi vector to match with random matrix
        self.x_draws += np.multiply(self.Phi, self.rdraws[i])
    return self.x_draws
def death_drop(inventory):
    # create a vector of zeroes
    inventory_mask = np.zeros(len(inventory))
    # select a token at random
    inventory_mask[randint.rvs(0, len(inventory))] = 1
    return inventory_mask
def main(N, fl):
    X = randint.rvs(2, 65536, size=N)
    print(X)
    # fl = "SeqS.in"
    fd = open(fl, 'w')
    for x in X:
        fd.write(f'{x}\n')
    fd.close()
def draw(self, K=10, N=1 * 10**5, m=3, gaussian=False):
    """
    Inputs:
    -------
    N: sample size
    K: dimension of the Normal/Student distribution
    m: number of mixture components
    """
    np.random.seed(self.seed)
    self.st0 = np.random.get_state()  # get initial state of RNG
    # np.random.set_state(self.st0)
    print("Drawing from", m, "component mixture distribution.")
    alphas = gamma.rvs(5, size=m)  # shape parameter
    # print(sum(alphas))  # equivalent sample size
    self.p = dirichlet.rvs(alpha=alphas, size=1)[0]
    self.phi_is = multinomial.rvs(1, self.p, size=N)  # draw from categorical p.m.f
    self.x_draws = np.zeros((N, K))
    self.hyper_loc, self.hyper_scale, self.thetas = dict(), dict(), dict()
    self.var, self.covs, self.rdraws = tuple(), tuple(), tuple()
    for i in range(m):
        self.hyper_loc["mean" + str(i + 1)] = norm.rvs(size=1, loc=0, scale=5)
        self.hyper_scale["scale" + str(i + 1)] = 1 / gamma.rvs(5, size=1)
        self.thetas["mean" + str(i + 1)] = norm.rvs(size=K,
                                                    loc=self.hyper_loc["mean" + str(i + 1)],
                                                    scale=self.hyper_scale["scale" + str(i + 1)])
        self.thetas["Sigma" + str(i + 1)] = np.eye(K) * (1 / gamma.rvs(5, size=K))
        self.thetas["nu" + str(i + 1)] = randint.rvs(K + 2, K + 10, size=1)[0]
        if gaussian:
            self.covs += (self.thetas['Sigma' + str(i + 1)],)
        else:
            self.covs += (wishart.rvs(df=self.thetas['nu' + str(i + 1)],
                                      scale=self.thetas['Sigma' + str(i + 1)], size=1),)
        self.var += (self.thetas["nu" + str(i + 1)] / (self.thetas["nu" + str(i + 1)] - 2)
                     * self.covs[i],)  # variance covariance matrix of first Student-t component
        self.rdraws += (np.random.multivariate_normal(self.thetas["mean" + str(i + 1)],
                                                      self.covs[i], N),)
        self.Phi = np.tile(self.phi_is[:, i], K).reshape(K, N).T  # repeat phi vector to match with random matrix
        self.x_draws += np.multiply(self.Phi, self.rdraws[i])
    return self.x_draws, np.argmax(self.phi_is, 1)  # X, latent
def compare_models(player_agent_1, player_agent_2, games=1000):
    # initiate counter to track win totals
    wins = np.zeros(2)
    for i in range(games):
        # initiate a new game state
        game = GameState(6)
        # start turn counter; declare a flag for game end
        turn = 0
        continue_game = True
        # randomly assign player_agent_1 and player_agent_2 to even/odd turns
        player_agent_1_turns = randint.rvs(0, 2)
        # take turns until the game is over
        while continue_game:
            # use player_agent_1 on half the turns
            if turn % 2 == player_agent_1_turns:
                # take a turn
                continue_game, turn_taken = take_turn(game, player_agent_1)[0:2]
            # use player_agent_2 on the other half
            else:
                # take a turn
                continue_game, turn_taken = take_turn(game, player_agent_2)[0:2]
            # skip to next turn if active player is already back at the sub
            if turn_taken == False:
                # update the active player
                if game.active_player < game.players - 1:
                    game.active_player += 1
                else:
                    game.active_player = 0
                # next turn
                continue
            # update the active player
            if game.active_player < game.players - 1:
                game.active_player += 1
            else:
                game.active_player = 0
            # increment turn counter
            turn += 1
        # document the winner
        if np.argmax(game.player_scores) % 2 == player_agent_1_turns:
            wins[0] += 1
        else:
            wins[1] += 1
    # return the win totals
    return wins
def test_gaussiankde_arguments(self):
    size = 1000
    low = 0
    high = 9
    data = randint.rvs(low, high, size=size) + norm.rvs(0, 0.1, size=size)
    dist = GaussianMultivariate(distribution=GaussianKDE(bw_method=0.01))
    dist.fit(data)
    samples = dist.sample(size).to_numpy()[0]
    d, p = ks_2samp(data, samples)
    assert p >= 0.05
def get_seed_value(self, new_seed=False):
    # Return the seed specified in the text widget.
    # If that value is zero, return the seed stored in self.seed_value,
    # producing a new seed first when new_seed is True.
    value = self.seed_text_widget.value
    if value == 0:
        if new_seed:
            self.seed_value = randint.rvs(100000, 999999)
        value = self.seed_value
    return value
def treatment_effect(self, X=None, y=None, t=None):
    self.X_ = X
    self.y_ = y
    self.t_ = t
    fold_seeds = randint.rvs(0, 1000, size=self.n_splits, random_state=0).tolist()
    treatment_effect = self._dml_estimation(fold_seeds)
    self._is_estimated = True
    return treatment_effect
def pick_one_numbers_uniformly(low, high):
    """
    Return an integer between low and high.

    :param int low: smallest possible value
    :param int high: largest possible value
    :return int item:
    """
    item = low
    if low != high:
        item = list(randint.rvs(low, high, size=1))[0]
    return item
def rvs(self, random_state=42):
    if len(self.seen) < self.high - self.low - 1:
        while True:
            sample = randint.rvs(self.low, self.high, size=1, random_state=random_state)[0]
            if sample not in self.seen:
                self.seen.append(sample)
                return self.size * (sample,)
    return self.size * (0,)
def wild_bootstrap(self, beta_null, var):
    # beta_null is the null hypothesis for var
    X1 = self.df[var].to_numpy()
    # get a list of all variables that are not the rel time 1 dummy
    Xvars_no1 = [v for v in self.xvars if v != var[0]]
    # perform a regression without the rel time 1 dummy
    Y1 = self.Y - beta_null * X1
    Xno1 = self.df[Xvars_no1].to_numpy()
    beta1 = np.linalg.solve(Xno1.T.dot(Xno1), Xno1.T.dot(Y1))
    # use the beta to construct the Us
    U = Y1 - np.dot(Xno1, beta1)
    # bootstrap the Us
    rand_sign = 2 * randint.rvs(0, 2, size=self.N).reshape(self.N, 1) - 1
    newU = np.multiply(U, rand_sign)
    # construct the wild Y
    Ywild = np.dot(Xno1, beta1) + X1 * beta_null + newU
    # get the new beta from the wild Y
    beta_wild = np.dot(self.XpXi, np.dot(self.X.T, Ywild))
    error = Ywild - np.dot(self.X, beta_wild)
    # and now the clustered-robust std error, with a procedure similar to the one above
    clustervars = ['index']
    newdf = copy.copy(self.df)
    predictedX = np.sum(np.multiply(self.theta.T, self.X), 1).to_frame()
    predictedX = predictedX.rename(columns={0: "Yhat"})
    # add the residual to the dataframe
    df_withresid = newdf
    newdf['e'] = error
    # .assign(e = lambda x: x[self.yvar] - predictedX.Yhat)
    df_withresid[clustervars] = self.df[clustervars]
    # group by the cluster
    groups = df_withresid.groupby(clustervars)
    G = len(groups)
    robust_sum = 0
    # cycle through each cluster and create the cluster-specific "meat"
    for key, item in groups:
        Xgroup = item[self.xvars].to_numpy()
        egroup = item['e'].to_numpy()
        egroup = egroup.reshape(len(egroup), 1)
        cluster_sum = np.matmul(np.matmul(np.matmul(Xgroup.T, egroup), egroup.T), Xgroup)
        robust_sum += cluster_sum
    # correct for degrees of freedom
    deg_freedom = (G / (G - 1)) * ((self.N - 1) / (self.N - self.k))
    # sandwich together with the bread defined in the class initialization
    V = deg_freedom * np.matmul(np.matmul(self.XpXi.T, robust_sum), self.XpXi)
    return np.sqrt(np.diag(V))
def singlevisualize(self, result):
    pylab.rcParams['figure.figsize'] = (30.0, 20.0)
    for k in result.keys():
        avo = plt.subplot(131)
        fr = plt.subplot(132)
        con = plt.subplot(133)
        avo.plot(self.keywords[k], [x['AverageOpinion'] for x in result[k]], label='AverageOpinion', marker='o')
        avo.plot(self.keywords[k], [x['parameter']['bm'] for x in result[k]], label='host-m', marker='1')
        avo.plot(self.keywords[k], [x['parameter']['bl'] for x in result[k]], label='host-l', marker='2')
        avo.plot(self.keywords[k], [x['AverageHost'] for x in result[k]], label='host-ave', marker='3')
        con.plot(self.keywords[k], [x['ConversionRatio'][0] for x in result[k]], label='PosToNeg', marker='2')
        con.plot(self.keywords[k], [x['ConversionRatio'][1] for x in result[k]], label='NegToPos', marker='1')
        for i in result[k]:
            label = self.conversion[k] + '=' + str(round(i['parameter'][k], 2))
            xaxis = [(i['FinalRatio'][1][x] + i['FinalRatio'][1][x + 1]) / 2 for x in range(0, 20)]
            yaxis = [y / (500 * self.times) for y in i['FinalRatio'][0]]
            fr.plot(xaxis, yaxis, label=label, marker=randint.rvs(0, 10))
            # tempdf = pd.DataFrame({legend: i['FinalRatio']})
            # sns.kdeplot(tempdf[legend], ax=fr, bw=0.05)
        avo.set_xlim([min(self.keywords[k]), max(self.keywords[k])])
        avo.set_ylim([-1, 1])
        fr.set_xlim([-1, 1])
        # fr.set_ylim([0, 1])
        con.set_xlim([min(self.keywords[k]), max(self.keywords[k])])
        con.set_ylim([0, 1])
        avo.set_xlabel(self.conversion[k])
        avo.set_ylabel('Average Opinion')
        fr.set_xlabel('Opinion Value')
        fr.set_ylabel('Final Ratio')
        con.set_xlabel(self.conversion[k])
        con.set_ylabel('Conversion Ratio')
        avo.legend()
        fr.legend()
        con.legend()
        plt.show()
        result[k] = pool.map(self.replication, parameterpool[k])
    return result  # result is a dictionary
def flow(self, member, queue_node):
    prob_staying_general = 1 - Network.probability_of_leaving
    prob_stay_level = (1 - self.proportion_leave[member.level]) * prob_staying_general  # added prob of leaving per level
    stay = bernoulli.rvs(prob_stay_level)
    if member.level == 8:  # there's also a probability they will leave
        # self.exit(member)
        return
    elif not stay:
        # self.exit(member)
        return
    else:
        next_level = member.level + 1
        # determine the next node to visit
        queue_choice = None
        min_cost = 0
        wait_cost = 0
        service_cost = 0
        if self.asn_policy == 'Deterministic Wait':
            edges = queue_node.outgoing_edges
            min_cost = edges[0].get_wait_cost(member) + 1  # at the least, queue 0 has lower cost
            for q in range(0, len(edges)):
                if min_cost > edges[q].get_wait_cost(member):
                    min_cost = edges[q].get_wait_cost(member)
                    queue_choice = edges[q].exit
                    wait_cost = min_cost
                    service_cost = edges[q].get_service_cost(member)
        if self.asn_policy == 'Deterministic Service':
            edges = queue_node.outgoing_edges
            min_cost = edges[0].get_service_cost(member) + 1  # at the least, queue 0 has lower cost
            for q in range(0, len(edges)):
                if min_cost > edges[q].get_service_cost(member):
                    min_cost = edges[q].get_service_cost(member)
                    queue_choice = edges[q].exit
                    service_cost = min_cost
                    wait_cost = edges[q].get_wait_cost(member)
        if self.asn_policy == 'Uniform':
            choice_total = len(self.network[next_level])
            queue_index = randint.rvs(1, choice_total, size=1)[0]
            queue_choice = self.network[next_level][queue_index]
        yield self.env.process(member.request(queue_choice, queue_node, wait_cost,
                                              service_cost, self.env))
        # if we are still in the network
        self.flow(member, queue_choice)
def push(self, member):
    arriving_level = member.level
    # for now, we choose any node in level N+1
    if member.level == 8:
        # self.exit(member)
        return
    p = 1 - self.proportion_leave[arriving_level]  # probability they are assessed and leave
    stay = bernoulli.rvs(p)
    if not stay:
        # self.exit(member)
        return
    arrival_node = self.network[arriving_level][0]  # arriving node
    queue_choice = None
    wait_cost = 0
    service_cost = 0
    min_cost = 0
    if self.asn_policy == 'Deterministic Wait':
        edges = arrival_node.edges
        min_cost = edges[0].get_wait_cost(member) + 1  # at the least, queue 0 has lower cost
        for q in range(0, len(edges)):
            if min_cost > edges[q].get_wait_cost(member):
                min_cost = edges[q].get_wait_cost(member)
                wait_cost = min_cost
                service_cost = edges[q].get_service_cost(member)
                queue_choice = edges[q].exit
    if self.asn_policy == 'Deterministic Service':
        edges = arrival_node.edges
        min_cost = edges[0].get_service_cost(member) + 1  # at the least, queue 0 has lower cost
        for q in range(0, len(edges)):
            if min_cost > edges[q].get_service_cost(member):
                min_cost = edges[q].get_service_cost(member)
                queue_choice = edges[q].exit
                wait_cost = edges[q].get_wait_cost(member)
                service_cost = min_cost
    if self.asn_policy == 'Uniform':
        choice_total = len(self.network[arriving_level + 1])
        queue_index = randint.rvs(1, choice_total, size=1)[0]
        queue_choice = self.network[arriving_level + 1][queue_index]
    yield self.env.process(member.request(queue_choice, arrival_node, wait_cost,
                                          service_cost, self.env))
    self.env.process(self.flow(member, queue_choice))
def tree_sim(self, cur_state, action):
    if cur_state is self.death:
        if action is self.cut:
            next_state = self.sappling_height
            reward = -self.replanting_cost
        else:
            next_state = self.death
            reward = 0
    else:
        if action is self.cut:
            next_state = self.sappling_height
            reward = self.linear_wood_value * cur_state - self.replanting_cost
        else:
            tree_is_dying = bernoulli.rvs(self.proba_of_dying)
            if tree_is_dying:
                next_state = self.death
                reward = -self.maintenance_cost
            else:
                next_state = randint.rvs(cur_state, self.max_height + 1)
                reward = -self.maintenance_cost
    return next_state, reward
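# A minimal usage sketch (illustrative only): tree_sim above is a one-step simulator, so a
# trajectory can be rolled out by feeding each next_state back in. The parameter values and
# the SimpleNamespace stand-in for the owning class are assumptions, not from the original.
from types import SimpleNamespace

from scipy.stats import bernoulli, randint  # names tree_sim relies on

params = SimpleNamespace(death=0, cut='cut', sappling_height=1, max_height=10,
                         replanting_cost=5.0, maintenance_cost=1.0,
                         linear_wood_value=2.0, proba_of_dying=0.1)
state, total_reward = params.sappling_height, 0.0
for t in range(20):
    # cut (and replant) once the tree is fully grown or dead, otherwise let it grow
    action = params.cut if (state == params.death or state >= params.max_height) else 'wait'
    state, reward = tree_sim(params, state, action)  # call the function above with params as self
    total_reward += reward
print(total_reward)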
def compare_models(model_set_1, model_set_2, games=1000, noise=0.1):
    # initiate counter to track win totals
    wins = np.zeros(2)
    for i in range(games):
        # initiate a new game state
        game = GameState(6)
        # start turn counter; declare a flag for game end
        turn = 0
        continue_game = True
        # randomly assign model_set_1 and model_set_2 to even/odd turns
        # (randint.rvs excludes the upper bound, so draw from {0, 1} with rvs(0, 2))
        model_set_1_turns = randint.rvs(0, 2)
        # take turns until the game is over
        while continue_game:
            # use model_set_1 on half the turns
            if turn % 2 == model_set_1_turns:
                # take a turn
                continue_game, turn_taken, turn_around, pick_up, drop = take_turn(
                    game, *model_set_1, noise)
            # use model_set_2 on the other half
            else:
                # take a turn
                continue_game, turn_taken, turn_around, pick_up, drop = take_turn(
                    game, *model_set_2, noise)
            # increment turn counter
            turn += 1
        # document the winner
        if np.argmax(game.player_scores) % 2 == model_set_1_turns:
            wins[0] += 1
        else:
            wins[1] += 1
    # return the win totals
    return wins
def drop_decision(self, gamestate):
    # decide if the model output or a random guess will be used
    if uniform.rvs(0, 1) <= self.epsilon:
        # no drop if inventory is empty
        if sum(gamestate[1:33] != -1) == 0:
            drop = 0
        else:
            # randomly decide whether to drop a token from those available
            drop = randint.rvs(0, sum(gamestate[1:33] != -1) + 1)
    else:
        # generate a Q-table for the current gamestate
        selected_action = self.pick_up_model.predict(
            np.reshape(gamestate, (1, gamestate.shape[0])))
        # take the action with the highest Q-value
        drop = np.argmax(selected_action[0:(sum(gamestate[1:33] != -1) + 1)])
    drop = int(drop)
    # return the decision as an integer
    # 1-33 mean drop the corresponding item
    # 0 means no drop
    return drop
])
preprocessor = ColumnTransformer([
    ('numeric_transformer', numeric_transformer, numeric_features),
    ('categorical_transformer', categorical_transformer, categorical_features)
])
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', classifier)
])
param_distributions = {
    "classifier__learning_rate": uniform.rvs(0.0001, 0.1, size=x_rscv_n_iter),
    "classifier__gamma": uniform.rvs(0, 2, size=x_rscv_n_iter),
    "classifier__max_depth": randint.rvs(2, 100, size=x_rscv_n_iter),
    "classifier__colsample_bytree": uniform.rvs(0.1, 0.9, size=x_rscv_n_iter),
    "classifier__subsample": uniform.rvs(0.1, 0.9, size=x_rscv_n_iter),
    "classifier__reg_alpha": uniform.rvs(0, 0.9, size=x_rscv_n_iter),
    "classifier__reg_lambda": uniform.rvs(0.0001, 5, size=x_rscv_n_iter),
    "classifier__min_child_weight": randint.rvs(1, 7, size=x_rscv_n_iter),
    "classifier__n_estimators": randint.rvs(100, 1000, size=x_rscv_n_iter)
}
search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_distributions,
    n_iter=x_rscv_n_iter,
    scoring={'recall_score': recall_scorer, 'f1_score': f1_scorer},
    n_jobs=-1,
    cv=x_rscv,
    random_state=x_random_state,
    refit='f1_score',
    return_train_score=True)
search = search.fit(X, y)
print(datetime.now(), " [3/4] algorithm finished computing")
def get_fake_output_df(n_columns):
    # the rhu1/rhu2 columns follow a regular naming pattern, so build them
    # programmatically rather than listing each one by hand
    rhu_bins = [
        '00F_to_05F', '05F_to_10F', '10F_to_15F', '15F_to_20F', '20F_to_25F', '25F_to_30F',
        '30F_to_35F', '35F_to_40F', '40F_to_45F', '45F_to_50F', '50F_to_55F', '55F_to_60F',
        'less10F', '10F_to_20F', '20F_to_30F', '30F_to_40F', '40F_to_50F', '50F_to_60F',
    ]
    rhu_columns = []
    for rhu in ('rhu1', 'rhu2'):
        rhu_columns += ['{}_{}_duty_cycle'.format(rhu, kind)
                        for kind in ('aux', 'emg', 'compressor')]
        rhu_columns += ['{}_{}'.format(rhu, b) for b in rhu_bins]
        for kind in ('aux', 'emg', 'compressor'):
            rhu_columns += ['{}_{}_{}_duty_cycle'.format(rhu, b, kind) for b in rhu_bins]
    columns = [
        'sw_version', 'ct_identifier', 'equipment_type', 'heating_or_cooling', 'station',
        'zipcode', 'climate_zone', 'start_date', 'end_date',
        'n_days_in_inputfile_date_range', 'n_days_both_heating_and_cooling',
        'n_days_insufficient_data', 'n_core_cooling_days', 'n_core_heating_days',
        'baseline_percentile_core_cooling_comfort_temperature',
        'baseline_percentile_core_heating_comfort_temperature',
        'regional_average_baseline_cooling_comfort_temperature',
        'regional_average_baseline_heating_comfort_temperature',
        'percent_savings_baseline_percentile',
        'avoided_daily_mean_core_day_runtime_baseline_percentile',
        'avoided_total_core_day_runtime_baseline_percentile',
        'baseline_daily_mean_core_day_runtime_baseline_percentile',
        'baseline_total_core_day_runtime_baseline_percentile',
        '_daily_mean_core_day_demand_baseline_baseline_percentile',
        'percent_savings_baseline_regional',
        'avoided_daily_mean_core_day_runtime_baseline_regional',
        'avoided_total_core_day_runtime_baseline_regional',
        'baseline_daily_mean_core_day_runtime_baseline_regional',
        'baseline_total_core_day_runtime_baseline_regional',
        '_daily_mean_core_day_demand_baseline_baseline_regional',
        'mean_demand', 'alpha', 'tau', 'mean_sq_err', 'root_mean_sq_err',
        'cv_root_mean_sq_err', 'mean_abs_err', 'mean_abs_pct_err',
        'total_core_cooling_runtime', 'total_core_heating_runtime',
        'total_auxiliary_heating_core_day_runtime',
        'total_emergency_heating_core_day_runtime',
        'daily_mean_core_cooling_runtime', 'daily_mean_core_heating_runtime',
        'core_cooling_days_mean_indoor_temperature',
        'core_cooling_days_mean_outdoor_temperature',
        'core_heating_days_mean_indoor_temperature',
        'core_heating_days_mean_outdoor_temperature',
        'core_mean_indoor_temperature', 'core_mean_outdoor_temperature',
    ] + rhu_columns
    string_placeholder = ["PLACEHOLDER"] * n_columns
    zero_column = [0 if randint.rvs(0, 30) > 0 else (None if randint.rvs(0, 2) > 0 else np.inf)
                   for i in randint.rvs(0, 1, size=n_columns)]
    one_column = [1 if randint.rvs(0, 30) > 0 else (None if randint.rvs(0, 2) > 0 else np.inf)
                  for i in randint.rvs(0, 1, size=n_columns)]
    float_column = [i if randint.rvs(0, 30) > 0 else (None if randint.rvs(0, 2) > 0 else np.inf)
                    for i in norm.rvs(size=n_columns)]
    zipcodes = ["01234", "12345", "23456", "34567", "43210", "54321", "65432", "76543"]
    zipcode_column = [i for i in islice(cycle(zipcodes), None, n_columns)]
    core_day_set_names = ["cooling_2012", "heating_2012-2013", "cooling_2013"]
    core_day_set_name_column = [i for i in islice(cycle(core_day_set_names), None, n_columns)]
    data = {
        'sw_version': string_placeholder,
        'ct_identifier': string_placeholder,
        'equipment_type': string_placeholder,
        'heating_or_cooling': core_day_set_name_column,
        'station': string_placeholder,
        'zipcode': zipcode_column,
        'climate_zone': string_placeholder,
        'start_date': datetime(2011, 1, 1),
        'end_date': datetime(2012, 1, 1),
        'n_days_both_heating_and_cooling': one_column,
        'n_days_in_inputfile_date_range': one_column,
        'n_days_insufficient_data': zero_column,
        'n_core_heating_days': one_column,
    }
    # every remaining column (comfort temperatures, savings, runtimes, error metrics
    # and all rhu bins) holds float data; 'n_core_cooling_days' is not filled,
    # so that column stays empty in the frame
    float_names = columns[columns.index('baseline_percentile_core_cooling_comfort_temperature'):]
    data.update({name: float_column for name in float_names})
    df = pd.DataFrame(data, columns=columns)
    return df
def _fit(self, X, y):
    from scipy.stats import randint
    randidx = randint.rvs(0, len(y), size=10)
    counts = np.bincount(randidx)
    self.majority_ = np.argmax(counts)
class TestNCH():
    np.random.seed(2)  # seeds 0 and 1 had some xl = xu; randint failed
    shape = (2, 4, 3)
    max_m = 100
    m1 = np.random.randint(1, max_m, size=shape)    # red balls
    m2 = np.random.randint(1, max_m, size=shape)    # white balls
    N = m1 + m2                                     # total balls
    n = randint.rvs(0, N, size=N.shape)             # number of draws
    xl = np.maximum(0, n - m2)                      # lower bound of support
    xu = np.minimum(n, m1)                          # upper bound of support
    x = randint.rvs(xl, xu, size=xl.shape)
    odds = np.random.rand(*x.shape) * 2

    # test output is more readable when function names (strings) are passed
    @pytest.mark.parametrize('dist_name',
                             ['nchypergeom_fisher', 'nchypergeom_wallenius'])
    def test_nch_hypergeom(self, dist_name):
        # Both noncentral hypergeometric distributions reduce to the
        # hypergeometric distribution when odds = 1
        dists = {'nchypergeom_fisher': nchypergeom_fisher,
                 'nchypergeom_wallenius': nchypergeom_wallenius}
        dist = dists[dist_name]
        x, N, m1, n = self.x, self.N, self.m1, self.n
        assert_allclose(dist.pmf(x, N, m1, n, odds=1),
                        hypergeom.pmf(x, N, m1, n))

    def test_nchypergeom_fisher_naive(self):
        # test against a very simple implementation
        x, N, m1, n, odds = self.x, self.N, self.m1, self.n, self.odds

        @np.vectorize
        def pmf_mean_var(x, N, m1, n, w):
            # simple implementation of nchypergeom_fisher pmf
            m2 = N - m1
            xl = np.maximum(0, n - m2)
            xu = np.minimum(n, m1)

            def f(x):
                t1 = special_binom(m1, x)
                t2 = special_binom(m2, n - x)
                return t1 * t2 * w**x

            def P(k):
                return sum((f(y) * y**k for y in range(xl, xu + 1)))

            P0 = P(0)
            P1 = P(1)
            P2 = P(2)
            pmf = f(x) / P0
            mean = P1 / P0
            var = P2 / P0 - (P1 / P0)**2
            return pmf, mean, var

        pmf, mean, var = pmf_mean_var(x, N, m1, n, odds)
        assert_allclose(nchypergeom_fisher.pmf(x, N, m1, n, odds), pmf)
        assert_allclose(nchypergeom_fisher.stats(N, m1, n, odds, moments='m'), mean)
        assert_allclose(nchypergeom_fisher.stats(N, m1, n, odds, moments='v'), var)

    def test_nchypergeom_wallenius_naive(self):
        # test against a very simple implementation
        np.random.seed(2)
        shape = (2, 4, 3)
        max_m = 100
        m1 = np.random.randint(1, max_m, size=shape)
        m2 = np.random.randint(1, max_m, size=shape)
        N = m1 + m2
        n = randint.rvs(0, N, size=N.shape)
        xl = np.maximum(0, n - m2)
        xu = np.minimum(n, m1)
        x = randint.rvs(xl, xu, size=xl.shape)
        w = np.random.rand(*x.shape) * 2

        def support(N, m1, n, w):
            m2 = N - m1
            xl = np.maximum(0, n - m2)
            xu = np.minimum(n, m1)
            return xl, xu

        @np.vectorize
        def mean(N, m1, n, w):
            m2 = N - m1
            xl, xu = support(N, m1, n, w)

            def fun(u):
                return u / m1 + (1 - (n - u) / m2)**w - 1

            return root_scalar(fun, bracket=(xl, xu)).root

        assert_allclose(nchypergeom_wallenius.mean(N, m1, n, w),
                        mean(N, m1, n, w), rtol=2e-2)

        @np.vectorize
        def variance(N, m1, n, w):
            m2 = N - m1
            u = mean(N, m1, n, w)
            a = u * (m1 - u)
            b = (n - u) * (u + m2 - n)
            return N * a * b / ((N - 1) * (m1 * b + m2 * a))

        assert_allclose(nchypergeom_wallenius.stats(N, m1, n, w, moments='v'),
                        variance(N, m1, n, w), rtol=5e-2)

        @np.vectorize
        def pmf(x, N, m1, n, w):
            m2 = N - m1
            xl, xu = support(N, m1, n, w)

            def integrand(t):
                D = w * (m1 - x) + (m2 - (n - x))
                res = (1 - t**(w / D))**x * (1 - t**(1 / D))**(n - x)
                return res

            def f(x):
                t1 = special_binom(m1, x)
                t2 = special_binom(m2, n - x)
                the_integral = quad(integrand, 0, 1, epsrel=1e-16, epsabs=1e-16)
                return t1 * t2 * the_integral[0]

            return f(x)

        pmf0 = pmf(x, N, m1, n, w)
        pmf1 = nchypergeom_wallenius.pmf(x, N, m1, n, w)

        atol, rtol = 1e-6, 1e-6
        i = np.abs(pmf1 - pmf0) < atol + rtol * np.abs(pmf0)
        assert (i.sum() > np.prod(shape) / 2)  # works at least half the time

        # for those that fail, discredit the naive implementation
        for N, m1, n, w in zip(N[~i], m1[~i], n[~i], w[~i]):
            # get the support
            m2 = N - m1
            xl, xu = support(N, m1, n, w)
            x = np.arange(xl, xu + 1)
            # calculate sum of pmf over the support
            # the naive implementation is very wrong in these cases
            assert pmf(x, N, m1, n, w).sum() < .5
            assert_allclose(nchypergeom_wallenius.pmf(x, N, m1, n, w).sum(), 1)

    def test_wallenius_against_mpmath(self):
        # precompute data with mpmath since naive implementation above
        # is not reliable. See source code in gh-13330.
        M = 50
        n = 30
        N = 20
        odds = 2.25
        # Expected results, computed with mpmath.
        sup = np.arange(21)
        pmf = np.array([3.699003068656875e-20,
                        5.89398584245431e-17,
                        2.1594437742911123e-14,
                        3.221458044649955e-12,
                        2.4658279241205077e-10,
                        1.0965862603981212e-08,
                        3.057890479665704e-07,
                        5.622818831643761e-06,
                        7.056482841531681e-05,
                        0.000618899425358671,
                        0.003854172932571669,
                        0.01720592676256026,
                        0.05528844897093792,
                        0.12772363313574242,
                        0.21065898367825722,
                        0.24465958845359234,
                        0.1955114898110033,
                        0.10355390084949237,
                        0.03414490375225675,
                        0.006231989845775931,
                        0.0004715577304677075])
        mean = 14.808018384813426
        var = 2.6085975877923717

        # nchypergeom_wallenius.pmf returns 0 for pmf(0) and pmf(1), and pmf(2)
        # has only three digits of accuracy (~ 2.1511e-14).
        assert_allclose(nchypergeom_wallenius.pmf(sup, M, n, N, odds), pmf,
                        rtol=1e-13, atol=1e-13)
        assert_allclose(nchypergeom_wallenius.mean(M, n, N, odds),
                        mean, rtol=1e-13)
        assert_allclose(nchypergeom_wallenius.var(M, n, N, odds),
                        var, rtol=1e-11)

    @pytest.mark.parametrize('dist_name',
                             ['nchypergeom_fisher', 'nchypergeom_wallenius'])
    def test_rvs_shape(self, dist_name):
        # Check that when given a size with more dimensions than the
        # dimensions of the broadcast parameters, rvs returns an array
        # with the correct shape.
        dists = {'nchypergeom_fisher': nchypergeom_fisher,
                 'nchypergeom_wallenius': nchypergeom_wallenius}
        dist = dists[dist_name]
        x = dist.rvs(50, 30, [[10], [20]], [0.5, 1.0, 2.0], size=(5, 1, 2, 3))
        assert x.shape == (5, 1, 2, 3)
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import beta, randint


def plot_1D_function(x, y, y_name='y'):
    ax = plt.subplot(111)
    ax.plot(x, y, label=y_name)
    plt.legend(loc='best')
    plt.show()


a_1 = np.linspace(0, 10, 100)
a_2 = np.linspace(0, 10, 100)
b_1 = np.linspace(0, 10, 100)
b_2 = np.linspace(0, 10, 100)
pi = np.linspace(0, 1, 10)
input_space = np.linspace(0, 1, 1000)

for i in range(5):
    pi_rvs = randint.rvs(0, 10)
    a_1_rvs = randint.rvs(0, 100)
    a_2_rvs = randint.rvs(0, 100)
    b_1_rvs = randint.rvs(0, 100)
    b_2_rvs = randint.rvs(0, 100)
    bibeta_example_pdf = (pi[pi_rvs] * beta.pdf(input_space, a_1[a_1_rvs], b_1[b_1_rvs])
                          + (1 - pi[pi_rvs]) * beta.pdf(input_space, a_2[a_2_rvs], b_2[b_2_rvs]))
    bibeta_example_cdf = (pi[pi_rvs] * beta.cdf(input_space, a_1[a_1_rvs], b_1[b_1_rvs])
                          + (1 - pi[pi_rvs]) * beta.cdf(input_space, a_2[a_2_rvs], b_2[b_2_rvs]))
    ax = plt.subplot(111)
    ax.plot(input_space, pi[pi_rvs] * beta.pdf(input_space, a_1[a_1_rvs], b_1[b_1_rvs]),
            label="1 comp pdf")
    ax.plot(input_space, (1 - pi[pi_rvs]) * beta.pdf(input_space, a_2[a_2_rvs], b_2[b_2_rvs]),
            label="2 comp pdf")
    ax.plot(input_space, bibeta_example_pdf, label="Mix pdf")
    plt.legend(loc='best')
    plt.show()
    ax = plt.subplot(111)
    ax.plot(input_space, pi[pi_rvs] * beta.cdf(input_space, a_1[a_1_rvs], b_1[b_1_rvs]),
            label="1 comp cdf")
    ax.plot(input_space, (1 - pi[pi_rvs]) * beta.cdf(input_space, a_2[a_2_rvs], b_2[b_2_rvs]),
            label="2 comp cdf")
def get_fake_output_df(n_columns):
    columns = [
        'sw_version', 'ct_identifier', 'equipment_type', 'heating_or_cooling', 'station',
        'zipcode', 'climate_zone', 'start_date', 'end_date',
        'n_days_in_inputfile_date_range', 'n_days_both_heating_and_cooling',
        'n_days_insufficient_data', 'n_core_cooling_days', 'n_core_heating_days',
        'baseline_percentile_core_cooling_comfort_temperature',
        'baseline_percentile_core_heating_comfort_temperature',
        'regional_average_baseline_cooling_comfort_temperature',
        'regional_average_baseline_heating_comfort_temperature',
        'percent_savings_baseline_percentile',
        'avoided_daily_mean_core_day_runtime_baseline_percentile',
        'avoided_total_core_day_runtime_baseline_percentile',
        'baseline_daily_mean_core_day_runtime_baseline_percentile',
        'baseline_total_core_day_runtime_baseline_percentile',
        '_daily_mean_core_day_demand_baseline_baseline_percentile',
        'percent_savings_baseline_regional',
        'avoided_daily_mean_core_day_runtime_baseline_regional',
        'avoided_total_core_day_runtime_baseline_regional',
        'baseline_daily_mean_core_day_runtime_baseline_regional',
        'baseline_total_core_day_runtime_baseline_regional',
        '_daily_mean_core_day_demand_baseline_baseline_regional',
        'mean_demand', 'alpha', 'tau', 'mean_sq_err', 'root_mean_sq_err',
        'cv_root_mean_sq_err', 'mean_abs_err', 'mean_abs_pct_err',
        'total_core_cooling_runtime', 'total_core_heating_runtime',
        'total_auxiliary_heating_core_day_runtime',
        'total_emergency_heating_core_day_runtime',
        'daily_mean_core_cooling_runtime', 'daily_mean_core_heating_runtime',
        'rhu_00F_to_05F', 'rhu_05F_to_10F', 'rhu_10F_to_15F', 'rhu_15F_to_20F',
        'rhu_20F_to_25F', 'rhu_25F_to_30F', 'rhu_30F_to_35F', 'rhu_35F_to_40F',
        'rhu_40F_to_45F', 'rhu_45F_to_50F', 'rhu_50F_to_55F', 'rhu_55F_to_60F',
    ]
    string_placeholder = ["PLACEHOLDER"] * n_columns
    zero_column = [0 if randint.rvs(0, 30) > 0 else (None if randint.rvs(0, 2) > 0 else np.inf)
                   for i in randint.rvs(0, 1, size=n_columns)]
    one_column = [1 if randint.rvs(0, 30) > 0 else (None if randint.rvs(0, 2) > 0 else np.inf)
                  for i in randint.rvs(0, 1, size=n_columns)]
    float_column = [i if randint.rvs(0, 30) > 0 else (None if randint.rvs(0, 2) > 0 else np.inf)
                    for i in norm.rvs(size=n_columns)]
    zipcodes = ["01234", "12345", "23456", "34567", "43210", "54321", "65432", "76543"]
    zipcode_column = [i for i in islice(cycle(zipcodes), None, n_columns)]
    core_day_set_names = ["cooling_2012", "heating_2012-2013", "cooling_2013"]
    core_day_set_name_column = [i for i in islice(cycle(core_day_set_names), None, n_columns)]
    data = {
        'sw_version': string_placeholder,
        'ct_identifier': string_placeholder,
        'equipment_type': string_placeholder,
        'heating_or_cooling': core_day_set_name_column,
        'station': string_placeholder,
        'zipcode': zipcode_column,
        'climate_zone': string_placeholder,
        'start_date': datetime(2011, 1, 1),
        'end_date': datetime(2012, 1, 1),
        'n_days_both_heating_and_cooling': one_column,
        'n_days_in_inputfile_date_range': one_column,
        'n_days_insufficient_data': zero_column,
        'n_core_heating_days': one_column,
    }
    # every remaining column (comfort temperatures, savings, runtimes, error metrics
    # and rhu bins) holds float data; 'n_core_cooling_days' is not filled,
    # so that column stays empty in the frame
    float_names = columns[columns.index('baseline_percentile_core_cooling_comfort_temperature'):]
    data.update({name: float_column for name in float_names})
    df = pd.DataFrame(data, columns=columns)
    return df
def get_fake_output_df(n_columns): columns = [ 'ct_identifier', 'equipment_type', 'season_name', 'station', 'zipcode', 'n_days_in_season_range', 'n_days_in_season', 'n_days_both_heating_and_cooling', 'n_days_insufficient_data', 'seasonal_savings_dailyavgCDD', 'seasonal_savings_dailyavgHDD', 'seasonal_savings_deltaT', 'seasonal_savings_hourlyavgCDD', 'seasonal_savings_hourlyavgHDD', 'seasonal_avoided_runtime_dailyavgCDD', 'seasonal_avoided_runtime_dailyavgHDD', 'seasonal_avoided_runtime_deltaT', 'seasonal_avoided_runtime_hourlyavgCDD', 'seasonal_avoided_runtime_hourlyavgHDD', 'total_auxiliary_heating_runtime', 'total_cooling_runtime', 'total_emergency_heating_runtime', 'total_heating_runtime', 'actual_daily_runtime', 'actual_seasonal_runtime', 'baseline_comfort_temperature', 'baseline_daily_runtime_dailyavgCDD', 'baseline_daily_runtime_dailyavgHDD', 'baseline_daily_runtime_deltaT', 'baseline_daily_runtime_hourlyavgCDD', 'baseline_daily_runtime_hourlyavgHDD', 'baseline_seasonal_runtime_dailyavgCDD', 'baseline_seasonal_runtime_dailyavgHDD', 'baseline_seasonal_runtime_deltaT', 'baseline_seasonal_runtime_hourlyavgCDD', 'baseline_seasonal_runtime_hourlyavgHDD', 'mean_demand_dailyavgCDD', 'mean_demand_dailyavgHDD', 'mean_demand_deltaT', 'mean_demand_hourlyavgCDD', 'mean_demand_hourlyavgHDD', 'mean_demand_baseline_dailyavgCDD', 'mean_demand_baseline_dailyavgHDD', 'mean_demand_baseline_deltaT', 'mean_demand_baseline_hourlyavgCDD', 'mean_demand_baseline_hourlyavgHDD', 'rhu_00F_to_05F', 'rhu_05F_to_10F', 'rhu_10F_to_15F', 'rhu_15F_to_20F', 'rhu_20F_to_25F', 'rhu_25F_to_30F', 'rhu_30F_to_35F', 'rhu_35F_to_40F', 'rhu_40F_to_45F', 'rhu_45F_to_50F', 'rhu_50F_to_55F', 'rhu_55F_to_60F', 'slope_deltaT', 'alpha_est_dailyavgCDD', 'alpha_est_dailyavgHDD', 'alpha_est_hourlyavgCDD', 'alpha_est_hourlyavgHDD', 'intercept_deltaT', 'deltaT_base_est_dailyavgCDD', 'deltaT_base_est_dailyavgHDD', 'deltaT_base_est_hourlyavgCDD', 'deltaT_base_est_hourlyavgHDD', 'mean_sq_err_dailyavgCDD', 'mean_sq_err_dailyavgHDD', 'mean_sq_err_deltaT', 'mean_sq_err_hourlyavgCDD', 'mean_sq_err_hourlyavgHDD', 'root_mean_sq_err_dailyavgCDD', 'root_mean_sq_err_dailyavgHDD', 'root_mean_sq_err_deltaT', 'root_mean_sq_err_hourlyavgCDD', 'root_mean_sq_err_hourlyavgHDD', 'cv_root_mean_sq_err_dailyavgCDD', 'cv_root_mean_sq_err_dailyavgHDD', 'cv_root_mean_sq_err_deltaT', 'cv_root_mean_sq_err_hourlyavgCDD', 'cv_root_mean_sq_err_hourlyavgHDD', 'mean_abs_err_dailyavgCDD', 'mean_abs_err_dailyavgHDD', 'mean_abs_err_deltaT', 'mean_abs_err_hourlyavgCDD', 'mean_abs_err_hourlyavgHDD', 'mean_abs_pct_err_dailyavgCDD', 'mean_abs_pct_err_dailyavgHDD', 'mean_abs_pct_err_deltaT', 'mean_abs_pct_err_hourlyavgCDD', 'mean_abs_pct_err_hourlyavgHDD', ] string_placeholder = ["PLACEHOLDER"] * n_columns zero_column = [0 if randint.rvs(0, 30) > 0 else (None if randint.rvs(0, 2) > 0 else np.inf) for i in randint.rvs(0, 1, size=n_columns)] one_column = [1 if randint.rvs(0, 30) > 0 else (None if randint.rvs(0, 2) > 0 else np.inf) for i in randint.rvs(0, 1, size=n_columns)] float_column = [i if randint.rvs(0, 30) > 0 else (None if randint.rvs(0, 2) > 0 else np.inf) for i in norm.rvs(size=n_columns)] zipcodes = ["01234", "12345", "23456", "34567", "43210", "54321", "65432", "76543"] zipcode_column = [i for i in islice(cycle(zipcodes), None, n_columns)] season_names = ["Cooling 2012", "Heating 2012-2013", "Cooling 2013"] season_name_column = [i for i in islice(cycle(season_names), None, n_columns)] data = { "ct_identifier": string_placeholder, "equipment_type": string_placeholder, 
"season_name": season_name_column, "station": string_placeholder, "zipcode": zipcode_column, "n_days_both_heating_and_cooling": one_column, "n_days_in_season": one_column, "n_days_in_season_range": one_column, "n_days_insufficient_data": zero_column, "seasonal_savings_deltaT": float_column, "seasonal_savings_dailyavgCDD": float_column, "seasonal_savings_dailyavgHDD": float_column, "seasonal_savings_hourlyavgCDD": float_column, "seasonal_savings_hourlyavgHDD": float_column, "seasonal_avoided_runtime_deltaT": float_column, "seasonal_avoided_runtime_dailyavgCDD": float_column, "seasonal_avoided_runtime_dailyavgHDD": float_column, "seasonal_avoided_runtime_hourlyavgCDD": float_column, "seasonal_avoided_runtime_hourlyavgHDD": float_column, "total_heating_runtime": float_column, "total_cooling_runtime": float_column, "total_auxiliary_heating_runtime": float_column, "total_emergency_heating_runtime": float_column, "actual_daily_runtime": float_column, "actual_seasonal_runtime": float_column, "baseline_comfort_temperature": float_column, "baseline_daily_runtime_deltaT": float_column, "baseline_daily_runtime_dailyavgCDD": float_column, "baseline_daily_runtime_dailyavgHDD": float_column, "baseline_daily_runtime_hourlyavgCDD": float_column, "baseline_daily_runtime_hourlyavgHDD": float_column, "baseline_seasonal_runtime_deltaT": float_column, "baseline_seasonal_runtime_dailyavgCDD": float_column, "baseline_seasonal_runtime_dailyavgHDD": float_column, "baseline_seasonal_runtime_hourlyavgCDD": float_column, "baseline_seasonal_runtime_hourlyavgHDD": float_column, "mean_demand_dailyavgCDD": float_column, "mean_demand_dailyavgHDD": float_column, "mean_demand_deltaT": float_column, "mean_demand_hourlyavgCDD": float_column, "mean_demand_hourlyavgHDD": float_column, "mean_demand_baseline_dailyavgCDD": float_column, "mean_demand_baseline_dailyavgHDD": float_column, "mean_demand_baseline_deltaT": float_column, "mean_demand_baseline_hourlyavgCDD": float_column, "mean_demand_baseline_hourlyavgHDD": float_column, "rhu_00F_to_05F": float_column, "rhu_05F_to_10F": float_column, "rhu_10F_to_15F": float_column, "rhu_15F_to_20F": float_column, "rhu_20F_to_25F": float_column, "rhu_25F_to_30F": float_column, "rhu_30F_to_35F": float_column, "rhu_35F_to_40F": float_column, "rhu_40F_to_45F": float_column, "rhu_45F_to_50F": float_column, "rhu_50F_to_55F": float_column, "rhu_55F_to_60F": float_column, "slope_deltaT": float_column, "alpha_est_dailyavgCDD": float_column, "alpha_est_dailyavgHDD": float_column, "alpha_est_hourlyavgCDD": float_column, "alpha_est_hourlyavgHDD": float_column, "intercept_deltaT": float_column, "deltaT_base_est_dailyavgCDD": float_column, "deltaT_base_est_dailyavgHDD": float_column, "deltaT_base_est_hourlyavgCDD": float_column, "deltaT_base_est_hourlyavgHDD": float_column, "mean_sq_err_dailyavgCDD": float_column, "mean_sq_err_dailyavgHDD": float_column, "mean_sq_err_deltaT": float_column, "mean_sq_err_hourlyavgCDD": float_column, "mean_sq_err_hourlyavgHDD": float_column, "root_mean_sq_err_dailyavgCDD": float_column, "root_mean_sq_err_dailyavgHDD": float_column, "root_mean_sq_err_deltaT": float_column, "root_mean_sq_err_hourlyavgCDD": float_column, "root_mean_sq_err_hourlyavgHDD": float_column, "cv_root_mean_sq_err_dailyavgCDD": float_column, "cv_root_mean_sq_err_dailyavgHDD": float_column, "cv_root_mean_sq_err_deltaT": float_column, "cv_root_mean_sq_err_hourlyavgCDD": float_column, "cv_root_mean_sq_err_hourlyavgHDD": float_column, "mean_abs_err_dailyavgCDD": float_column, "mean_abs_err_dailyavgHDD": 
float_column, "mean_abs_err_deltaT": float_column, "mean_abs_err_hourlyavgCDD": float_column, "mean_abs_err_hourlyavgHDD": float_column, "mean_abs_pct_err_dailyavgCDD": float_column, "mean_abs_pct_err_dailyavgHDD": float_column, "mean_abs_pct_err_deltaT": float_column, "mean_abs_pct_err_hourlyavgCDD": float_column, "mean_abs_pct_err_hourlyavgHDD": float_column, } df = pd.DataFrame(data, columns=columns) return df
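# A minimal smoke test for the fixture above -- a sketch only, assuming the same
# names the function relies on (pandas as pd, numpy as np, scipy.stats randint/norm,
# itertools cycle/islice) are already in scope; the column checked is taken from the
# `columns` list above.
df = get_fake_output_df(20)
print(len(df), len(df.columns))  # 20 rows, one column per entry in `columns`
col = df["seasonal_savings_deltaT"].astype(float)
# the fixture occasionally injects None (NaN after construction) or np.inf
print(col.isna().sum(), np.isinf(col).sum())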
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from scipy.stats import randint

# normal
distribution = stats.norm(loc=100, scale=5)
print(distribution.stats("mvsk"))

# skewed
distribution = stats.gengamma(100, 90, loc=50, scale=10)
print(distribution.stats("mvsk"))
sample = distribution.rvs(size=10000)

# discrete uniform integers drawn from [0, 208)
sample = randint.rvs(0, 208, size=1000)

pers = np.arange(1, 101, 1)
# Make each of the last 41 elements 5x more likely
prob = np.array([1.0] * (len(pers) - 41) + [5.0] * 41)
# Normalising to 1.0
prob /= np.sum(prob)
sample = np.random.choice(pers, 1000, p=prob)
plt.hist(sample)
plt.show()
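# Sanity check (a sketch): scipy.stats.randint draws integers on the half-open
# interval [low, high), so the draw above never returns 208; the weighted
# np.random.choice sample, by contrast, is restricted to the values in `pers`.
check = randint.rvs(0, 208, size=100000)
print(check.min(), check.max())    # expected: 0 and 207
print(sample.min(), sample.max())  # expected: within 1..100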
def z_enrichment_test(node_pvals_dict, the_grouping_dict, **kwargs):
    """Perform enrichment analysis on the groupings in the_grouping_dict,
    using statistical aggregation based on the z-scores.

    Note this is similar to the statistical subnetwork scoring system used in:
    Ideker, T., Ozier, O., Schwikowski, B., & Siegel, A. (2002). Discovering
    regulatory and signalling circuits in molecular interaction networks.
    Bioinformatics, 18, 233–240.
    Except here we use the pre-defined groupings in the_grouping_dict rather
    than scanning for novel subnetworks.

    Arguments:
     node_pvals_dict: must have a subdictionary of node_id:
      'p_uncorrected': (or 'p').  Note the p-value should be from a two-tailed
       test for changes.
      't': optional, to deal with two-tailedness if doing signed z aggregation.
       p-values are assumed to result from two-tail tests and span [0 - 1].
      'z' is optional; it is needed if 'use_type' = z.
     the_grouping_dict: a dict of subsystem_id: [node_id_1, node_id_2, ...]

    kwargs:
     navg_node_sample: default is 100.0.  The groupings are randomized such
      that each gene is sampled an average of 100 times.
     diagnostic: [False (default), True]: if True, a list of the randomly
      generated p-values will also be returned.
     aggregation_type: options are 'signed' or 'unsigned'
      'unsigned': the z-value ranges from 0 to +inf.  This method picks out
       gross changes in subsystems and ignores whether they are increasing or
       decreasing.
      'signed': the z-value ranges from -inf to +inf.  This method picks out
       coordinated changes in subsystems.

    Returns:
     grouping_scores_dict with keys
      agg_z = the aggregated z-value without background correction
      agg_adj_z = the aggregated z-value with background correction
      agg_p = a p-value resulting from a two-tail test for changes (e.g. near
       0 is more significant) assuming the agg_adj_z is truly normally
       distributed.
     if diagnostic = True, random_scores_to_return is also returned.
    """
    from numpy import nan, sign, mean, std, array, inf, zeros
    from numpy.random import rand
    from copy import deepcopy
    from random import sample, shuffle
    from scipy.stats import norm, randint

    diagnostic = test_kwarg('diagnostic', kwargs, [False, True])
    aggregation_type = test_kwarg('aggregation_type', kwargs, ['unsigned', 'signed'])
    if 'navg_node_sample' in kwargs:
        navg_node_sample = kwargs['navg_node_sample']
    else:
        navg_node_sample = 100.0

    grouping_scores_dict = {}
    node_pvals_dictl = deepcopy(node_pvals_dict)
    random_scores_to_return = {}

    # First get the subsystems
    for subsystem in the_grouping_dict.keys():
        # Enforce lower case to avoid duplication
        subsystem = subsystem.lower()
        if subsystem not in grouping_scores_dict:
            grouping_scores_dict[subsystem] = {}
            grouping_scores_dict[subsystem]['ind_p'] = []
            if aggregation_type == 'signed':
                grouping_scores_dict[subsystem]['ind_t'] = []
            grouping_scores_dict[subsystem]['agg_p'] = nan
            grouping_scores_dict[subsystem]['agg_z'] = nan
            grouping_scores_dict[subsystem]['agg_adj_z'] = nan
            grouping_scores_dict[subsystem]['ind_node'] = []

    # Now add in pvals.
    # Give preference to uncorrected p-values since Bonferroni corrected values
    # are truncated at 1.  We don't need a multiple testing correction since we
    # are looking across the subsystem, and correcting would potentially lose
    # information in detected differences.  We re-normalize p-values for
    # subnetworks according to an empirical null distribution at the end.
    test_node = next(iter(node_pvals_dictl))
    if 'p_uncorrected' in node_pvals_dictl[test_node]:
        p_key = 'p_uncorrected'
    else:
        p_key = 'p'

    for subsystem in the_grouping_dict.keys():
        subsystem_lower = subsystem.lower()
        for the_node in the_grouping_dict[subsystem]:
            if the_node in node_pvals_dictl:
                grouping_scores_dict[subsystem_lower]['ind_p'].append(node_pvals_dictl[the_node][p_key])
                grouping_scores_dict[subsystem_lower]['ind_node'].append(the_node)
                if aggregation_type == 'signed':
                    grouping_scores_dict[subsystem_lower]['ind_t'].append(node_pvals_dictl[the_node]['t'])

    # Now aggregate.  Make a lookup table of size vs p values
    maxk = 0
    for subsystem in grouping_scores_dict:
        if len(grouping_scores_dict[subsystem]['ind_p']) > maxk:
            maxk = len(grouping_scores_dict[subsystem]['ind_p'])
    meanlookup = []
    sdlookup = []
    # Haven't pressure tested the window with even numbers,
    # but odd values make more sense anyway.
    windowsize = 5
    maxsize = int(maxk + 1 + int(round((windowsize - 1) / 2)))

    # It can be slow to compute system statistics, so we need to be selective
    # and just evaluate around the sample sizes of interest.
    k_to_evaluate = []
    for subsystem in grouping_scores_dict:
        k = len(grouping_scores_dict[subsystem]['ind_p'])
        if k > 0:
            k = list(range(max(1, (k - (windowsize - 1) // 2)), (k + ((windowsize - 1) // 2) + 1)))
            k_to_evaluate.extend(deepcopy(k))
    k_to_evaluate = list(set(k_to_evaluate))
    k_to_evaluate.sort()

    # To speed calculations pre-convert to a z-score
    node_list = list(node_pvals_dictl.keys())
    pval_list = [node_pvals_dictl[curnode][p_key] for curnode in node_list]
    # Want to replace 0 or 1 pvals: use the next nearest observed value
    filter_pval_list = [x for x in pval_list if ((x > 0) & (x < 1))]
    min_val = min(filter_pval_list)
    max_val = max(filter_pval_list)
    for i, x in enumerate(pval_list):
        if x >= 1:
            pval_list[i] = max_val
        if x <= 0:
            pval_list[i] = min_val

    if aggregation_type == 'signed':
        # Here, we aggregate using p-values resulting from one-tailed tests,
        # where p = 0.5 means no change and decreases in expression imply
        # negative z when aggregating.  Convert our z first then take the sign
        # to minimize numerical issues.  This method is equivalent to
        # Stouffer's method.
        print("Warning, verify assumptions for signed averaging, this has not been done in a while.")
        zval_list = norm.ppf(pval_list)
        tval_list = [node_pvals_dictl[curnode]['t'] for curnode in node_list]
        for index, t_val in enumerate(tval_list):
            if t_val > 0:
                zval_list[index] = -1 * zval_list[index]
    else:
        # Ideker 2002 and also Patil 2005 use an undirected p-value when
        # aggregating Z-scores for p-values - e.g. they use the "significance
        # of the change", where more negative z corresponds to p ~ 1 and
        # little change.
        zval_list = -1 * norm.ppf(pval_list)

    for k in k_to_evaluate:
        print('Simulating measures for subsystem number ' + str(k_to_evaluate.index(k) + 1) +
              ' of ' + str(len(k_to_evaluate)) + '.')
        r_z_values = []
        # Should need more trials with small k.  Rule of thumb: set the size so
        # all model genes are sampled on average > 7x.  This and windowsize = 5
        # seem to result in fairly stable statistics, from trial-and-error.
        ntrials = int(round(navg_node_sample * float(len(node_pvals_dictl)) / float(k)))
        # Generate random indices between 0 and nmeasures - 1,
        # as an array of ntrials rows and k columns
        the_random_indices = randint.rvs(0, len(node_list), size=(ntrials, k))
        # Faster to do this here as an array operation than call stouffer_z_agg
        random_score_distribution = array([sum(zval_list[x]) for x in list(the_random_indices)]) / (k ** 0.5)
        meanlookup.append(mean(random_score_distribution))
        # Note SD's defined by this method are approximately size-independent
        sdlookup.append(std(random_score_distribution))
        if diagnostic:
            random_score_distribution.sort()
            random_scores_to_return[k] = random_score_distribution

    # The SD as assessed here should be independent of size;
    # apply a smoothing filter here first
    sdlookup = list(smooth(array(sdlookup), window_len=windowsize))
    # The mean will be dependent on k**.5; to avoid edge effects of the window,
    # first normalize then apply the smoothing filter
    for k in k_to_evaluate:
        meanlookup[k_to_evaluate.index(k)] = meanlookup[k_to_evaluate.index(k)] / (k ** 0.5)
    meanlookup = list(smooth(array(meanlookup), window_len=windowsize))
    for k in k_to_evaluate:
        meanlookup[k_to_evaluate.index(k)] = meanlookup[k_to_evaluate.index(k)] * (k ** 0.5)  # Re-normalize the mean before averaging

    for subsystem in grouping_scores_dict:
        k = len(grouping_scores_dict[subsystem]['ind_p'])
        if k > 0:
            node_indices = [index for index, node in enumerate(node_list)
                            if node in grouping_scores_dict[subsystem]['ind_node']]
            grouping_scores_dict[subsystem]['agg_z'] = sum([zval_list[index] for index in node_indices]) / (k ** 0.5)
            grouping_scores_dict[subsystem]['agg_adj_z'] = (
                (grouping_scores_dict[subsystem]['agg_z'] - meanlookup[k_to_evaluate.index(k)]) /
                sdlookup[k_to_evaluate.index(k)])
            grouping_scores_dict[subsystem]['agg_p'] = norm.cdf(grouping_scores_dict[subsystem]['agg_adj_z'])
            # Convert back to a two-sided p value; this is twice the one-sided value
            if aggregation_type == 'signed':
                if grouping_scores_dict[subsystem]['agg_p'] < .5:
                    grouping_scores_dict[subsystem]['agg_p'] = 2 * grouping_scores_dict[subsystem]['agg_p']
                else:
                    grouping_scores_dict[subsystem]['agg_p'] = 2 * (1 - grouping_scores_dict[subsystem]['agg_p'])
            else:
                grouping_scores_dict[subsystem]['agg_p'] = 1 - grouping_scores_dict[subsystem]['agg_p']
            if diagnostic:
                grouping_scores_dict[subsystem]['z'] = norm.cdf(k_to_evaluate.index(k))

    if not diagnostic:
        return grouping_scores_dict
    else:
        return grouping_scores_dict, random_scores_to_return
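# The core of z_enrichment_test is Ideker-style unsigned Stouffer aggregation with an
# empirical background correction.  A compact, self-contained sketch of that idea on toy
# p-values; the function and variable names here are illustrative only (not part of the
# module above), and the smoothing/windowing of the lookup tables is omitted.
import numpy as np
from scipy.stats import norm, randint

def aggregate_group_z(p_values, group_indices, n_random=10000, seed=0):
    """Unsigned aggregate z for one group, corrected against random groups of equal size."""
    p = np.clip(np.asarray(p_values, dtype=float), 1e-15, 1 - 1e-15)
    z = -norm.ppf(p)                            # small p -> large positive z
    k = len(group_indices)
    agg_z = z[group_indices].sum() / np.sqrt(k)
    # empirical null: aggregate k randomly chosen nodes many times
    random_idx = randint.rvs(0, len(z), size=(n_random, k), random_state=seed)
    null = z[random_idx].sum(axis=1) / np.sqrt(k)
    agg_adj_z = (agg_z - null.mean()) / null.std()
    return agg_adj_z, 1 - norm.cdf(agg_adj_z)   # one-sided p, as in the 'unsigned' branch

p_toy = [0.01, 0.2, 0.03, 0.5, 0.9, 0.04]
print(aggregate_group_z(p_toy, group_indices=np.array([0, 2, 5])))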
from scipy.stats import binom, norm, randint

def distrib(param1, param2, param3, param4, egg):
    # param1 : list of element ids
    # param2 : property configuration
    # param3 : property name
    # param4 : snapshot id
    # egg    : dict of element -> property -> list of values per snapshot (updated in place)
    ############################
    # qualitative domain ("qualitatif"): begin
    ############################
    if param2['domain']['type'] == "qualitatif":
        ############################
        # qualitative, unordered: begin
        ############################
        if param2['domain']['order'] == "false":
            no_succ_elements = list()
            succ_elements = list()
            for succ_key in param2["evolution"]["succesors"]:
                succ_elements.append(list())
            ######################### split the elements into several sets according to the succession rule
            for param1_element in param1:
                succ_index = 0
                bool_succ = False
                for succ_key in param2["evolution"]["succesors"]:
                    value_pr = egg[param1_element][param3][param4 - 1]
                    if succ_key == value_pr:
                        succ_elements[succ_index].append(param1_element)
                        bool_succ = True
                    succ_index = succ_index + 1
                if bool_succ == False:
                    no_succ_elements.append(param1_element)
            ######################### end
            ######################### assign values to the elements with no successor rule
            try:
                random = list(randint.rvs(0, len(param2['domain']['values']), size=len(no_succ_elements)))
            except Exception:
                print('\n***************\n*\n* Error: Check the configuration of property: ', param3, '\n*\n***************')
            for elementId in no_succ_elements:
                egg[elementId][param3].insert(param4, param2['domain']['values'][random.pop()])
            ######################### end
            ######################### assign values to the elements of succ_elements
            succ_index = 0
            for succ_key in param2["evolution"]["succesors"]:
                succ_list = succ_elements[succ_index]
                try:
                    random = list(randint.rvs(0, len(param2['evolution']['succesors'][succ_key]), size=len(succ_list)))
                except Exception:
                    print('\n***************\n*\n* Error: Check the configuration of property: ', param3, '\n*\n***************')
                for elementId in succ_list:
                    egg[elementId][param3].insert(param4, param2['evolution']['succesors'][succ_key][random.pop()])
                succ_index = succ_index + 1
            ######################### end
        ############################
        # qualitative, unordered: end
        ############################
        ############################
        # qualitative, ordered: begin
        ############################
        else:
            offset_list = list()
            try:
                # the +1 is needed so that m reaches the largest offset
                for m in range(0, param2['evolution']['offset']['max'] - param2['evolution']['offset']['min'] + 1):
                    offset_list.append(param2['evolution']['offset']['min'] + m)
            except Exception:
                print('\n***************\n*\n* Error: Check the configuration of property: ', param3, '\n*\n***************')
            ############################
            # uniform: begin
            ############################
            if param2['evolution']['offset']['distribution']['type'] == "uniform":
                try:
                    random = randint.rvs(0, len(offset_list), size=len(param1))
                except Exception:
                    print('\n***************\n*\n* Error: Check the configuration of property: ', param3, '\n*\n***************')
                i = 0
                for param1_element in param1:
                    value_pr = egg[param1_element][param3][param4 - 1]  # value of the previous snapshot, needed for the offset
                    indice = param2['domain']['values'].index(value_pr)
                    if len(param2['domain']['values']) - 1 < indice + offset_list[random[i]]:
                        # offset would overshoot the largest index: take the last value
                        egg[param1_element][param3].insert(param4, param2['domain']['values'][len(param2['domain']['values']) - 1])
                    elif indice + offset_list[random[i]] < 0:
                        # offset would undershoot index 0: take the first value
                        egg[param1_element][param3].insert(param4, param2['domain']['values'][0])
                    else:
                        egg[param1_element][param3].insert(param4, param2['domain']['values'][indice + offset_list[random[i]]])
                    i = i + 1
            ############################
            # uniform: end
            ############################
            ############################
            # binom: begin
            ############################
            if param2['evolution']['offset']['distribution']['type'] == "binom":
                try:
                    random = binom.rvs(len(offset_list) - 1, param2['evolution']['offset']['distribution']["p"], size=len(param1))
                except Exception:
                    print('\n***************\n*\n* Error: Check the configuration of property: ', param3, '\n*\n***************')
                i = 0
                for param1_element in param1:
                    value_pr = egg[param1_element][param3][param4 - 1]  # value of the previous snapshot, needed for the offset
                    indice = param2['domain']['values'].index(value_pr)
                    # logging.info(param3 + param1 + str(param4) + str(indice) + str(offset_list) + str(random[0]))
                    if len(param2['domain']['values']) - 1 < indice + offset_list[random[i]]:
                        # offset would overshoot the largest index: take the last value
                        egg[param1_element][param3].insert(param4, param2['domain']['values'][len(param2['domain']['values']) - 1])
                    elif indice + offset_list[random[i]] < 0:
                        # offset would undershoot index 0: take the first value
                        egg[param1_element][param3].insert(param4, param2['domain']['values'][0])
                    else:
                        egg[param1_element][param3].insert(param4, param2['domain']['values'][indice + offset_list[random[i]]])
                    i = i + 1
            ############################
            # binom: end
            ############################
        ############################
        # qualitative, ordered: end
        ############################
    ############################
    # qualitative domain: end
    ############################
    ############################
    # discrete quantitative domain ("quantitatif:dis"): begin
    ############################
    if param2['domain']['type'] == "quantitatif:dis":
        offset_list = list()
        try:
            # the +1 is needed so that m reaches the largest offset
            for m in range(0, param2['evolution']['offset']['max'] - param2['evolution']['offset']['min'] + 1):
                offset_list.append(param2['evolution']['offset']['min'] + m)
        except Exception:
            print('\n***************\n*\n* Error: Check the configuration of property: ', param3, '\n*\n***************')
        try:
            # len(offset_list) - 1 keeps the binomial draw between 0 and the largest index of offset_list
            random = binom.rvs(len(offset_list) - 1, param2['evolution']['offset']['distribution']["p"], size=len(param1))
        except Exception:
            print('\n***************\n*\n* Error: Check the configuration of property: ', param3, '\n*\n***************')
        i = 0
        for param1_element in param1:
            value_pr = egg[param1_element][param3][param4 - 1]  # value of the previous snapshot, needed for the offset
            previous_value = value_pr
            next_value = previous_value + offset_list[random[i]]
            if next_value < param2["domain"]["values"]["min"]:
                egg[param1_element][param3].insert(param4, param2["domain"]["values"]["min"])
            elif next_value > param2["domain"]["values"]["max"]:
                egg[param1_element][param3].insert(param4, param2["domain"]["values"]["max"])
            else:
                egg[param1_element][param3].insert(param4, next_value)
            i = i + 1
    ############################
    # discrete quantitative domain: end
    ############################
    ############################
    # continuous quantitative domain ("quantitatif:con"): begin
    ############################
    if param2['domain']['type'] == "quantitatif:con":
        random = norm.rvs(size=len(param1))
        i = 0
        for param1_element in param1:
            value_pr = egg[param1_element][param3][param4 - 1]  # value of the previous snapshot, needed for the offset
            previous_value = value_pr
            try:
                offset = (random[i] * param2['evolution']['offset']['distribution']['sigma']) + param2['evolution']['offset']['distribution']['mean']
            except Exception:
                print('\n***************\n*\n* Error: Check the configuration of property: ', param3, '\n*\n***************')
            if offset > param2['evolution']['offset']['max']:
                offset = param2['evolution']['offset']['max']
            elif offset < param2['evolution']['offset']['min']:
                offset = param2['evolution']['offset']['min']
            next_value = round(previous_value + offset, 1)
            if next_value < param2["domain"]["values"]["min"]:
                egg[param1_element][param3].insert(param4, param2["domain"]["values"]["min"])
            elif next_value > param2["domain"]["values"]["max"]:
                egg[param1_element][param3].insert(param4, param2["domain"]["values"]["max"])
            else:
                egg[param1_element][param3].insert(param4, next_value)
            i = i + 1
    ############################
    # continuous quantitative domain: end
    ############################
    return egg
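# The "quantitatif:dis" branch above boils down to: draw a binomially distributed index
# into a list of integer offsets, add the chosen offset to the previous value, and clamp
# the result to the domain bounds.  A small stand-alone sketch of that single step; the
# argument names are hypothetical and not taken from the configuration schema above.
import numpy as np
from scipy.stats import binom

def step_discrete(previous_values, offset_min, offset_max, p, domain_min, domain_max, seed=None):
    """One evolution step for a discrete quantitative property."""
    offsets = np.arange(offset_min, offset_max + 1)
    # binom.rvs(n, p) returns an index in [0, n]; n = len(offsets) - 1 keeps it in range
    idx = binom.rvs(len(offsets) - 1, p, size=len(previous_values), random_state=seed)
    return np.clip(np.asarray(previous_values) + offsets[idx], domain_min, domain_max)

print(step_discrete([3, 7, 9], offset_min=-2, offset_max=2, p=0.5,
                    domain_min=0, domain_max=10, seed=1))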
def identify_reporter_metabolites(cobra_model, reaction_scores_dict, number_of_randomizations=1000, scoring_metric='default', score_type='p', entire_network=False, background_correction=True, ignore_external_boundary_reactions=False): """Calculate the aggregate Z-score for the metabolites in the model. Ignore reactions that are solely spontaneous or orphan. Allow the scores to have multiple columns / experiments. This will change the way the output is represented. cobra_model: A cobra.Model object TODO: CHANGE TO USING DICTIONARIES for the_reactions: the_scores reaction_scores_dict: A dictionary where the keys are reactions in cobra_model.reactions and the values are the scores. Currently, only supports a single numeric value as the value; however, this will be updated to allow for lists number_of_randomizations: Integer. Number of random shuffles of the scores to assess which are significant. scoring_metric: default means divide by k**0.5 score_type: 'p' Is the only option at the moment and indicates p-value. entire_network: Boolean. Currently, only compares scores calculated from the_reactions background_correction: Boolean. If True apply background correction to the aggreagate Z-score ignore_external_boundary_reactions: Not yet implemented. Boolean. If True do not count exchange reactions when calculating the score. """ # Add in a function to calculate based on correlation coefficients and to # deal with other multidimensional data. the_reactions = reaction_scores_dict.keys() the_scores = reaction_scores_dict.values() if score_type == 'p' and not hasattr(the_scores[0], '__iter__'): # minimum and maximum p-values are used to prevent numerical problems. # haven't decided whether an arbitrary min / max 1e-15 is preferred to # blunting the ends based on the values closest to 0 or 1. the_reactions = reaction_scores_dict.keys() the_scores = array(reaction_scores_dict.values()) minimum_p = min(the_scores[the_scores.nonzero()[0]]) maximum_p = max(the_scores[where(the_scores < 1)[0]]) the_scores[where(the_scores < minimum_p)] = minimum_p the_scores[where(the_scores > maximum_p)] = maximum_p the_scores = -norm.ppf(the_scores) # update the dictionary with the new scores reaction_scores_dict = dict(zip(the_reactions, the_scores)) elif hasattr(the_scores[0], '__iter__'): # In the case that the_scores is a list of lists, assume that each list # is the score for each reaction in the_reactions across all reactions. # Then for each metabolite, calculate the invnorm(|Pearson Correlation # Coefficient| for each reaction pair that it links. raise Exception("This isn't implemented yet") # Get the connectivity for each metabolite the_metabolites = set() for x in reaction_scores_dict: the_metabolites.update(x._metabolites) metabolite_scores = {} metabolite_connections = {} # Calculate the score for each metabolite for the_metabolite in the_metabolites: nonspontaneous_connections = [ x for x in the_metabolite._reaction if x.gene_reaction_rule.lower() not in ['s0001', ''] ] tmp_score = 0 number_of_connections = len(nonspontaneous_connections) for the_reaction in nonspontaneous_connections: if the_reaction not in reaction_scores_dict: if not entire_network: number_of_connections -= 1 continue else: tmp_score += reaction_scores_dict[the_reaction] metabolite_scores[the_metabolite] = tmp_score metabolite_connections[the_metabolite] = number_of_connections # NOTE: Doing the corrections based only on the significantly perturbed # scores is probably going to underestimate the significance. 
if background_correction: correction_dict = {} for i in set(metabolite_connections.values()): # if entire_network # add in a section to deal with the situation # where the entire network structure is considered by only have # p-values for a limited subset. # # Basically, what we're doing here is that for each i we select i # scores number_of_randomizations times the_random_indices = randint.rvs(0, len(the_scores), size=(number_of_randomizations, i)) random_score_distribution = array( [sum(the_scores[x]) for x in list(the_random_indices)]) / i**0.5 correction_dict[i] = [ mean(random_score_distribution), std(random_score_distribution, ddof=1) ] for the_metabolite, the_score in iteritems(metabolite_scores): number_of_connections = metabolite_connections[the_metabolite] if number_of_connections > 0: # Correct based on background distribution if background_correction: # if the list of scores is only for significant perturbations # then the background correction shouldn't be applied because # the current sampling method only takes into account # the_scores not the entire network. It'd be more accurate to # assign unscored reactions a default score. the_score = ((the_score / number_of_connections**.5) - correction_dict[number_of_connections][0]) / \ correction_dict[number_of_connections][1] else: the_score = the_score / number_of_connections**.5 # Update the score metabolite_scores[the_metabolite] = the_score return_dictionary = { 'scores': metabolite_scores, 'connections': metabolite_connections } if background_correction: return_dictionary['corrections'] = correction_dict return return_dictionary
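# A small illustration (a sketch, not part of the function above) of why the p-values are
# clamped before the -norm.ppf transform: exact 0 or 1 would map to infinite z-scores and
# poison the aggregate metabolite score.
import numpy as np
from scipy.stats import norm

p = np.array([0.0, 1e-6, 0.5, 0.999999, 1.0])
print(-norm.ppf(p))                      # endpoints give +/- inf
# the function above substitutes the nearest non-degenerate observed scores;
# a fixed epsilon (hypothetical here) achieves the same effect
print(-norm.ppf(np.clip(p, 1e-6, 1 - 1e-6)))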