Exemplo n.º 1
def train_MLP(min_size=5, max_size=100):
    """Build an MLP classifier together with a hyper-parameter grid.

    NOTE(review): this listing has lost lines. The first ``param_grid``
    literal is never closed and the tuples of random sizes in the second
    one are truncated, so the function does not parse as shown.

    Args:
        min_size: lower bound handed to ``sp_randint.rvs`` for each size draw.
        max_size: upper bound handed to ``sp_randint.rvs`` for each size draw.

    Returns:
        The ``(model, param_grid)`` pair.
    """
    model = MLPClassifier(max_iter=100)
    # Single-point grid (one candidate value per parameter).
    param_grid = {
        'alpha': [1],
        'max_iter': [1000],
        'solver': ['adam'],
        'activation': ['relu']
    # Second grid; it rebinds ``param_grid``, so the literal above is unused.
    param_grid = {
        # Remnants of tuples whose entries are drawn once, at call time,
        # with sp_randint.rvs(min_size, max_size).
                         max_size), sp_randint.rvs(min_size, max_size),
                         max_size), sp_randint.rvs(min_size, max_size)),
                         max_size), sp_randint.rvs(min_size, max_size),
          sp_randint.rvs(min_size, max_size)),
                         max_size), sp_randint.rvs(min_size, max_size)),
         (sp_randint.rvs(min_size, max_size), )],
        'activation': ['tanh', 'relu'],
        'solver': ['sgd', 'adam', 'lbfgs'],
        'learning_rate': ['constant', 'adaptive'],
    return model, param_grid
Exemplo n.º 2
def die_roll(loot_count):
    """Roll two dice and subtract the loot carried, never going below zero.

    Each die is drawn with ``randint.rvs(1, 4)``; the upper bound is
    exclusive, so a single die shows 1, 2 or 3.

    Args:
        loot_count: amount subtracted from the summed roll.

    Returns:
        The reduced roll, clamped at zero.
    """
    first_die = randint.rvs(1, 4)
    second_die = randint.rvs(1, 4)

    # carried loot slows the player down, but the result never drops below zero
    return max(first_die + second_die - loot_count, 0)
Exemplo n.º 3
 def draw_generation(self):
     """Pre-generate one dictionary of random draws per run.

     Resets ``self.draws`` and appends ``self.times`` dictionaries to it.
     Each dictionary holds uniform(0, 1) arrays under ``'beta_draw'``
     (500), ``'bernoulli_draw'`` (500 x 1000), ``'uniform_draw'`` (500) and
     ``'host_draw'`` (1000), plus two integers from ``randint.rvs(0, 2000)``
     under ``'size_state'`` and ``'initial_state'``.

     The previous version built each dictionary and then dropped it, so
     ``self.draws`` was always left empty.
     """
     self.draws = []
     for _ in range(self.times):
         # Entries are evaluated top to bottom, so the draw order is unchanged.
         self.draws.append({
             'beta_draw': uniform.rvs(0, 1, 500),
             'bernoulli_draw': uniform.rvs(0, 1, [500, 1000]),
             'uniform_draw': uniform.rvs(0, 1, 500),
             'host_draw': uniform.rvs(0, 1, 1000),
             'size_state': randint.rvs(0, 2000),
             'initial_state': randint.rvs(0, 2000),
         })
Exemplo n.º 4
def r_funct(current_key,str_values):
	"""Cross-validate a gradient boosting classifier with random hyper-parameters.

	Builds a DataFrame from ``str_values``, casts columns according to the
	module-level ``data_type_dict``, drops columns collected in ``remove``,
	then runs a 5-fold cross validation and prints the mean and standard
	deviation of the AUC scores.

	NOTE(review): Python 2 code (``iteritems``, ``print`` statement). The
	listing has lost lines: two ``if`` statements have no body, nothing is
	ever appended to ``remove`` or ``auc_scores_list``, and
	``learning_rate``, ``subsample`` and ``min_weight_fraction_leaf`` are
	not assigned anywhere in this block.

	Args:
		current_key: not used in the visible code.
		str_values: records passed to ``pd.DataFrame.from_records``.
	"""
	df_train = pd.DataFrame.from_records(str_values,columns = df_columns)	
	#0. convert to proper dtypes
	for col,coltype in data_type_dict.iteritems():
		if coltype=='int64':
			df_train[col] = df_train[col].astype(int)
		if coltype=='float64':
			df_train[col] = df_train[col].astype(float)
	#1. remove constant columns
	remove = []
	for col in df_train.columns:
		# a zero standard deviation means every value in the column is equal
		if df_train[col].std() == 0:
	df_train = df_train.drop(remove, axis=1)
	#2. remove duplicated columns
	remove = []
	c = df_train.columns
	# compare every column with each column to its right
	for i in range(len(c)-1):
		v = df_train[c[i]].values
		for j in range(i+1,len(c)):
			if np.array_equal(v,df_train[c[j]].values):
	df_train = df_train.drop(remove, axis=1)

	y_train = df_train['TARGET'].values
	X_train = df_train.drop(['ID','TARGET'], axis=1).values

	# randomly drawn hyper-parameters (upper bounds are exclusive);
	# the classifier built below is gradient boosting
	len_train = len(X_train)
	n_estimators=sp_randint.rvs(100, 5000)
	min_samples_split=sp_randint.rvs(2, 11)
	min_samples_leaf=sp_randint.rvs(1, 11)
	max_depth=sp_randint.rvs(2, 20)

	# 5-fold cross validation on the training data
	clf = GradientBoostingClassifier(learning_rate=learning_rate,n_estimators=n_estimators,subsample=subsample,min_samples_split=min_samples_split,min_samples_leaf=min_samples_leaf,max_depth=max_depth)
	k_fold = cross_validation.KFold(len_train, 5)
	auc_scores_list = []
	for k, (train, test) in enumerate(k_fold):
		clf.fit(X_train[train], y_train[train])
		# score the held-out fold on the predicted probability of class 1
		auc_scr = auc_score(y_train[test], clf.predict_proba(X_train[test])[:,1])
	mean = np.mean(auc_scores_list)
	std = np.std(auc_scores_list)

	print "GBT:learning_rate:%s,n_estimators:%s,subsample:%s,min_samples_split:%s,min_samples_leaf:%s,min_weight_fraction_leaf:%s,max_depth:%s,mean:%s,std:%s" %(learning_rate,n_estimators,subsample,min_samples_split,min_samples_leaf,min_weight_fraction_leaf,max_depth,mean,std)
Exemplo n.º 5
 def __benchmarkfun(self, talent, benchmarksize):
     """Run ``benchmarksize`` silent tests of ``talent`` with random throws.

     Each test receives three independent dice values.

     ``scipy.stats.randint`` excludes its upper bound, so the previous
     ``randint.rvs(1, 20)`` could never produce a 20. The bound is now 21
     so the throws cover 1..20 inclusive.
     NOTE(review): this assumes 1..20 was the intended range - confirm.

     Args:
         talent: forwarded unchanged to ``self.test_silent``.
         benchmarksize: number of tests to run.

     Returns:
         List with the result of ``self.test_silent`` for every throw.
     """
     first = randint.rvs(1, 21, size=benchmarksize)
     second = randint.rvs(1, 21, size=benchmarksize)
     third = randint.rvs(1, 21, size=benchmarksize)
     return [
         self.test_silent(talent, a, b, c)
         for a, b, c in zip(first, second, third)
     ]
Exemplo n.º 6
def supermarket_log(starting_time, finish_time, warehouse,
                    file):  # one day operation
    # NOTE(review): the prose below is the former docstring; its quote
    # delimiters (and apparently the parameter names in front of the four
    # indented descriptions) were lost, so this function does not parse as
    # shown. The code further down has also lost an ``else:``, the closing
    # brackets of both ``aux`` lists and the line that fills ``log``.
    Simulating one day of restock and sells in a supermarket, the events in the supermarket follow an exponential
    distribution with an average time between events of 5 minutes. Each time that an event occur, the next event
    (restock or sell) is chosen with a binomial distribution where a sell has probability 0.65 and a restock 0.35.
    When a client buy a product, that product is selected randomly uniformly, whereas the quantity is chosen
    from a binomial with n=(max quantity of the product chosen) and p=0.15
    We are making one restock at once, each time that a restock is made the product selected is randomly uniformly
    chosen, meanwhile the quantity of the product to restock is chosen from a binomial where
    n=(max quantity allowed in shelves), p=0.65

        supermarket opening time
        supermarket closing time
        class Warehouse where our products catalog is saved, we need this information to know products and their codes
        in our supermarket
        file path in which save our daily log

    log = []
    last_hour = starting_time
    while last_hour < finish_time:  # our loop finish when the last transaction has passed finish_time
        # one Bernoulli trial: truthy with probability 0.65 (sale branch)
        if binom.rvs(1, 0.65):
            # NOTE(review): the bounds (1, 19) hard-code the catalogue size;
            # randint's upper bound is exclusive, so indices 0..17 are drawn.
            product_chosen = list(
                warehouse.products.keys())[randint.rvs(1, 19) - 1]
            # advance the clock by an exponential waiting time (scale 5 minutes)
            last_hour += timedelta(minutes=float(expon.rvs(scale=5, size=1)))
            aux = [
                'venta', last_hour, product_chosen,
                binom.rvs(n=warehouse[product_chosen][0], p=0.15, loc=1)
            # restock branch (its ``else:`` line is missing from the listing)
            last_hour += timedelta(minutes=float(expon.rvs(scale=5, size=1)))
            # NOTE(review): ``amazon`` is not defined in this block (probably
            # meant ``warehouse``), and randint.rvs is given a single bound.
            product_chosen = list(warehouse.products.keys())[randint.rvs(
                len(amazon.products) - 1)]
                'repo', last_hour, product_chosen,
                binom.rvs(n=warehouse[product_chosen][0], p=0.65, loc=1)
    # serialise the log, one space-separated line per event
    with open(file, 'w') as f:
        text = ""
        for el in log:
            text += el[0] + ' ' + format_date(el[1]) + " " + el[2] + " " + str(
                el[3]) + "\n"
Exemplo n.º 7
 def rvs(self):
     """Draw exponentially distributed samples.

     When ``self.size`` is falsy it is first drawn with ``randint`` from
     ``[self.min_size, self.max_size)`` and stored on the instance. The
     samples use ``loc = self.loc * 0.09``. The scale is ``self.scale`` when
     that is truthy and ``self.loc * 8.0`` otherwise.

     The fallback ``return`` used to sit behind the first ``return`` inside
     the same ``if`` body, so it was unreachable and the method returned
     ``None`` whenever ``self.scale`` was falsy.

     Returns:
         Array of ``self.size`` samples from ``expon``.
     """
     if not self.size:
         self.size = randint.rvs(low=self.min_size, high=self.max_size, size=1)
     scale = self.scale if self.scale else self.loc * 8.0
     return expon.rvs(loc=self.loc * 0.09, scale=scale, size=self.size)
def metadata_filename():
    """Write a fake thermostat metadata CSV plus placeholder interval files.

    Creates a temporary directory and writes ``metadata.csv`` into it with
    one row per thermostat. It also writes one placeholder file for every
    name in the ``interval_data_filename`` column.

    NOTE(review): the listing has lost lines. The ``columns`` list and the
    closing bracket of ``zipcodes`` are missing, so the function does not
    parse as shown.

    Returns:
        Path of the written ``metadata.csv``.
    """
    columns = [

    n_thermostats = 100
    thermostat_ids = [uuid4() for i in range(n_thermostats)]
    # upper bound is exclusive: equipment types are drawn from 0..5
    equipment_types = randint.rvs(0, 6, size=n_thermostats)
    # The trailing comment on each row carries a quoted id and, on some
    # rows, a running total of zipcodes listed so far.
    zipcodes = [
        "70754", "70722", "70726", "70449", "70442", # "722312" 50
        "70443", "70441", "70446", "70447", "70444", # "722312"
        "70836", "70778", "70770", "70774", "70777", # "722312"
        "70433", "70437", "70436", "70435", "70438", # "722312"
        "70744", "70748", "70462", "70465", "70466", # "722312"
        "70791", "70714", "70711", "70451", "70450", # "722312"
        "70453", "70455", "70454", "70456", "70809", # "722312"
        "70806", "70807", "70805", "70769", "70761", # "722312"
        "70402", "70403", "70401", "70737", "70730", # "722312"
        "70733", "70739", "70785", "70789", "70706", # "722312"
        "45341", "45344", "45349", "45319", "45434", # "745700" 55
        "60018", "60191", "60193", "60195", "60194", # "725300" 60
        "97473", "97449", "97493", "97467", "97459", # "726917" 65
        "60421", "60544", "60404", "60408", "60481", # "725345" 70
        "36590", "36564", "36606", "36605", "36532", # "722235" 75
        "36541", "36544", "36568", "36608", "36609", # "722230" 80
        "23106", "23060", "23229", "23222", "23294", # "724029" 85
        "13674", "13601", "13606", "13605", "13682", # "726227" 90
        "12978", "12972", "12985", "12903", "12901", # "726225" 95
        "61051", # "725326" 96
        "76207", # "722589" 97
        "36362", # "722239" 98
        "57233", # "726546" 99
        "56289", # "726547" 100
    # every thermostat gets the same UTC offset of -7
    utc_offsets = [-7 for _ in range(n_thermostats)]
    interval_data_filenames = ["thermostat_{}.csv".format(i) for i in thermostat_ids]

    df = pd.DataFrame({
        "thermostat_id": thermostat_ids,
        "equipment_type": equipment_types,
        "zipcode": zipcodes,
        "utc_offset": utc_offsets,
        "interval_data_filename": interval_data_filenames,
    }, columns=columns)

    # mkdtemp does not clean up after itself; removing the directory is left
    # to the caller
    temp_dir = tempfile.mkdtemp()
    metadata_filename = os.path.join(temp_dir, "metadata.csv")
    df.to_csv(metadata_filename, index=False)

    # create a stub file for every interval data file named in the metadata
    for interval_data_filename in df.interval_data_filename:
        fname = os.path.join(temp_dir, interval_data_filename)
        with open(fname, 'w') as f :
            f.write("INTERVAL DATA FILE CONTENT")

    return metadata_filename
Exemplo n.º 9
 def draw(self, K = 10, N = 1*10**5, m = 3, gaussian = False):
     """Draw N samples of dimension K from an m-component mixture.

     Mixture weights come from a Dirichlet distribution, component
     membership from a one-trial multinomial, and the component draws from
     ``np.random.multivariate_normal``.

     NOTE(review): the listing has lost lines. The ``if self.seed`` body and
     the ``else:`` after ``if gaussian:`` are missing, so the function does
     not parse as shown.

     Args:
         K: dimension of each draw.
         N: number of draws.
         m: number of mixture components.
         gaussian: selects how the component covariance is built.

     Returns:
         The ``(N, K)`` array ``self.x_draws``.
     """
     if self.seed is not None:
     alphas = gamma.rvs(5, size=m)               # shape parameter
     #print(sum(alphas))                              # equivalent sample size
     self.p = dirichlet.rvs(alpha = alphas, size = 1)[0]
     self.phi_is = multinomial.rvs(1, self.p, size=N)       # draw from categorical p.m.f
     self.x_draws = np.zeros((N,K))
     self.hyper_loc, self.hyper_scale, self.thetas, self.var, self.covs, self.rdraws = dict(), dict(), dict(), tuple(), tuple(), tuple()
     for i in range(m):
           # hyper-parameters of component i+1: location and (inverse-gamma) scale
           self.hyper_loc["mean"+str(i+1)] = norm.rvs(size = 1, loc = 0, scale = 5)
           self.hyper_scale["scale"+str(i+1)] = 1/gamma.rvs(5, size=1)
           self.thetas["mean"+str(i+1)] = norm.rvs(size = K, loc = self.hyper_loc["mean"+str(i+1)], 
                       scale = self.hyper_scale["scale"+str(i+1)])
           # diagonal matrix with inverse-gamma entries
           self.thetas["Sigma"+str(i+1)] = np.eye(K)*(1/gamma.rvs(5, size=K))
           # integer drawn from K+2 .. K+9 (upper bound exclusive)
           self.thetas["nu"+str(i+1)] = randint.rvs(K+2, K+10, size=1)[0]
           if gaussian:
              self.covs += (self.thetas['Sigma'+str(i+1)], )
              # NOTE(review): the next two lines presumably belonged to a lost ``else:``
              self.covs += (wishart.rvs(df = self.thetas['nu'+str(i+1)], scale = self.thetas['Sigma'+str(i+1)], size=1),)
              self.var += (self.thetas["nu"+str(i+1)]/(self.thetas["nu"+str(i+1)]-2)*self.covs[i],)       # variance covariance matrix of first Student-t component
           self.rdraws += (np.random.multivariate_normal(self.thetas["mean"+str(i+1)], self.covs[i], N),)
           self.Phi = np.tile(self.phi_is[:,i], K).reshape(K,N).T              # repeat phi vector to match with random matrix
           # keep only the rows assigned to component i and add them to the result
           self.x_draws += np.multiply(self.Phi, self.rdraws[i])                
     return self.x_draws
Exemplo n.º 10
def death_drop(inventory):
    """Return a one-hot mask that marks one random inventory slot.

    Args:
        inventory: sized collection; only its length is used.

    Returns:
        Float array of ``len(inventory)`` zeros with a single 1 at a
        uniformly chosen index.
    """
    slot_count = len(inventory)

    # start from an all-zero mask
    inventory_mask = np.zeros(slot_count)

    # pick one slot uniformly at random and flag it
    chosen_slot = randint.rvs(0, slot_count)
    inventory_mask[chosen_slot] = 1

    return inventory_mask
Exemplo n.º 11
def main(N, fl):
    """Draw N random integers and open ``fl`` for writing.

    NOTE(review): the listing is truncated. The loop body is missing and
    ``fd`` is never closed in the visible code.

    Args:
        N: number of integers to draw.
        fl: path of the output file.
    """
    # integers from 2 to 65535 (upper bound exclusive)
    X = randint.rvs(2, 65536, size=N)
    #fl = "SeqS.in"
    fd = open(fl, 'w')
    for x in X:
Exemplo n.º 12
    def draw(self, K=10, N=1 * 10**5, m=3, gaussian=False):
        # NOTE(review): the three lines below are the former docstring; its
        # quote delimiters were lost. Several call arguments and the
        # ``else:`` after ``if gaussian:`` are missing too, so this method
        # does not parse as shown.
        N: sample size
        K: Dimension of Normal/Student distr.
        m: number of mixture components
        self.st0 = np.random.get_state()  # get initial state of RNG
        print("Drawing from", m, "component mixture distribution.")
        alphas = gamma.rvs(5, size=m)  # shape parameter
        #print(sum(alphas))                              # equivalent sample size
        self.p = dirichlet.rvs(alpha=alphas, size=1)[0]
        self.phi_is = multinomial.rvs(1, self.p,
                                      size=N)  # draw from categorical p.m.f

        self.x_draws = np.zeros((N, K))
        self.hyper_loc, self.hyper_scale, self.thetas, self.var, self.covs, self.rdraws = dict(
        ), dict(), dict(), tuple(), tuple(), tuple()

        for i in range(m):

            # hyper-parameters of component i+1
            self.hyper_loc["mean" + str(i + 1)] = norm.rvs(size=1,
            self.hyper_scale["scale" + str(i + 1)] = 1 / gamma.rvs(5, size=1)

            self.thetas["mean" + str(i + 1)] = norm.rvs(
                loc=self.hyper_loc["mean" + str(i + 1)],
                scale=self.hyper_scale["scale" + str(i + 1)])
            # diagonal matrix with inverse-gamma entries
            self.thetas["Sigma" +
                        str(i + 1)] = np.eye(K) * (1 / gamma.rvs(5, size=K))
            self.thetas["nu" + str(i + 1)] = randint.rvs(K + 2, K + 10,

            if gaussian:
                self.covs += (self.thetas['Sigma' + str(i + 1)], )
                # NOTE(review): the statements below presumably belonged to a
                # lost ``else:``
                self.covs += (wishart.rvs(df=self.thetas['nu' + str(i + 1)],
                                          scale=self.thetas['Sigma' +
                                                            str(i + 1)],
                                          size=1), )
                self.var += (
                    self.thetas["nu" + str(i + 1)] /
                    (self.thetas["nu" + str(i + 1)] - 2) * self.covs[i],
                )  # variance covariance matrix of first Student-t component
            self.rdraws += (np.random.multivariate_normal(
                self.thetas["mean" + str(i + 1)], self.covs[i], N), )

            self.Phi = np.tile(self.phi_is[:, i], K).reshape(
                K, N).T  # repeat phi vector to match with random matrix
            # keep only the rows assigned to component i and add them up
            self.x_draws += np.multiply(self.Phi, self.rdraws[i])

        # second element: index of the selected component for every draw
        return self.x_draws, np.argmax(self.phi_is, 1)  # X, latent
Exemplo n.º 13
def compare_models(player_agent_1, player_agent_2, games=1000):
    """Play ``games`` games between two agents and count the wins.

    NOTE(review): the listing has lost lines. The ``take_turn(`` argument
    lists, every ``else:`` and the statement after ``# next turn`` are
    missing, so the function does not parse as shown.

    Args:
        player_agent_1: first agent.
        player_agent_2: second agent.
        games: number of games to play.

    Returns:
        Array of two counters: ``wins[0]`` for agent 1, ``wins[1]`` for
        agent 2.
    """
    # initiate counter to track win totals
    wins = np.zeros(2)

    for i in range(games):
        # initiate a new game state
        game = GameState(6)

        # start turn counter; declare a flag for game end
        turn = 0
        continue_game = True

        # randomly assign player_agent_1 and player_agent_2 to even/odd turns
        # (upper bound is exclusive, so this yields 0 or 1)
        player_agent_1_turns = randint.rvs(0, 2)

        # take turns until the game is over
        while continue_game:
            # use player_agent_1 on half the turns
            if turn % 2 == player_agent_1_turns:
                # take a turn
                continue_game, turn_taken = take_turn(game,

            # use player_agent_2 on the other half
                # take a turn
                continue_game, turn_taken = take_turn(game,

            # skip to next turn if active player is already back at the sub
            if turn_taken == False:
                # update the active player (wrap around after the last one)
                if game.active_player < game.players - 1:
                    game.active_player += 1
                    game.active_player = 0

                # next turn

            # update the active player (wrap around after the last one)
            if game.active_player < game.players - 1:
                game.active_player += 1
                game.active_player = 0

            # increment turn counter
            turn += 1

        # document the winner: the parity of the best-scoring player index
        # tells which agent controlled that player
        if np.argmax(game.player_scores) % 2 == player_agent_1_turns:
            wins[0] += 1
            wins[1] += 1

    # return the win totals
    return wins
Exemplo n.º 14
 def test_gaussiankde_arguments(self):
     """Check that a KDE-based GaussianMultivariate reproduces its input.

     The data are integers 0..8 with a little Gaussian noise. A model
     built with ``GaussianKDE(bw_method=0.01)`` is fitted to them, sampled,
     and compared with the data by a two-sample Kolmogorov-Smirnov test.

     The previous version sampled from the model without ever fitting it.
     """
     size = 1000
     low = 0
     high = 9
     data = randint.rvs(low, high, size=size) + norm.rvs(0, 0.1, size=size)

     dist = GaussianMultivariate(distribution=GaussianKDE(bw_method=0.01))
     # The model has to learn the data before it can be sampled.
     dist.fit(data)

     samples = dist.sample(size).to_numpy()[0]

     # Only the p-value is checked; the KS statistic itself is not needed.
     _, p = ks_2samp(data, samples)
     assert p >= 0.05
Exemplo n.º 15
 def get_seed_value(self, new_seed=False):
     """Return the seed to use.

     A non-zero value in ``self.seed_text_widget`` wins. When the widget
     holds zero, the stored ``self.seed_value`` is returned instead; if
     ``new_seed`` is true, a fresh six-digit seed is drawn and stored first.

     Args:
         new_seed: draw and store a new seed when the widget value is zero.

     Returns:
         The widget value, or ``self.seed_value`` when the widget holds zero.
     """
     widget_value = self.seed_text_widget.value
     if widget_value != 0:
         return widget_value
     if new_seed:
         self.seed_value = randint.rvs(100000, 999999)
     return self.seed_value
    def treatment_effect(self, X=None, y=None, t=None):
        """Store the inputs, run the estimation and return the effect.

        NOTE(review): the ``randint.rvs(`` call is truncated in this
        listing, so the method does not parse as shown.

        Args:
            X: stored as ``self.X_``.
            y: stored as ``self.y_``.
            t: stored as ``self.t_``.

        Returns:
            Whatever ``self._dml_estimation`` returns for the drawn seeds.
        """
        self.X_ = X
        self.y_ = y
        self.t_ = t

        # one integer seed (0..999) per split
        fold_seeds = randint.rvs(0, 1000, size=self.n_splits,
        treatment_effect = self._dml_estimation(fold_seeds)
        # flag that an estimate is now available
        self._is_estimated = True

        return treatment_effect
def pick_one_numbers_uniformly(low, high):
    """Return one integer drawn uniformly between ``low`` and ``high``.

    The docstring had lost its quote delimiters, which left bare text in
    the function body and made it unparsable. It is restored here in
    English.

    NOTE(review): ``scipy.stats.randint`` excludes the upper bound, so
    ``high`` itself is never returned unless ``low == high``. Confirm
    whether callers expect an inclusive upper bound.

    :param int low: smallest value that can be drawn
    :param int high: upper bound of the draw (exclusive)
    :return int item: the drawn value, or ``low`` when ``low == high``
    """
    if low == high:
        # randint rejects an empty range, so the degenerate case is
        # answered directly.
        return low
    return randint.rvs(low, high, size=1)[0]
 def rvs(self):
     """Draw exponentially distributed samples.

     When ``self.size`` is falsy it is first drawn with ``randint`` from
     ``[self.min_size, self.max_size)`` and stored on the instance. The
     samples use ``loc = self.loc * 0.09``. The scale is ``self.scale`` when
     that is truthy and ``self.loc * 8.0`` otherwise.

     This listing had lost the tails of its calls and the ``else:`` between
     the two ``return`` statements. They are restored here.

     Returns:
         Array of ``self.size`` samples from ``expon``.
     """
     if not self.size:
         self.size = randint.rvs(low=self.min_size,
                                 high=self.max_size,
                                 size=1)
     if self.scale:
         return expon.rvs(loc=self.loc * 0.09,
                          scale=self.scale,
                          size=self.size)
     else:
         return expon.rvs(loc=self.loc * 0.09,
                          scale=self.loc * 8.0,
                          size=self.size)
Exemplo n.º 19
    def rvs(self, random_state=42):
        """Draw a value not yet in ``self.seen``, repeated ``self.size`` times.

        NOTE(review): the ``randint.rvs(`` call is truncated in this
        listing, so the method does not parse as shown. The visible code
        never adds the sample to ``self.seen``.

        Args:
            random_state: not used in the visible code.

        Returns:
            Tuple of length ``self.size`` holding the sample, or all zeros
            once the guard below decides the range is exhausted.
        """
        # only keep sampling while the range still has unseen values
        if len(self.seen) < self.high - self.low - 1:
            while True:
                sample = randint.rvs(self.low,

                # retry until a value shows up that was not drawn before
                if sample not in self.seen:
                    return self.size * (sample, )

        return self.size * (0, )
Exemplo n.º 20
    def wild_bootstrap(self, beta_null, var):
        """Run one wild-bootstrap draw and return cluster-robust std errors.

        The null ``beta_null`` is imposed on ``var`` and the remaining
        coefficients are re-estimated. The residual signs are then flipped
        at random to build a bootstrap outcome, the full model is re-fitted
        on it, and a clustered sandwich variance is computed.

        NOTE(review): the final ``V = ...`` expression is truncated in this
        listing, so the method does not parse as shown.

        Args:
            beta_null: coefficient value imposed on ``var`` under the null.
            var: column selector for the restricted regressor.

        Returns:
            Square roots of the diagonal of the variance matrix ``V``.
        """
        # beta_null is the null hypothesis for var
        X1 = self.df[var].to_numpy()
        # get a list of all variables that are not rel time 1 dummy
        Xvars_no1 = [v for v in self.xvars if v != var[0]]
        # perform a regression without rel time 1 dummy
        Y1 = self.Y - beta_null * X1
        Xno1 = self.df[Xvars_no1].to_numpy()
        # least squares through the normal equations
        beta1 = np.linalg.solve(Xno1.T.dot(Xno1), Xno1.T.dot(Y1))
        # use the beta to construct the Us
        U = Y1 - np.dot(Xno1, beta1)
        # bootstrap the Us: multiply each residual by a random -1 or +1
        rand_sign = 2 * randint.rvs(0, 2, size=self.N).reshape(self.N, 1) - 1
        newU = np.multiply(U, rand_sign)
        # construct the wild Y
        Ywild = np.dot(Xno1, beta1) + X1 * beta_null + newU
        # get the new beta from the wild Y
        beta_wild = np.dot(self.XpXi, np.dot(self.X.T, Ywild))

        error = Ywild - np.dot(self.X, beta_wild)
        # and now the clustered-robust std error with similar procedure
        # as above
        clustervars = ['index']
        # copy.copy is a shallow copy
        newdf = copy.copy(self.df)
        # NOTE(review): predictedX is not used by the visible code below
        predictedX = np.sum(np.multiply(self.theta.T, self.X), 1).to_frame()
        predictedX = predictedX.rename(columns={0: "Yhat"})
        # add the residual to the dataframe
        # (df_withresid is the same object as newdf, so 'e' shows up in both)
        df_withresid = newdf
        newdf['e'] = error

        # .assign(e = lambda x: x[self.yvar] - \
        # 	predictedX.Yhat)
        df_withresid[clustervars] = self.df[clustervars]
        # group by the cluster
        groups = df_withresid.groupby(clustervars)
        G = len(groups)
        robust_sum = 0
        # cycle through each cluster and create cluster-specific "meat"
        for key, item in groups:
            Xgroup = item[self.xvars].to_numpy()
            egroup = item['e'].to_numpy()
            egroup = egroup.reshape(len(egroup), 1)
            # X_g' e_g e_g' X_g
            cluster_sum = np.matmul(np.matmul(np.matmul(Xgroup.T, egroup), \
             egroup.T), Xgroup)
            robust_sum += cluster_sum
        # correct for degrees of freedom
        deg_freedom = (G / (G - 1)) * ((self.N - 1) / (self.N - self.k))
        # sandwich together with the bread defined in the class initialization
        V = deg_freedom * np.matmul(np.matmul(self.XpXi.T, robust_sum), \
        return np.sqrt(np.diag(V))
Exemplo n.º 21
    def singlevisualize(self, result):
        """Plot the summary statistics of every parameter sweep in ``result``.

        For each key the opinion, host and conversion series are drawn
        against ``self.keywords[key]`` on the ``avo`` and ``con`` axes, and
        the axis labels are set on ``avo``, ``fr`` and ``con``.

        NOTE(review): ``avo``, ``con`` and ``fr`` are not created in this
        block, and the per-run ``xaxis``/``yaxis`` values are computed but
        never plotted. Lines appear to be missing from this listing.

        Args:
            result: dictionary mapping a sweep key to a list of run dicts.

        Returns:
            ``result`` unchanged.
        """
        pylab.rcParams['figure.figsize'] = (30.0, 20.0)
        for key, runs in result.items():
            # opinion and host series share the same axes
            avo.plot(self.keywords[key], [run['AverageOpinion'] for run in runs],
                     label='AverageOpinion', marker='o')
            avo.plot(self.keywords[key], [run['parameter']['bm'] for run in runs],
                     label='host-m', marker='1')
            avo.plot(self.keywords[key], [run['parameter']['bl'] for run in runs],
                     label='host-l', marker='2')
            avo.plot(self.keywords[key], [run['AverageHost'] for run in runs],
                     label='host-ave', marker='3')

            # conversion ratios in both directions
            con.plot(self.keywords[key], [run['ConversionRatio'][0] for run in runs],
                     label='PosToNeg', marker='2')
            con.plot(self.keywords[key], [run['ConversionRatio'][1] for run in runs],
                     label='NegToPos', marker='1')

            for run in runs:
                counts, edges = run['FinalRatio'][0], run['FinalRatio'][1]
                # midpoints of the 20 histogram bins
                xaxis = [(edges[b] + edges[b + 1]) / 2 for b in range(0, 20)]
                # counts scaled by 500 * number of runs
                yaxis = [count / (500 * self.times) for count in counts]

            avo.set_ylabel('Average Opinion')
            fr.set_xlabel('Opinion Value')
            fr.set_ylabel('Final Ratio')
            con.set_ylabel('Conversion Ratio')

        return result  # result is a dictionary
Exemplo n.º 22
 def flow(self, member, queue_node):
     """Move ``member`` from ``queue_node`` to a queue on the next level.

     Generator: it yields the process created for the member's request at
     the chosen queue.

     NOTE(review): the body of ``if member.level == 8:`` is missing from
     this listing, so the method does not parse as shown.

     Args:
         member: the member being routed; ``member.level`` is read.
         queue_node: the node the member is leaving.
     """
     prob_staying_general = 1 - Network.probability_of_leaving
     prob_stay_level = (1 - self.proportion_leave[member.level])*prob_staying_general   # Added prob of leaving per level
     # Bernoulli draw: 1 with probability prob_stay_level
     stay = bernoulli.rvs(prob_stay_level)
     if member.level == 8:   # There's also probability they will leave
     # NOTE(review): routing happens when ``stay`` is falsy - confirm the
     # intended polarity against the lost branch above.
     elif not stay:
         next_level = member.level + 1
         # Determine the next node to visit
         queue_choice = None
         min_cost = 0
         wait_cost = 0
         service_cost = 0
         if self.asn_policy == 'Deterministic Wait':
             # pick the outgoing edge with the lowest wait cost
             edges = queue_node.outgoing_edges
             min_cost = edges[0].get_wait_cost(member) + 1   # At the least, queue 0 has lower cost
             for q in range(0, len(edges)):
                 if min_cost > edges[q].get_wait_cost(member):
                     min_cost = edges[q].get_wait_cost(member)
                     queue_choice = edges[q].exit
                     wait_cost = min_cost
                     service_cost = edges[q].get_service_cost(member)
         if self.asn_policy == 'Deterministic Service':
             # pick the outgoing edge with the lowest service cost
             edges = queue_node.outgoing_edges
             min_cost = edges[0].get_service_cost(member) + 1   # At the least, queue 0 has lower cost
             for q in range(0, len(edges)):
                 if min_cost > edges[q].get_service_cost(member):
                     min_cost = edges[q].get_service_cost(member)
                     queue_choice = edges[q].exit
                     service_cost = min_cost
                     wait_cost = edges[q].get_wait_cost(member)
         if self.asn_policy == 'Uniform':
             choice_total = len(self.network[next_level])
             # NOTE(review): randint.rvs(1, choice_total) draws from
             # 1..choice_total-1, so index 0 is never selected, and the call
             # fails when the level holds a single node - confirm intent.
             queue_index = randint.rvs(1, choice_total, size=1)[0]
             queue_choice = self.network[next_level][queue_index]
         yield self.env.process(member.request(queue_choice, queue_node, wait_cost, service_cost, self.env))
         # if we are still in the network
         # NOTE(review): this call only creates a generator object; it is
         # neither yielded nor handed to self.env.process here.
         self.flow(member, queue_choice)
Exemplo n.º 23
    def push(self, member):
        """Admit ``member`` at its arrival level and route it to a first queue.

        Generator: it yields the process created for the member's request
        and then starts ``self.flow`` for the chosen queue.

        NOTE(review): the bodies of ``if member.level == 8:`` and
        ``if not stay:`` are missing from this listing, so the method does
        not parse as shown.

        Args:
            member: the arriving member; ``member.level`` is read.
        """
        arriving_level = member.level
        # For now, we choose any node in level N+1
        if member.level == 8:
        p = 1 - self.proportion_leave[arriving_level]   # Probability they are assessed and leave
        # Bernoulli draw: 1 with probability p
        stay = bernoulli.rvs(p)
        if not stay:
        arrival_node = self.network[arriving_level][0]  # Arriving node
        queue_choice = None
        wait_cost = 0
        service_cost = 0
        min_cost = 0
        if self.asn_policy == 'Deterministic Wait':
            # pick the edge with the lowest wait cost
            edges = arrival_node.edges
            min_cost = edges[0].get_wait_cost(member) + 1   # At the least, queue 0 has lower cost
            for q in range(0, len(edges)):
                if min_cost > edges[q].get_wait_cost(member):
                    min_cost = edges[q].get_wait_cost(member)
                    wait_cost = min_cost
                    service_cost = edges[q].get_service_cost(member)
                    queue_choice = edges[q].exit
        if self.asn_policy == 'Deterministic Service':
            # pick the edge with the lowest service cost
            edges = arrival_node.edges
            min_cost = edges[0].get_service_cost(member) + 1   # At the least, queue 0 has lower cost
            for q in range(0, len(edges)):
                if min_cost > edges[q].get_service_cost(member):
                    min_cost = edges[q].get_service_cost(member)
                    queue_choice = edges[q].exit
                    wait_cost = edges[q].get_wait_cost(member)
                    service_cost = min_cost
        if self.asn_policy == 'Uniform':
            choice_total = len(self.network[arriving_level+1])
            # NOTE(review): randint.rvs(1, choice_total) draws from
            # 1..choice_total-1, so index 0 is never selected, and the call
            # fails when the level holds a single node - confirm intent.
            queue_index = randint.rvs(1, choice_total, size=1)[0]
            queue_choice = self.network[arriving_level+1][queue_index]

        yield self.env.process(member.request(queue_choice, arrival_node, wait_cost, service_cost, self.env))
        # continue routing from the chosen queue as a separate process
        self.env.process(self.flow(member, queue_choice))
Exemplo n.º 24
 def tree_sim(self, cur_state, action):
     """Simulate one step of the tree for the given state and action.

     The ``else`` branches had been lost. Cutting a dead tree was
     overwritten by the "stay dead" outcome, and any living tree raised
     ``UnboundLocalError`` because ``next_state`` was never assigned. The
     four cases are restored:

     * dead tree, cut: replant and pay the replanting cost;
     * dead tree, no cut: stay dead with no reward;
     * living tree, cut: sell the wood (``linear_wood_value * cur_state``)
       minus the replanting cost, then restart at sapling height;
     * living tree, no cut: pay maintenance; the tree dies with probability
       ``self.proba_of_dying``, otherwise its new height is drawn uniformly
       from ``cur_state`` to ``self.max_height`` inclusive.

     Args:
         cur_state: current height, or ``self.death`` (compared by identity).
         action: the chosen action; ``self.cut`` is compared by identity.

     Returns:
         ``(next_state, reward)``.
     """
     cutting = action is self.cut

     if cur_state is self.death:
         if cutting:
             return self.sappling_height, -self.replanting_cost
         return self.death, 0

     if cutting:
         reward = self.linear_wood_value * cur_state - self.replanting_cost
         return self.sappling_height, reward

     # Maintenance is paid whether or not the tree survives this step.
     if bernoulli.rvs(self.proba_of_dying):
         return self.death, -self.maintenance_cost
     # randint excludes the upper bound, hence max_height + 1.
     next_state = randint.rvs(cur_state, self.max_height + 1)
     return next_state, -self.maintenance_cost
Exemplo n.º 25
def compare_models(model_set_1, model_set_2, games=1000, noise=0.1):
    """Play ``games`` games between two model sets and count the wins.

    Two defects of the previous version are fixed:

    * ``randint.rvs(0, 1)`` always returned 0 because the upper bound is
      exclusive, so model set 1 always got the even turns. It is now
      ``randint.rvs(0, 2)``.
    * The ``else`` branches were missing, so both model sets moved on
      model set 1's turns and both counters were incremented together.

    Args:
        model_set_1: iterable of models unpacked into ``take_turn``.
        model_set_2: iterable of models unpacked into ``take_turn``.
        games: number of games to play.
        noise: forwarded to ``take_turn`` as its last argument.

    Returns:
        Array of two counters: ``wins[0]`` for model set 1, ``wins[1]`` for
        model set 2.
    """
    # win totals, indexed by model set
    wins = np.zeros(2)

    for _ in range(games):
        # every game starts from a fresh six-player state
        game = GameState(6)
        turn = 0
        continue_game = True

        # randomly assign model_set_1 to the even (0) or odd (1) turns
        model_set_1_turns = randint.rvs(0, 2)

        # take turns until the game is over
        while continue_game:
            if turn % 2 == model_set_1_turns:
                active_models = model_set_1
            else:
                active_models = model_set_2

            # only the "game continues" flag of the result is needed here
            continue_game, *_ = take_turn(game, *active_models, noise)
            turn += 1

        # the parity of the best-scoring player index identifies the winner
        if np.argmax(game.player_scores) % 2 == model_set_1_turns:
            wins[0] += 1
        else:
            wins[1] += 1

    return wins
Exemplo n.º 26
    def drop_decision(self, gamestate):
        """Decide which inventory token to drop, if any.

        With probability ``self.epsilon`` the choice is random; the other
        path takes the arg-max of a model prediction.

        NOTE(review): the ``else:`` lines and the tail of the ``np.argmax(``
        expression are missing from this listing, so the method does not
        parse as shown.

        Args:
            gamestate: array; entries 1..32 equal to -1 are treated as
                empty inventory slots.

        Returns:
            Integer decision (0 means no drop).
        """
        # decide if the model output or a random guess will be used
        if uniform.rvs(0, 1) <= self.epsilon:
            # no drop if inventory is empty
            if sum(gamestate[1:33] != -1) == 0:
                drop = 0
                # randomly decide whether to drop a token from those available
                # (0 .. number of held tokens, upper bound exclusive)
                drop = randint.rvs(0, sum(gamestate[1:33] != -1) + 1)
            # generate a Q-table for the current gamestate
            # NOTE(review): this queries ``self.pick_up_model`` inside a drop
            # decision - confirm that is the intended model.
            selected_action = self.pick_up_model.predict(
                np.reshape(gamestate, (1, gamestate.shape[0])))

            # take the action with the highest Q-value
            drop = np.argmax(selected_action[0:(sum(gamestate[1:33] != -1) +
            drop = int(drop)

        # return the decision as an integer
        # 1-33 mean drop the corresponding item
        # 0 means no drop
        return drop

# Randomised hyper-parameter search over a preprocessing + classifier pipeline.
# NOTE(review): the closing brackets of ``preprocessor``, ``pipeline`` and
# ``param_distributions`` are missing from this listing, so the script does
# not parse as shown.
preprocessor = ColumnTransformer([
    ('numeric_transformer', numeric_transformer, numeric_features),
    ('categorical_transformer', categorical_transformer, categorical_features)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', classifier)

# Every entry is an array of ``x_rscv_n_iter`` values drawn once here with
# ``.rvs``, so the search picks from these fixed candidates instead of
# sampling from a distribution object.
# NOTE(review): scipy's ``uniform.rvs(a, b)`` takes ``loc`` and ``scale``, so
# it draws from [a, a + b] - e.g. ``uniform.rvs(0.1, 0.9)`` covers 0.1..1.0.
# ``randint`` excludes its upper bound.
param_distributions = {
                        "classifier__learning_rate": uniform.rvs(0.0001, 0.1, size=x_rscv_n_iter),
                        "classifier__gamma" : uniform.rvs(0, 2, size=x_rscv_n_iter),
                        "classifier__max_depth": randint.rvs(2, 100, size=x_rscv_n_iter),
                        "classifier__colsample_bytree": uniform.rvs(0.1, 0.9, size=x_rscv_n_iter),
                        "classifier__subsample": uniform.rvs(0.1, 0.9, size=x_rscv_n_iter),
                        "classifier__reg_alpha": uniform.rvs(0, 0.9, size=x_rscv_n_iter),
                        "classifier__reg_lambda": uniform.rvs(0.0001, 5, size=x_rscv_n_iter),
                        "classifier__min_child_weight": randint.rvs(1, 7, size=x_rscv_n_iter),
                        "classifier__n_estimators": randint.rvs(100, 1000, size=x_rscv_n_iter)

# Two scorers are tracked; the best model is refitted according to 'f1_score'.
search = RandomizedSearchCV(
    pipeline, param_distributions=param_distributions, n_iter=x_rscv_n_iter, scoring={'recall_score': recall_scorer, 'f1_score': f1_scorer}, 
    n_jobs=-1, cv=x_rscv, random_state=x_random_state, refit='f1_score', return_train_score=True)

search = search.fit(X, y)

# progress message (step 3 of 4)
print(datetime.now()," [3/4] Algorithmus hat zu Ende berechnet")
Exemplo n.º 28
def get_fake_output_df(n_columns):
    """Build a DataFrame of fake output data.

    Every list-valued column holds ``n_columns`` entries, so despite its
    name the argument sets the length of each column. Most entries carry a
    nominal value; an entry is replaced by ``None`` or ``np.inf`` whenever
    ``randint.rvs(0, 30)`` returns 0.

    Returns ``pd.DataFrame(data, columns=columns)``.
    """
    # NOTE(review): the contents and the closing bracket of this list are
    # missing in the source as given (only blank lines remain) -- the snippet
    # appears truncated; confirm against the original file.
    columns = [




























    string_placeholder = ["PLACEHOLDER"] * n_columns
    # The iterables below only set the number of entries; for zero_column and
    # one_column the loop variable is unused.
    zero_column = [0 if randint.rvs(0, 30) > 0 else  (None if randint.rvs(0, 2) > 0 else np.inf)
            for i in randint.rvs(0, 1, size=n_columns)]
    one_column = [1 if randint.rvs(0, 30) > 0 else  (None if randint.rvs(0, 2) > 0 else np.inf)
            for i in randint.rvs(0, 1, size=n_columns)]
    # Draws from norm.rvs, with the same None / np.inf substitution.
    float_column = [i if randint.rvs(0, 30) > 0 else (None if randint.rvs(0, 2) > 0 else np.inf)
            for i in norm.rvs(size=n_columns)]
    # Cycle through the fixed values until n_columns entries are produced.
    zipcodes = ["01234", "12345", "23456", "34567", "43210", "54321", "65432", "76543"]
    zipcode_column = [i for i in islice(cycle(zipcodes), None, n_columns)]
    core_day_set_names = ["cooling_2012", "heating_2012-2013", "cooling_2013"]
    core_day_set_name_column = [i for i in islice(cycle(core_day_set_names), None, n_columns)]

    # Column name -> values. The same list objects (string_placeholder,
    # one_column, float_column, ...) are shared by many keys.
    # NOTE(review): the closing "}" of this dict is missing in the source as
    # given; confirm against the original file.
    data = {
        'sw_version': string_placeholder,

        'ct_identifier': string_placeholder,
        'equipment_type': string_placeholder,
        'heating_or_cooling': core_day_set_name_column,
        'station': string_placeholder,
        'zipcode': zipcode_column,
        'climate_zone': string_placeholder,

        'start_date': datetime(2011, 1, 1),
        'end_date': datetime(2012, 1, 1),
        'n_days_both_heating_and_cooling': one_column,
        'n_days_in_inputfile_date_range': one_column,
        'n_days_insufficient_data': zero_column,
        'n_core_heating_days': one_column,

        'baseline_percentile_core_cooling_comfort_temperature': float_column,
        'baseline_percentile_core_heating_comfort_temperature': float_column,
        'regional_average_baseline_cooling_comfort_temperature': float_column,
        'regional_average_baseline_heating_comfort_temperature': float_column,

        'percent_savings_baseline_percentile': float_column,
        'avoided_daily_mean_core_day_runtime_baseline_percentile': float_column,
        'avoided_total_core_day_runtime_baseline_percentile': float_column,
        'baseline_daily_mean_core_day_runtime_baseline_percentile': float_column,
        'baseline_total_core_day_runtime_baseline_percentile': float_column,
        '_daily_mean_core_day_demand_baseline_baseline_percentile': float_column,
        'percent_savings_baseline_regional': float_column,
        'avoided_daily_mean_core_day_runtime_baseline_regional': float_column,
        'avoided_total_core_day_runtime_baseline_regional': float_column,
        'baseline_daily_mean_core_day_runtime_baseline_regional': float_column,
        'baseline_total_core_day_runtime_baseline_regional': float_column,
        '_daily_mean_core_day_demand_baseline_baseline_regional': float_column,
        'mean_demand': float_column,
        'alpha': float_column,
        'tau': float_column,
        'mean_sq_err': float_column,
        'root_mean_sq_err': float_column,
        'cv_root_mean_sq_err': float_column,
        'mean_abs_err': float_column,
        'mean_abs_pct_err': float_column,

        'total_core_cooling_runtime': float_column,
        'total_core_heating_runtime': float_column,
        'total_auxiliary_heating_core_day_runtime': float_column,
        'total_emergency_heating_core_day_runtime': float_column,

        'daily_mean_core_cooling_runtime': float_column,
        'daily_mean_core_heating_runtime': float_column,

        'core_cooling_days_mean_indoor_temperature': float_column,
        'core_cooling_days_mean_outdoor_temperature': float_column,
        'core_heating_days_mean_indoor_temperature': float_column,
        'core_heating_days_mean_outdoor_temperature': float_column,
        'core_mean_indoor_temperature': float_column,
        'core_mean_outdoor_temperature': float_column,

        'rhu1_aux_duty_cycle': float_column,
        'rhu1_emg_duty_cycle': float_column,
        'rhu1_compressor_duty_cycle': float_column,

        'rhu1_00F_to_05F': float_column,
        'rhu1_05F_to_10F': float_column,
        'rhu1_10F_to_15F': float_column,
        'rhu1_15F_to_20F': float_column,
        'rhu1_20F_to_25F': float_column,
        'rhu1_25F_to_30F': float_column,
        'rhu1_30F_to_35F': float_column,
        'rhu1_35F_to_40F': float_column,
        'rhu1_40F_to_45F': float_column,
        'rhu1_45F_to_50F': float_column,
        'rhu1_50F_to_55F': float_column,
        'rhu1_55F_to_60F': float_column,

        'rhu1_less10F': float_column,
        'rhu1_10F_to_20F': float_column,
        'rhu1_20F_to_30F': float_column,
        'rhu1_30F_to_40F': float_column,
        'rhu1_40F_to_50F': float_column,
        'rhu1_50F_to_60F': float_column,

        'rhu1_00F_to_05F_aux_duty_cycle': float_column,
        'rhu1_05F_to_10F_aux_duty_cycle': float_column,
        'rhu1_10F_to_15F_aux_duty_cycle': float_column,
        'rhu1_15F_to_20F_aux_duty_cycle': float_column,
        'rhu1_20F_to_25F_aux_duty_cycle': float_column,
        'rhu1_25F_to_30F_aux_duty_cycle': float_column,
        'rhu1_30F_to_35F_aux_duty_cycle': float_column,
        'rhu1_35F_to_40F_aux_duty_cycle': float_column,
        'rhu1_40F_to_45F_aux_duty_cycle': float_column,
        'rhu1_45F_to_50F_aux_duty_cycle': float_column,
        'rhu1_50F_to_55F_aux_duty_cycle': float_column,
        'rhu1_55F_to_60F_aux_duty_cycle': float_column,

        'rhu1_less10F_aux_duty_cycle': float_column,
        'rhu1_10F_to_20F_aux_duty_cycle': float_column,
        'rhu1_20F_to_30F_aux_duty_cycle': float_column,
        'rhu1_30F_to_40F_aux_duty_cycle': float_column,
        'rhu1_40F_to_50F_aux_duty_cycle': float_column,
        'rhu1_50F_to_60F_aux_duty_cycle': float_column,

        'rhu1_00F_to_05F_emg_duty_cycle': float_column,
        'rhu1_05F_to_10F_emg_duty_cycle': float_column,
        'rhu1_10F_to_15F_emg_duty_cycle': float_column,
        'rhu1_15F_to_20F_emg_duty_cycle': float_column,
        'rhu1_20F_to_25F_emg_duty_cycle': float_column,
        'rhu1_25F_to_30F_emg_duty_cycle': float_column,
        'rhu1_30F_to_35F_emg_duty_cycle': float_column,
        'rhu1_35F_to_40F_emg_duty_cycle': float_column,
        'rhu1_40F_to_45F_emg_duty_cycle': float_column,
        'rhu1_45F_to_50F_emg_duty_cycle': float_column,
        'rhu1_50F_to_55F_emg_duty_cycle': float_column,
        'rhu1_55F_to_60F_emg_duty_cycle': float_column,

        'rhu1_less10F_emg_duty_cycle': float_column,
        'rhu1_10F_to_20F_emg_duty_cycle': float_column,
        'rhu1_20F_to_30F_emg_duty_cycle': float_column,
        'rhu1_30F_to_40F_emg_duty_cycle': float_column,
        'rhu1_40F_to_50F_emg_duty_cycle': float_column,
        'rhu1_50F_to_60F_emg_duty_cycle': float_column,

        'rhu1_00F_to_05F_compressor_duty_cycle': float_column,
        'rhu1_05F_to_10F_compressor_duty_cycle': float_column,
        'rhu1_10F_to_15F_compressor_duty_cycle': float_column,
        'rhu1_15F_to_20F_compressor_duty_cycle': float_column,
        'rhu1_20F_to_25F_compressor_duty_cycle': float_column,
        'rhu1_25F_to_30F_compressor_duty_cycle': float_column,
        'rhu1_30F_to_35F_compressor_duty_cycle': float_column,
        'rhu1_35F_to_40F_compressor_duty_cycle': float_column,
        'rhu1_40F_to_45F_compressor_duty_cycle': float_column,
        'rhu1_45F_to_50F_compressor_duty_cycle': float_column,
        'rhu1_50F_to_55F_compressor_duty_cycle': float_column,
        'rhu1_55F_to_60F_compressor_duty_cycle': float_column,

        'rhu1_less10F_compressor_duty_cycle': float_column,
        'rhu1_10F_to_20F_compressor_duty_cycle': float_column,
        'rhu1_20F_to_30F_compressor_duty_cycle': float_column,
        'rhu1_30F_to_40F_compressor_duty_cycle': float_column,
        'rhu1_40F_to_50F_compressor_duty_cycle': float_column,
        'rhu1_50F_to_60F_compressor_duty_cycle': float_column,

        'rhu2_aux_duty_cycle': float_column,
        'rhu2_emg_duty_cycle': float_column,
        'rhu2_compressor_duty_cycle': float_column,

        'rhu2_00F_to_05F': float_column,
        'rhu2_05F_to_10F': float_column,
        'rhu2_10F_to_15F': float_column,
        'rhu2_15F_to_20F': float_column,
        'rhu2_20F_to_25F': float_column,
        'rhu2_25F_to_30F': float_column,
        'rhu2_30F_to_35F': float_column,
        'rhu2_35F_to_40F': float_column,
        'rhu2_40F_to_45F': float_column,
        'rhu2_45F_to_50F': float_column,
        'rhu2_50F_to_55F': float_column,
        'rhu2_55F_to_60F': float_column,

        'rhu2_less10F': float_column,
        'rhu2_10F_to_20F': float_column,
        'rhu2_20F_to_30F': float_column,
        'rhu2_30F_to_40F': float_column,
        'rhu2_40F_to_50F': float_column,
        'rhu2_50F_to_60F': float_column,

        'rhu2_00F_to_05F_aux_duty_cycle': float_column,
        'rhu2_05F_to_10F_aux_duty_cycle': float_column,
        'rhu2_10F_to_15F_aux_duty_cycle': float_column,
        'rhu2_15F_to_20F_aux_duty_cycle': float_column,
        'rhu2_20F_to_25F_aux_duty_cycle': float_column,
        'rhu2_25F_to_30F_aux_duty_cycle': float_column,
        'rhu2_30F_to_35F_aux_duty_cycle': float_column,
        'rhu2_35F_to_40F_aux_duty_cycle': float_column,
        'rhu2_40F_to_45F_aux_duty_cycle': float_column,
        'rhu2_45F_to_50F_aux_duty_cycle': float_column,
        'rhu2_50F_to_55F_aux_duty_cycle': float_column,
        'rhu2_55F_to_60F_aux_duty_cycle': float_column,

        'rhu2_less10F_aux_duty_cycle': float_column,
        'rhu2_10F_to_20F_aux_duty_cycle': float_column,
        'rhu2_20F_to_30F_aux_duty_cycle': float_column,
        'rhu2_30F_to_40F_aux_duty_cycle': float_column,
        'rhu2_40F_to_50F_aux_duty_cycle': float_column,
        'rhu2_50F_to_60F_aux_duty_cycle': float_column,

        'rhu2_00F_to_05F_emg_duty_cycle': float_column,
        'rhu2_05F_to_10F_emg_duty_cycle': float_column,
        'rhu2_10F_to_15F_emg_duty_cycle': float_column,
        'rhu2_15F_to_20F_emg_duty_cycle': float_column,
        'rhu2_20F_to_25F_emg_duty_cycle': float_column,
        'rhu2_25F_to_30F_emg_duty_cycle': float_column,
        'rhu2_30F_to_35F_emg_duty_cycle': float_column,
        'rhu2_35F_to_40F_emg_duty_cycle': float_column,
        'rhu2_40F_to_45F_emg_duty_cycle': float_column,
        'rhu2_45F_to_50F_emg_duty_cycle': float_column,
        'rhu2_50F_to_55F_emg_duty_cycle': float_column,
        'rhu2_55F_to_60F_emg_duty_cycle': float_column,

        'rhu2_less10F_emg_duty_cycle': float_column,
        'rhu2_10F_to_20F_emg_duty_cycle': float_column,
        'rhu2_20F_to_30F_emg_duty_cycle': float_column,
        'rhu2_30F_to_40F_emg_duty_cycle': float_column,
        'rhu2_40F_to_50F_emg_duty_cycle': float_column,
        'rhu2_50F_to_60F_emg_duty_cycle': float_column,

        'rhu2_00F_to_05F_compressor_duty_cycle': float_column,
        'rhu2_05F_to_10F_compressor_duty_cycle': float_column,
        'rhu2_10F_to_15F_compressor_duty_cycle': float_column,
        'rhu2_15F_to_20F_compressor_duty_cycle': float_column,
        'rhu2_20F_to_25F_compressor_duty_cycle': float_column,
        'rhu2_25F_to_30F_compressor_duty_cycle': float_column,
        'rhu2_30F_to_35F_compressor_duty_cycle': float_column,
        'rhu2_35F_to_40F_compressor_duty_cycle': float_column,
        'rhu2_40F_to_45F_compressor_duty_cycle': float_column,
        'rhu2_45F_to_50F_compressor_duty_cycle': float_column,
        'rhu2_50F_to_55F_compressor_duty_cycle': float_column,
        'rhu2_55F_to_60F_compressor_duty_cycle': float_column,

        'rhu2_less10F_compressor_duty_cycle': float_column,
        'rhu2_10F_to_20F_compressor_duty_cycle': float_column,
        'rhu2_20F_to_30F_compressor_duty_cycle': float_column,
        'rhu2_30F_to_40F_compressor_duty_cycle': float_column,
        'rhu2_40F_to_50F_compressor_duty_cycle': float_column,
        'rhu2_50F_to_60F_compressor_duty_cycle': float_column,
    df = pd.DataFrame(data, columns=columns)
    return df
Exemplo n.º 29
 def _fit(self, X, y):
     """Estimate the majority class from a random sample of the labels.

     Ten positions are drawn with replacement from ``y`` and the label that
     occurs most often among them is stored in ``self.majority_``.

     Parameters
     ----------
     X : ignored by this method.
     y : sequence of non-negative integer class labels (required by
         ``np.bincount``); must not be empty.
     """
     from scipy.stats import randint
     labels = np.asarray(y)
     # Positions of the sampled observations, drawn from [0, len(y)).
     randidx = randint.rvs(0, len(labels), size=10)
     # Count the sampled *labels*. Counting the sampled positions, as was
     # done before, makes ``majority_`` an arbitrary row index instead of a
     # class label.
     counts = np.bincount(labels[randidx])
     self.majority_ = np.argmax(counts)
Exemplo n.º 30
    def test_nchypergeom_wallenius_naive(self):
        """Check nchypergeom_wallenius against simple reference formulas.

        Random parameter arrays of shape (2, 4, 3) are drawn, then the mean,
        the variance and the pmf reported by the distribution are compared
        with the naive helper functions defined inside this test.
        """
        # test against a very simple implementation

        # Random parameters: m1 and m2 in [1, max_m), N = m1 + m2, n in
        # [0, N), and x drawn between the support bounds xl and xu.
        shape = (2, 4, 3)
        max_m = 100
        m1 = np.random.randint(1, max_m, size=shape)
        m2 = np.random.randint(1, max_m, size=shape)
        N = m1 + m2
        n = randint.rvs(0, N, size=N.shape)
        xl = np.maximum(0, n - m2)
        xu = np.minimum(n, m1)
        x = randint.rvs(xl, xu, size=xl.shape)
        w = np.random.rand(*x.shape) * 2

        def support(N, m1, n, w):
            # Lower and upper bound of the support; w is not used here.
            m2 = N - m1
            xl = np.maximum(0, n - m2)
            xu = np.minimum(n, m1)
            return xl, xu

        def mean(N, m1, n, w):
            # Reference mean: root of fun(u) bracketed by the support bounds.
            # NOTE(review): root_scalar is handed the bracket as-is while the
            # call sites pass arrays -- presumably a vectorizing decorator
            # was lost from this snippet; confirm against the original.
            m2 = N - m1
            xl, xu = support(N, m1, n, w)

            def fun(u):
                return u / m1 + (1 - (n - u) / m2)**w - 1

            return root_scalar(fun, bracket=(xl, xu)).root

        # NOTE(review): this call is not closed in the source as given (any
        # remaining arguments and the ")" are missing); confirm against the
        # original file.
        assert_allclose(nchypergeom_wallenius.mean(N, m1, n, w),
                        mean(N, m1, n, w),

        def variance(N, m1, n, w):
            # Reference variance computed from the reference mean u.
            m2 = N - m1
            u = mean(N, m1, n, w)
            a = u * (m1 - u)
            b = (n - u) * (u + m2 - n)
            return N * a * b / ((N - 1) * (m1 * b + m2 * a))

        # NOTE(review): this call is not closed in the source as given.
        assert_allclose(nchypergeom_wallenius.stats(N, m1, n, w, moments='v'),
                        variance(N, m1, n, w),

        def pmf(x, N, m1, n, w):
            # Reference pmf: product of two binomial coefficients and a
            # numerically evaluated integral.
            m2 = N - m1
            xl, xu = support(N, m1, n, w)

            def integrand(t):
                D = w * (m1 - x) + (m2 - (n - x))
                res = (1 - t**(w / D))**x * (1 - t**(1 / D))**(n - x)
                return res

            def f(x):
                t1 = special_binom(m1, x)
                t2 = special_binom(m2, n - x)
                # NOTE(review): the quad(...) call is cut off in the source
                # as given (integration limits and ")" are missing).
                the_integral = quad(integrand,
                return t1 * t2 * the_integral[0]

            return f(x)

        pmf0 = pmf(x, N, m1, n, w)
        pmf1 = nchypergeom_wallenius.pmf(x, N, m1, n, w)

        # Boolean mask of the entries where both pmf values agree within
        # atol + rtol * |pmf0|.
        atol, rtol = 1e-6, 1e-6
        i = np.abs(pmf1 - pmf0) < atol + rtol * np.abs(pmf0)
        assert (i.sum() > np.prod(shape) / 2)  # works at least half the time

        # for those that fail, discredit the naive implementation
        for N, m1, n, w in zip(N[~i], m1[~i], n[~i], w[~i]):
            # get the support
            m2 = N - m1
            xl, xu = support(N, m1, n, w)
            x = np.arange(xl, xu + 1)

            # calculate sum of pmf over the support
            # the naive implementation is very wrong in these cases
            assert pmf(x, N, m1, n, w).sum() < .5
            assert_allclose(nchypergeom_wallenius.pmf(x, N, m1, n, w).sum(), 1)
Exemplo n.º 31
class TestNCH():
    """Tests for nchypergeom_fisher and nchypergeom_wallenius.

    The class body draws shared random parameters once, when the class is
    defined; the test methods read them through ``self``.
    """
    np.random.seed(2)  # seeds 0 and 1 had some xl = xu; randint failed
    shape = (2, 4, 3)
    max_m = 100
    m1 = np.random.randint(1, max_m, size=shape)  # red balls
    m2 = np.random.randint(1, max_m, size=shape)  # white balls
    N = m1 + m2  # total balls
    n = randint.rvs(0, N, size=N.shape)  # number of draws
    xl = np.maximum(0, n - m2)  # lower bound of support
    xu = np.minimum(n, m1)  # upper bound of support
    x = randint.rvs(xl, xu, size=xl.shape)
    odds = np.random.rand(*x.shape) * 2

    # test output is more readable when function names (strings) are passed
    # NOTE(review): the next line looks like the tail of a decorator call
    # whose first line is missing in the source as given; confirm against
    # the original file.
                             ['nchypergeom_fisher', 'nchypergeom_wallenius'])
    def test_nch_hypergeom(self, dist_name):
        """Compare the pmf at odds=1 with hypergeom.pmf."""
        # Both noncentral hypergeometric distributions reduce to the
        # hypergeometric distribution when odds = 1
        # NOTE(review): the closing "}" of this dict is missing in the source
        # as given.
        dists = {
            'nchypergeom_fisher': nchypergeom_fisher,
            'nchypergeom_wallenius': nchypergeom_wallenius
        dist = dists[dist_name]
        x, N, m1, n = self.x, self.N, self.m1, self.n
        assert_allclose(dist.pmf(x, N, m1, n, odds=1),
                        hypergeom.pmf(x, N, m1, n))

    def test_nchypergeom_fisher_naive(self):
        """Compare nchypergeom_fisher pmf/mean/variance with a naive sum."""
        # test against a very simple implementation
        x, N, m1, n, odds = self.x, self.N, self.m1, self.n, self.odds

        # NOTE(review): range(xl, xu + 1) below needs scalar bounds while the
        # call site passes arrays -- presumably a vectorizing decorator was
        # lost from this snippet; confirm against the original.
        def pmf_mean_var(x, N, m1, n, w):
            # simple implementation of nchypergeom_fisher pmf
            m2 = N - m1
            xl = np.maximum(0, n - m2)
            xu = np.minimum(n, m1)

            def f(x):
                # Unnormalized weight of outcome x.
                t1 = special_binom(m1, x)
                t2 = special_binom(m2, n - x)
                return t1 * t2 * w**x

            def P(k):
                # Sum of f(y) * y**k over the support.
                return sum((f(y) * y**k for y in range(xl, xu + 1)))

            P0 = P(0)
            P1 = P(1)
            P2 = P(2)
            pmf = f(x) / P0
            mean = P1 / P0
            var = P2 / P0 - (P1 / P0)**2
            return pmf, mean, var

        pmf, mean, var = pmf_mean_var(x, N, m1, n, odds)
        assert_allclose(nchypergeom_fisher.pmf(x, N, m1, n, odds), pmf)
        # NOTE(review): the two calls below are cut off in the source as
        # given (expected value and ")" are missing).
        assert_allclose(nchypergeom_fisher.stats(N, m1, n, odds, moments='m'),
        assert_allclose(nchypergeom_fisher.stats(N, m1, n, odds, moments='v'),

    def test_nchypergeom_wallenius_naive(self):
        """Compare nchypergeom_wallenius with simple reference formulas.

        Draws its own random parameters rather than using the class-level
        ones.
        """
        # test against a very simple implementation

        shape = (2, 4, 3)
        max_m = 100
        m1 = np.random.randint(1, max_m, size=shape)
        m2 = np.random.randint(1, max_m, size=shape)
        N = m1 + m2
        n = randint.rvs(0, N, size=N.shape)
        xl = np.maximum(0, n - m2)
        xu = np.minimum(n, m1)
        x = randint.rvs(xl, xu, size=xl.shape)
        w = np.random.rand(*x.shape) * 2

        def support(N, m1, n, w):
            # Lower and upper bound of the support; w is not used here.
            m2 = N - m1
            xl = np.maximum(0, n - m2)
            xu = np.minimum(n, m1)
            return xl, xu

        def mean(N, m1, n, w):
            # Reference mean: root of fun(u) bracketed by the support bounds.
            m2 = N - m1
            xl, xu = support(N, m1, n, w)

            def fun(u):
                return u / m1 + (1 - (n - u) / m2)**w - 1

            return root_scalar(fun, bracket=(xl, xu)).root

        # NOTE(review): this call is not closed in the source as given.
        assert_allclose(nchypergeom_wallenius.mean(N, m1, n, w),
                        mean(N, m1, n, w),

        def variance(N, m1, n, w):
            # Reference variance computed from the reference mean u.
            m2 = N - m1
            u = mean(N, m1, n, w)
            a = u * (m1 - u)
            b = (n - u) * (u + m2 - n)
            return N * a * b / ((N - 1) * (m1 * b + m2 * a))

        # NOTE(review): this call is not closed in the source as given.
        assert_allclose(nchypergeom_wallenius.stats(N, m1, n, w, moments='v'),
                        variance(N, m1, n, w),

        def pmf(x, N, m1, n, w):
            # Reference pmf: product of two binomial coefficients and a
            # numerically evaluated integral.
            m2 = N - m1
            xl, xu = support(N, m1, n, w)

            def integrand(t):
                D = w * (m1 - x) + (m2 - (n - x))
                res = (1 - t**(w / D))**x * (1 - t**(1 / D))**(n - x)
                return res

            def f(x):
                t1 = special_binom(m1, x)
                t2 = special_binom(m2, n - x)
                # NOTE(review): the quad(...) call is cut off in the source
                # as given (integration limits and ")" are missing).
                the_integral = quad(integrand,
                return t1 * t2 * the_integral[0]

            return f(x)

        pmf0 = pmf(x, N, m1, n, w)
        pmf1 = nchypergeom_wallenius.pmf(x, N, m1, n, w)

        # Boolean mask of the entries where both pmf values agree within
        # atol + rtol * |pmf0|.
        atol, rtol = 1e-6, 1e-6
        i = np.abs(pmf1 - pmf0) < atol + rtol * np.abs(pmf0)
        assert (i.sum() > np.prod(shape) / 2)  # works at least half the time

        # for those that fail, discredit the naive implementation
        for N, m1, n, w in zip(N[~i], m1[~i], n[~i], w[~i]):
            # get the support
            m2 = N - m1
            xl, xu = support(N, m1, n, w)
            x = np.arange(xl, xu + 1)

            # calculate sum of pmf over the support
            # the naive implementation is very wrong in these cases
            assert pmf(x, N, m1, n, w).sum() < .5
            assert_allclose(nchypergeom_wallenius.pmf(x, N, m1, n, w).sum(), 1)

    def test_wallenius_against_mpmath(self):
        """Compare nchypergeom_wallenius with precomputed reference values."""
        # precompute data with mpmath since naive implementation above
        # is not reliable. See source code in gh-13330.
        M = 50
        n = 30
        N = 20
        odds = 2.25
        # Expected results, computed with mpmath.
        sup = np.arange(21)
        # NOTE(review): the array literal is cut off in the source as given;
        # only 20 values are listed for the 21 points in sup and the closing
        # "])" is missing.
        pmf = np.array([
            3.699003068656875e-20, 5.89398584245431e-17,
            2.1594437742911123e-14, 3.221458044649955e-12,
            2.4658279241205077e-10, 1.0965862603981212e-08,
            3.057890479665704e-07, 5.622818831643761e-06,
            7.056482841531681e-05, 0.000618899425358671, 0.003854172932571669,
            0.01720592676256026, 0.05528844897093792, 0.12772363313574242,
            0.21065898367825722, 0.24465958845359234, 0.1955114898110033,
            0.10355390084949237, 0.03414490375225675, 0.006231989845775931,
        mean = 14.808018384813426
        var = 2.6085975877923717

        # nchypergeom_wallenius.pmf returns 0 for pmf(0) and pmf(1), and pmf(2)
        # has only three digits of accuracy (~ 2.1511e-14).
        # NOTE(review): the three calls below are cut off in the source as
        # given (expected value, tolerances and ")" are missing).
        assert_allclose(nchypergeom_wallenius.pmf(sup, M, n, N, odds),
        assert_allclose(nchypergeom_wallenius.mean(M, n, N, odds),
        assert_allclose(nchypergeom_wallenius.var(M, n, N, odds),

    # NOTE(review): the next line looks like the tail of a decorator call
    # whose first line is missing in the source as given.
                             ['nchypergeom_fisher', 'nchypergeom_wallenius'])
    def test_rvs_shape(self, dist_name):
        """Check the shape of rvs output for a size of (5, 1, 2, 3)."""
        # Check that when given a size with more dimensions than the
        # dimensions of the broadcast parameters, rvs returns an array
        # with the correct shape.
        # NOTE(review): the closing "}" of this dict is missing in the source
        # as given.
        dists = {
            'nchypergeom_fisher': nchypergeom_fisher,
            'nchypergeom_wallenius': nchypergeom_wallenius
        dist = dists[dist_name]
        x = dist.rvs(50, 30, [[10], [20]], [0.5, 1.0, 2.0], size=(5, 1, 2, 3))
        assert x.shape == (5, 1, 2, 3)
Exemplo n.º 32
from scipy.stats import randint

def plot_1D_function(x, y, y_name='y'):
    """Plot ``y`` against ``x`` on subplot 111 of the current figure.

    Parameters
    ----------
    x, y : sequences of equal length with the abscissa and ordinate values.
    y_name : str, label attached to the plotted line (used by a legend).
    """
    ax = plt.subplot(111)
    # Pass the name as ``label``. As a third positional argument matplotlib
    # reads it as a format string: the default 'y' silently meant "yellow
    # line", and most other names raised an error.
    ax.plot(x, y, label=y_name)

# Grids of candidate shape parameters for the two beta components, and of
# candidate mixing weights.
# NOTE(review): each shape grid starts at 0, which is not a valid beta shape
# parameter, so a draw of index 0 presumably yields NaN curves -- confirm
# whether the grids should start above 0.
a_1 = np.linspace(0, 10, 100)
a_2 = np.linspace(0, 10, 100)
b_1 = np.linspace(0, 10, 100)
b_2 = np.linspace(0, 10, 100)
pi = np.linspace(0, 1, 10)
input_space = np.linspace(0, 1, 1000)

# Draw five random two-component beta mixtures and plot their curves.
# The loop body is indented with spaces only: the previous mix of tabs and
# 8-space indents is a TabError on Python 3.
for i in range(5):
    # Random grid index for the mixing weight and for every shape parameter.
    pi_rvs = randint.rvs(0, 10)
    a_1_rvs = randint.rvs(0, 100)
    a_2_rvs = randint.rvs(0, 100)
    b_1_rvs = randint.rvs(0, 100)
    b_2_rvs = randint.rvs(0, 100)

    # Weighted component curves, computed once and reused below.
    weight = pi[pi_rvs]
    first_pdf = weight * beta.pdf(input_space, a_1[a_1_rvs], b_1[b_1_rvs])
    second_pdf = (1 - weight) * beta.pdf(input_space, a_2[a_2_rvs], b_2[b_2_rvs])
    first_cdf = weight * beta.cdf(input_space, a_1[a_1_rvs], b_1[b_1_rvs])
    second_cdf = (1 - weight) * beta.cdf(input_space, a_2[a_2_rvs], b_2[b_2_rvs])
    bibeta_example_pdf = first_pdf + second_pdf
    bibeta_example_cdf = first_cdf + second_cdf

    # Densities: both weighted components and their sum.
    ax = plt.subplot(111)
    ax.plot(input_space, first_pdf, label="1 comp pdf")
    ax.plot(input_space, second_pdf, label="2 comp pdf")
    ax.plot(input_space, bibeta_example_pdf, label="Mix pdf")

    # Distribution functions of both weighted components.
    ax = plt.subplot(111)
    ax.plot(input_space, first_cdf, label="1 comp cdf")
    ax.plot(input_space, second_cdf, label="2 comp cdf")
Exemplo n.º 33
def get_fake_output_df(n_columns):
    """Build a DataFrame of fake output data.

    Every list-valued column holds ``n_columns`` entries, so despite its
    name the argument sets the length of each column. Most entries carry a
    nominal value; an entry is replaced by ``None`` or ``np.inf`` whenever
    ``randint.rvs(0, 30)`` returns 0.

    Returns ``pd.DataFrame(data, columns=columns)``.
    """
    # NOTE(review): the contents and the closing bracket of this list are
    # missing in the source as given (only blank lines remain) -- the snippet
    # appears truncated; confirm against the original file.
    columns = [









    string_placeholder = ["PLACEHOLDER"] * n_columns
    # The iterables below only set the number of entries; for zero_column and
    # one_column the loop variable is unused.
    zero_column = [0 if randint.rvs(0, 30) > 0 else  (None if randint.rvs(0, 2) > 0 else np.inf)
                  for i in randint.rvs(0, 1, size=n_columns)]
    one_column = [1 if randint.rvs(0, 30) > 0 else  (None if randint.rvs(0, 2) > 0 else np.inf)
                  for i in randint.rvs(0, 1, size=n_columns)]
    # Draws from norm.rvs, with the same None / np.inf substitution.
    float_column = [i if randint.rvs(0, 30) > 0 else (None if randint.rvs(0, 2) > 0 else np.inf)
                    for i in norm.rvs(size=n_columns)]
    # Cycle through the fixed values until n_columns entries are produced.
    zipcodes = ["01234", "12345", "23456", "34567", "43210", "54321", "65432", "76543"]
    zipcode_column = [i for i in islice(cycle(zipcodes), None, n_columns)]
    core_day_set_names = ["cooling_2012", "heating_2012-2013", "cooling_2013"]
    core_day_set_name_column = [i for i in islice(cycle(core_day_set_names), None, n_columns)]

    # Column name -> values. The same list objects (string_placeholder,
    # one_column, float_column, ...) are shared by many keys.
    # NOTE(review): the closing "}" of this dict is missing in the source as
    # given; confirm against the original file.
    data = {
        'sw_version': string_placeholder,

        'ct_identifier': string_placeholder,
        'equipment_type': string_placeholder,
        'heating_or_cooling': core_day_set_name_column,
        'station': string_placeholder,
        'zipcode': zipcode_column,
        'climate_zone': string_placeholder,

        'start_date': datetime(2011, 1, 1),
        'end_date': datetime(2012, 1, 1),
        'n_days_both_heating_and_cooling': one_column,
        'n_days_in_inputfile_date_range': one_column,
        'n_days_insufficient_data': zero_column,
        'n_core_heating_days': one_column,

        'baseline_percentile_core_cooling_comfort_temperature': float_column,
        'baseline_percentile_core_heating_comfort_temperature': float_column,
        'regional_average_baseline_cooling_comfort_temperature': float_column,
        'regional_average_baseline_heating_comfort_temperature': float_column,

        'percent_savings_baseline_percentile': float_column,
        'avoided_daily_mean_core_day_runtime_baseline_percentile': float_column,
        'avoided_total_core_day_runtime_baseline_percentile': float_column,
        'baseline_daily_mean_core_day_runtime_baseline_percentile': float_column,
        'baseline_total_core_day_runtime_baseline_percentile': float_column,
        '_daily_mean_core_day_demand_baseline_baseline_percentile': float_column,
        'percent_savings_baseline_regional': float_column,
        'avoided_daily_mean_core_day_runtime_baseline_regional': float_column,
        'avoided_total_core_day_runtime_baseline_regional': float_column,
        'baseline_daily_mean_core_day_runtime_baseline_regional': float_column,
        'baseline_total_core_day_runtime_baseline_regional': float_column,
        '_daily_mean_core_day_demand_baseline_baseline_regional': float_column,
        'mean_demand': float_column,
        'alpha': float_column,
        'tau': float_column,
        'mean_sq_err': float_column,
        'root_mean_sq_err': float_column,
        'cv_root_mean_sq_err': float_column,
        'mean_abs_err': float_column,
        'mean_abs_pct_err': float_column,

        'total_core_cooling_runtime': float_column,
        'total_core_heating_runtime': float_column,
        'total_auxiliary_heating_core_day_runtime': float_column,
        'total_emergency_heating_core_day_runtime': float_column,

        'daily_mean_core_cooling_runtime': float_column,
        'daily_mean_core_heating_runtime': float_column,

        'rhu_00F_to_05F': float_column,
        'rhu_05F_to_10F': float_column,
        'rhu_10F_to_15F': float_column,
        'rhu_15F_to_20F': float_column,
        'rhu_20F_to_25F': float_column,
        'rhu_25F_to_30F': float_column,
        'rhu_30F_to_35F': float_column,
        'rhu_35F_to_40F': float_column,
        'rhu_40F_to_45F': float_column,
        'rhu_45F_to_50F': float_column,
        'rhu_50F_to_55F': float_column,
        'rhu_55F_to_60F': float_column,
    df = pd.DataFrame(data, columns=columns)
    return df
Exemplo n.º 34
def get_fake_output_df(n_columns):
    """Build a DataFrame of fake seasonal output data.

    Every list-valued column holds ``n_columns`` entries, so despite its
    name the argument sets the length of each column. Most entries carry a
    nominal value; an entry is replaced by ``None`` or ``np.inf`` whenever
    ``randint.rvs(0, 30)`` returns 0.

    Returns ``pd.DataFrame(data, columns=columns)``.
    """
    # NOTE(review): the contents and the closing bracket of this list are
    # missing in the source as given (only blank lines remain) -- the snippet
    # appears truncated; confirm against the original file.
    columns = [



















    string_placeholder = ["PLACEHOLDER"] * n_columns
    # The iterables below only set the number of entries; for zero_column and
    # one_column the loop variable is unused.
    zero_column = [0 if randint.rvs(0, 30) > 0 else  (None if randint.rvs(0, 2) > 0 else np.inf)
                  for i in randint.rvs(0, 1, size=n_columns)]
    one_column = [1 if randint.rvs(0, 30) > 0 else  (None if randint.rvs(0, 2) > 0 else np.inf)
                  for i in randint.rvs(0, 1, size=n_columns)]
    # Draws from norm.rvs, with the same None / np.inf substitution.
    float_column = [i if randint.rvs(0, 30) > 0 else (None if randint.rvs(0, 2) > 0 else np.inf)
                    for i in norm.rvs(size=n_columns)]
    # Cycle through the fixed values until n_columns entries are produced.
    zipcodes = ["01234", "12345", "23456", "34567", "43210", "54321", "65432", "76543"]
    zipcode_column = [i for i in islice(cycle(zipcodes), None, n_columns)]
    season_names = ["Cooling 2012", "Heating 2012-2013", "Cooling 2013"]
    season_name_column = [i for i in islice(cycle(season_names), None, n_columns)]
    # Column name -> values. The same list objects (string_placeholder,
    # one_column, float_column, ...) are shared by many keys.
    # NOTE(review): the closing "}" of this dict is missing in the source as
    # given; confirm against the original file.
    data = {
        "ct_identifier": string_placeholder,
        "equipment_type": string_placeholder,
        "season_name": season_name_column,
        "station": string_placeholder,
        "zipcode": zipcode_column,

        "n_days_both_heating_and_cooling": one_column,
        "n_days_in_season": one_column,
        "n_days_in_season_range": one_column,
        "n_days_insufficient_data": zero_column,

        "seasonal_savings_deltaT": float_column,
        "seasonal_savings_dailyavgCDD": float_column,
        "seasonal_savings_dailyavgHDD": float_column,
        "seasonal_savings_hourlyavgCDD": float_column,
        "seasonal_savings_hourlyavgHDD": float_column,

        "seasonal_avoided_runtime_deltaT": float_column,
        "seasonal_avoided_runtime_dailyavgCDD": float_column,
        "seasonal_avoided_runtime_dailyavgHDD": float_column,
        "seasonal_avoided_runtime_hourlyavgCDD": float_column,
        "seasonal_avoided_runtime_hourlyavgHDD": float_column,

        "total_heating_runtime": float_column,
        "total_cooling_runtime": float_column,
        "total_auxiliary_heating_runtime": float_column,
        "total_emergency_heating_runtime": float_column,

        "actual_daily_runtime": float_column,
        "actual_seasonal_runtime": float_column,

        "baseline_comfort_temperature": float_column,

        "baseline_daily_runtime_deltaT": float_column,
        "baseline_daily_runtime_dailyavgCDD": float_column,
        "baseline_daily_runtime_dailyavgHDD": float_column,
        "baseline_daily_runtime_hourlyavgCDD": float_column,
        "baseline_daily_runtime_hourlyavgHDD": float_column,

        "baseline_seasonal_runtime_deltaT": float_column,
        "baseline_seasonal_runtime_dailyavgCDD": float_column,
        "baseline_seasonal_runtime_dailyavgHDD": float_column,
        "baseline_seasonal_runtime_hourlyavgCDD": float_column,
        "baseline_seasonal_runtime_hourlyavgHDD": float_column,

        "mean_demand_dailyavgCDD": float_column,
        "mean_demand_dailyavgHDD": float_column,
        "mean_demand_deltaT": float_column,
        "mean_demand_hourlyavgCDD": float_column,
        "mean_demand_hourlyavgHDD": float_column,

        "mean_demand_baseline_dailyavgCDD": float_column,
        "mean_demand_baseline_dailyavgHDD": float_column,
        "mean_demand_baseline_deltaT": float_column,
        "mean_demand_baseline_hourlyavgCDD": float_column,
        "mean_demand_baseline_hourlyavgHDD": float_column,

        "rhu_00F_to_05F": float_column,
        "rhu_05F_to_10F": float_column,
        "rhu_10F_to_15F": float_column,
        "rhu_15F_to_20F": float_column,
        "rhu_20F_to_25F": float_column,
        "rhu_25F_to_30F": float_column,
        "rhu_30F_to_35F": float_column,
        "rhu_35F_to_40F": float_column,
        "rhu_40F_to_45F": float_column,
        "rhu_45F_to_50F": float_column,
        "rhu_50F_to_55F": float_column,
        "rhu_55F_to_60F": float_column,

        "slope_deltaT": float_column,
        "alpha_est_dailyavgCDD": float_column,
        "alpha_est_dailyavgHDD": float_column,
        "alpha_est_hourlyavgCDD": float_column,
        "alpha_est_hourlyavgHDD": float_column,

        "intercept_deltaT": float_column,
        "deltaT_base_est_dailyavgCDD": float_column,
        "deltaT_base_est_dailyavgHDD": float_column,
        "deltaT_base_est_hourlyavgCDD": float_column,
        "deltaT_base_est_hourlyavgHDD": float_column,

        "mean_sq_err_dailyavgCDD": float_column,
        "mean_sq_err_dailyavgHDD": float_column,
        "mean_sq_err_deltaT": float_column,
        "mean_sq_err_hourlyavgCDD": float_column,
        "mean_sq_err_hourlyavgHDD": float_column,

        "root_mean_sq_err_dailyavgCDD": float_column,
        "root_mean_sq_err_dailyavgHDD": float_column,
        "root_mean_sq_err_deltaT": float_column,
        "root_mean_sq_err_hourlyavgCDD": float_column,
        "root_mean_sq_err_hourlyavgHDD": float_column,

        "cv_root_mean_sq_err_dailyavgCDD": float_column,
        "cv_root_mean_sq_err_dailyavgHDD": float_column,
        "cv_root_mean_sq_err_deltaT": float_column,
        "cv_root_mean_sq_err_hourlyavgCDD": float_column,
        "cv_root_mean_sq_err_hourlyavgHDD": float_column,

        "mean_abs_err_dailyavgCDD": float_column,
        "mean_abs_err_dailyavgHDD": float_column,
        "mean_abs_err_deltaT": float_column,
        "mean_abs_err_hourlyavgCDD": float_column,
        "mean_abs_err_hourlyavgHDD": float_column,

        "mean_abs_pct_err_dailyavgCDD": float_column,
        "mean_abs_pct_err_dailyavgHDD": float_column,
        "mean_abs_pct_err_deltaT": float_column,
        "mean_abs_pct_err_hourlyavgCDD": float_column,
        "mean_abs_pct_err_hourlyavgHDD": float_column,

    df = pd.DataFrame(data, columns=columns)
    return df
Exemplo n.º 35
import matplotlib.pyplot as plt

from scipy.stats import randint
import numpy as np

# The distributions below are referenced through the full ``scipy.stats``
# path, so the package itself must be bound; ``from scipy.stats import
# randint`` alone leaves the name ``scipy`` undefined.
import scipy.stats

# normal: print mean, variance, skew and kurtosis ("mvsk")
distribution = scipy.stats.norm(loc=100, scale=5)
print(distribution.stats("mvsk"))
# skewed: generalized gamma distribution, same four moments
distribution = scipy.stats.gengamma(100, 90, loc=50, scale=10)
print(distribution.stats("mvsk"))

# Each of the following assignments overwrites ``sample``; only the last
# one (the weighted choice) is still bound when the script finishes.
sample = distribution.rvs(size=10000)

# Uniform integers in [0, 208) -- the upper bound is exclusive.
sample = randint.rvs(0, 208, size=1000)

# Candidate values 1..100
pers = np.arange(1, 101, 1)

# Make each of the last 41 elements 5x more likely.  Built as an ndarray so
# the in-place normalisation below does not rely on list/NumPy coercion.
prob = np.array([1.0] * (len(pers) - 41) + [5.0] * 41)

# Normalising to 1.0
prob /= prob.sum()

sample = np.random.choice(pers, 1000, p=prob)

Exemplo n.º 36
def z_enrichment_test(node_pvals_dict, the_grouping_dict, **kwargs):
    """Perform enrichment analysis on predefined groupings via z-scores.

    The p-value of every node is converted to a z-score, the z-scores of
    each grouping are aggregated (sum / sqrt(k)) and the aggregate is then
    re-normalized against an empirical null distribution obtained by drawing
    random node sets of similar size.  This is similar to the statistical
    subnetwork scoring system used in:

     Ideker, T., Ozier, O., Schwikowski, B., & Siegel, A. (2002).
     Discovering regulatory and signalling circuits in molecular interaction
     networks.  Bioinformatics, 18, 233-240.

    Except here we use the pre-defined groupings in the_grouping_dict rather
    than scanning for novel subnetworks.

    Arguments:
     node_pvals_dict: dict of node_id: sub-dictionary with the keys
      'p_uncorrected' (or 'p'): the p-value, assumed to result from a
                      two-tailed test for changes and to span [0 - 1].
      't': only read for 'signed' aggregation, where its sign decides the
           sign of the node's z-score.
      'z' is optional, if 'use_type' = z then it is needed
     the_grouping_dict: a dict of subsystem_id: [node_id_1, node_id_2, ...]
      Subsystem ids are lower-cased, so ids differing only by case merge.

    kwargs:
     navg_node_sample: default is 100.0.  The groupings are randomized
      such that each gene is sampled an average of 100 times.
     diagnostic: [False (default), True]: if True, the randomly generated
                 scores will also be returned.
     aggregation_type: options are 'unsigned' (default) or 'signed'
      'unsigned': the z-value ranges from 0 to + inf.  This method
                  picks out gross changes in subsystems and ignores
                  whether they are increasing or decreasing.
      'signed': the z-value ranges from -inf to +inf.  This method picks
                out coordinated changes in subsystems.

    Returns:
     grouping_scores_dict, keyed by lower-cased subsystem id, with keys
      ind_p / ind_node (/ ind_t) = the per-node inputs that were found
      agg_z = the aggregated z-value without background correction
      agg_adj_z = the aggregated z-value with background correction
      agg_p = a p-value resulting from a two-tail test for changes
       (e.g. near 0 is more significant) assuming the agg_adj_z is truly
       normally distributed.
     if diagnostic = True, (grouping_scores_dict, random_scores_to_return)
     is returned, where random_scores_to_return maps each simulated group
     size k to its array of random aggregate scores.

    NOTE(review): the available text of this function had lost several
    lines (``else:`` branches, loop bodies, the docstring terminator).
    Statements marked "reconstructed" below were inferred from how the
    data is used later in the function and should be confirmed against the
    upstream source.  Relies on the helpers ``test_kwarg`` and ``smooth``
    defined elsewhere in the module.
    """
    from copy import deepcopy
    from numpy import array, mean, nan, std
    from scipy.stats import norm, randint

    diagnostic = test_kwarg('diagnostic', kwargs, [False, True])
    aggregation_type = test_kwarg('aggregation_type', kwargs,
                                  ['unsigned', 'signed'])
    # Previously the user-supplied value was unconditionally overwritten
    # with the default.
    navg_node_sample = kwargs.get('navg_node_sample', 100.0)

    grouping_scores_dict = {}
    node_pvals_dictl = deepcopy(node_pvals_dict)
    random_scores_to_return = {}

    # First get the subsystems
    for subsystem in the_grouping_dict:
        # Enforce lower case to avoid duplication
        subsystem = subsystem.lower()
        if subsystem not in grouping_scores_dict:
            entry = {'ind_p': [], 'ind_node': [],
                     'agg_p': nan, 'agg_z': nan, 'agg_adj_z': nan}
            if aggregation_type == 'signed':
                entry['ind_t'] = []
            grouping_scores_dict[subsystem] = entry

    # Now add in pvals
    # Give preference to uncorrected p-values since Bonferroni corrected
    # values are truncated at 1.  Don't need a multiple testing correction
    # since we are looking across the subsystem and correcting
    # would potentially lose information in detected differences.
    # We re-normalize p-values for subnetworks according to an
    # empirical null distribution at the end.
    node_list = list(node_pvals_dictl)
    test_node = node_list[0]
    if 'p_uncorrected' in node_pvals_dictl[test_node]:
        p_key = 'p_uncorrected'
    else:
        p_key = 'p'

    for subsystem in the_grouping_dict:
        entry = grouping_scores_dict[subsystem.lower()]
        for the_node in the_grouping_dict[subsystem]:
            if the_node in node_pvals_dictl:
                # Reconstructed: collect the per-node inputs that the
                # aggregation step below reads back.
                entry['ind_p'].append(node_pvals_dictl[the_node][p_key])
                entry['ind_node'].append(the_node)
                if aggregation_type == 'signed':
                    entry['ind_t'].append(node_pvals_dictl[the_node]['t'])

    # Now aggregate.  Make a lookuptable of size vs p values
    meanlookup = []
    sdlookup = []
    # Haven't pressure tested the window with even numbers,
    # but odd values make more sense anyway.
    windowsize = 5
    # Floor division keeps the bounds integral on Python 2 and 3 alike.
    half_window = (windowsize - 1) // 2
    # It can be slow to compute system statistics,
    # so we need to be selective and just evaluate
    # around the sample sizes of interest
    sizes_of_interest = set()
    for subsystem in grouping_scores_dict:
        k = len(grouping_scores_dict[subsystem]['ind_p'])
        if k > 0:
            sizes_of_interest.update(
                range(max(1, k - half_window), k + half_window + 1))
    # Sorted so that the positional smoothing below really averages over
    # neighbouring group sizes; a bare set has no guaranteed order.
    k_to_evaluate = sorted(sizes_of_interest)

    # to speed calculations pre-convert to a z-score
    pval_list = [node_pvals_dictl[curnode][p_key] for curnode in node_list]

    # Want to replace 0 or 1 pvals,
    # use the next nearest value
    filter_pval_list = [x for x in pval_list if 0 < x < 1]
    min_val = min(filter_pval_list)
    # Was min(...): p-values of 1 were replaced by the *smallest* p-value.
    max_val = max(filter_pval_list)
    for i, x in enumerate(pval_list):
        if x >= 1:
            pval_list[i] = max_val
        if x <= 0:
            pval_list[i] = min_val

    if aggregation_type == 'signed':
        # Here, we aggregate using p-values
        # resulting from one-tailed tests
        # where p = 0.5 means no change and
        # decreases in expression imply negative z
        # when aggregating.
        # Convert our z first then take the sign
        # to minimize numerical issues.
        # This method is equivalent to
        # Stouffer's method.
        print("Warning, verify assumptions for signed averaging, this has not been done in a while.")
        zval_list = norm.ppf(pval_list)
        tval_list = [node_pvals_dictl[curnode]['t'] for curnode in node_list]
        for index, t_val in enumerate(tval_list):
            if t_val > 0:
                zval_list[index] = -1 * zval_list[index]
    else:
        # Ideker 2002 and also Patil 2005 use an
        # undirected p-value when aggregating
        # Z-scores for p-values - e.g. they
        # use the "significance of the change"
        # where more negative z corresponds to
        # p ~ 1 and little change.
        zval_list = -1 * norm.ppf(pval_list)

    for position, k in enumerate(k_to_evaluate):
        print('Simulating measures for subsystem number '+ str(position+1) + ' of '+ str(len(k_to_evaluate)) + '.')
        # Should need more trials with small k.
        # Rule of thumb: set size so all model
        # genes are sampled on the average > 7x
        # This and windowsize = 5 seem to be result in
        # fairly stable statistics from trial-and-error.
        ntrials = int(round(navg_node_sample*float(len(node_pvals_dictl))/float(k)))

        # Generate random indices between 0 and nmeasures - 1, as an array
        # of size ntrials rows and k columns
        the_random_indices = randint.rvs(0, len(node_list), size=(ntrials, k))
        # Faster to do this here as an array operation than call
        # stouffer_z_agg
        random_score_distribution = array(
            [sum(zval_list[x]) for x in the_random_indices]) / (k**0.5)
        # Note SD's defined by this method are approximately size-independent
        # Reconstructed: the lookups are indexed by position in
        # k_to_evaluate further down, so they are filled here.
        # NOTE(review): confirm whether the original used ddof=1.
        meanlookup.append(mean(random_score_distribution))
        sdlookup.append(std(random_score_distribution))
        if diagnostic:
            random_scores_to_return[k] = random_score_distribution

    # The SD as assessed here should be independent of size
    # apply a smoothing filter here first
    sdlookup = list(smooth(array(sdlookup), window_len=windowsize))

    # The mean will be dependent on k**.5; To avoid edge effects
    # of the window first normalize then apply the smoothing filter
    for position, k in enumerate(k_to_evaluate):
        meanlookup[position] = meanlookup[position] / (k ** 0.5)

    meanlookup = list(smooth(array(meanlookup), window_len=windowsize))

    for position, k in enumerate(k_to_evaluate):
        meanlookup[position] = meanlookup[position] * (k ** 0.5)

    # Re-normalize the mean before averaging
    for subsystem in grouping_scores_dict:
        entry = grouping_scores_dict[subsystem]
        k = len(entry['ind_p'])
        if k > 0:
            lookup_position = k_to_evaluate.index(k)
            members = set(entry['ind_node'])
            node_indices = [index for index, node in enumerate(node_list)
                            if node in members]
            agg_z = sum([zval_list[index] for index in node_indices]) / (k**0.5)
            agg_adj_z = ((agg_z - meanlookup[lookup_position])
                         / sdlookup[lookup_position])
            agg_p = norm.cdf(agg_adj_z)
            # Convert back to a two-sided p value, this is twice the
            # one-sided value
            if aggregation_type == 'signed':
                if agg_p < .5:
                    agg_p = 2 * agg_p
                else:
                    agg_p = 2 * (1 - agg_p)
            else:
                # Unsigned: large positive z means significant change.
                agg_p = 1 - agg_p
            entry['agg_z'] = agg_z
            entry['agg_adj_z'] = agg_adj_z
            entry['agg_p'] = agg_p
            if diagnostic:
                # NOTE(review): kept as found -- this stores the normal CDF
                # of the lookup *position*, which looks unintended.
                entry['z'] = norm.cdf(lookup_position)

    if not diagnostic:
        return grouping_scores_dict
    return grouping_scores_dict, random_scores_to_return
def distrib(param1,param2,param3,param4,egg):
	# Draws the next value of property `param3` for every element listed in
	# `param1`, according to the domain type found in `param2`, inserts it
	# into `egg[element][param3]` at index `param4`, and returns `egg`.
	#
	# NOTE(review): this copy of the function is missing statements.  Several
	# names are read but never assigned in the visible text (`value_pr` in the
	# first loop, `bool_succ`, `no_succ_elements`, `succ_list`, `offset_list`,
	# `indice`, `i`, `next_value`), several `if`/`for` headers have no body,
	# and the "Error: Check the configuration" prints look like the bodies of
	# `except` clauses whose `try:`/`except:` lines are gone.  Restore those
	# from the upstream source before running this.
	#
	#param1 : list_element
	#param2 : configuration of the property
	#param3 : name of the property
	#param4 : snapshot id
	#egg which is egg

	# qualitative: begin

	if param2['domain']['type']=="qualitatif":

		# qualitative without order: begin

		if param2['domain']['order']=="false":


			for  succ_key in param2["evolution"]["succesors"]:

			######################### split the set of elements into several sets according to the succession rule
			for param1_element in param1:


				for succ_key in param2["evolution"]["succesors"]:


					if succ_key == value_pr :




				if bool_succ == False:


			######################### end

			#########################  assign values to the elements without a successor rule (no_succ_elements)
				# one random index into the domain values per element
				random =  list(randint.rvs(0, len(param2['domain']['values']), size=len(no_succ_elements)))
				print '\n***************\n*\n*	Error: Check the configuration of property: ',param3,'\n*\n***************'
			for elementId in no_succ_elements:


			######################### end

			######################### assign values to the elements of succ_elements

			succ_index =0

			for  succ_key in param2["evolution"]["succesors"]:

					# one random index into this key's successor list per element
					random =  list(randint.rvs(0, len(param2['evolution']['succesors'][succ_key]), size=len(succ_list)))
					print '\n***************\n*\n*	Error: Check the configuration of property: ',param3,'\n*\n***************'
				for elementId in succ_list:



			######################### end

		# qualitative without order: end

		# qualitative with order: begin

				# list every integer offset from offset.min to offset.max inclusive
				for m in range(0,param2['evolution']['offset']['max']-param2['evolution']['offset']['min']+1):
					### m does not go all the way to the end
					offset_list.append(param2['evolution']['offset']['min'] + m)
				print '\n***************\n*\n*	Error: Check the configuration of property: ',param3,'\n*\n***************'
			# uniform:begin
			if param2['evolution']['offset']['distribution']['type']=="uniform":

					# uniformly drawn index into offset_list, one per element
					random =  randint.rvs(0, len(offset_list), size=len(param1))
					print '\n***************\n*\n*	Error: Check the configuration of property: ',param3,'\n*\n***************'
				for param1_element in param1 :

					value_pr=egg[param1_element][param3][param4-1] # value of the previous element, needed for offset

					if len(param2['domain']['values'])-1<indice+offset_list[random[i]]:########## enter here only when indice+offset_list[random[0]] is bigger than the biggest index
						egg[param1_element][param3].insert(param4,param2['domain']['values'][len(param2['domain']['values'])-1])##### we take the last value
					elif indice+offset_list[random[i]]<0:
						egg[param1_element][param3].insert(param4,param2['domain']['values'][0]) ######### we take the first value
			# uniform:end

			# binom:begin
			if param2['evolution']['offset']['distribution']['type']=="binom":
					# binomially drawn index into offset_list (n = len - 1), one per element
					random =  binom.rvs(len(offset_list)-1,param2['evolution']['offset']['distribution']["p"] , size=len(param1))
					print '\n***************\n*\n*	Error: Check the configuration of property: ',param3,'\n*\n***************'
				for param1_element in param1 :

					value_pr=egg[param1_element][param3][param4-1] # value of the previous element, needed for offset

					#logging.info( param3+param1+str(param4)+str(indice)+str(offset_list)+str(random[0]))
					if len(param2['domain']['values'])-1<indice+offset_list[random[i]]:########## enter here only when indice+offset_list[random[0]] is bigger than the biggest index
						egg[param1_element][param3].insert(param4,param2['domain']['values'][len(param2['domain']['values'])-1]) ##### we take the last value
					elif indice+offset_list[random[i]]<0:
						egg[param1_element][param3].insert(param4,param2['domain']['values'][0]) ######### we take the first value

			# binom:end

		# qualitative with order: end

	# qualitative: end

	# quantitative, discrete: begin

	if param2['domain']['type']=="quantitatif:dis":

			for m in range(0,param2['evolution']['offset']['max']-param2['evolution']['offset']['min']+1):
				### I added 1 because there was a bug I did not understand: m did not reach the largest number
				offset_list.append(param2['evolution']['offset']['min'] + m)
			print '\n***************\n*\n*	Error: Check the configuration of property: ',param3,'\n*\n***************'

			random =  binom.rvs(len(offset_list)-1,param2['evolution']['offset']['distribution']["p"] , size=len(param1))
			print '\n***************\n*\n*	Error: Check the configuration of property: ',param3,'\n*\n***************'
		for param1_element in param1 :

			value_pr=egg[param1_element][param3][param4-1] # value of the previous element, needed for offset
			### I add -1 so that random lies between 0 and the largest index of offset_list, which is its size - 1
			if next_value < param2["domain"]["values"]["min"]:
			elif next_value >param2["domain"]["values"]["max"]: 


	# quantitative, discrete: end

	# quantitative, continuous: begin

	if param2['domain']['type']=="quantitatif:con":
		# standard-normal draws, one per element; scaled by sigma and shifted by mean below
		random = norm.rvs(size=len(param1))

		for param1_element in param1 :

			value_pr=egg[param1_element][param3][param4-1] # value of the previous element, needed for offset

				offset = (random[i]*param2['evolution']['offset']['distribution']['sigma'])+param2['evolution']['offset']['distribution']['mean']
				print '\n***************\n*\n*	Error: Check the configuration of property: ',param3,'\n*\n***************'
			# clamp the offset to the configured [min, max] range
			if offset > param2['evolution']['offset']['max']:
				offset = param2['evolution']['offset']['max']
			elif offset < param2['evolution']['offset']['min']:
				offset = param2['evolution']['offset']['min']

			if next_value < param2["domain"]["values"]["min"]:
			elif next_value >param2["domain"]["values"]["max"]: 

	# quantitative, continuous: end

	return egg
Exemplo n.º 38
def identify_reporter_metabolites(cobra_model, reaction_scores_dict,
                                  number_of_randomizations=1000,
                                  scoring_metric='default', score_type='p',
                                  entire_network=False,
                                  background_correction=True,
                                  ignore_external_boundary_reactions=False):
    """Calculate the aggregate Z-score for the metabolites in the model.
    Ignore reactions that are solely spontaneous or orphan. Allow the scores to
    have multiple columns / experiments.   This will change the way the output
    is represented.

    cobra_model: A cobra.Model object.  Not read by the current
    implementation; the network is reached through the reaction objects.

    TODO: CHANGE TO USING DICTIONARIES for the_reactions: the_scores

    reaction_scores_dict:  A dictionary where the keys are reactions in
    cobra_model.reactions and the values are the scores.  Currently, only
    supports a single numeric value as the value; however, this will be updated
    to allow for lists

    number_of_randomizations: Integer.  Number of random shuffles of the
    scores to assess which are significant.

    scoring_metric: default means divide by k**0.5

    score_type: 'p' Is the only option at the moment and indicates p-value.

    entire_network: Boolean. Currently, only compares scores calculated from
    the_reactions.  If True, connected reactions without a score still count
    towards a metabolite's number of connections.

    background_correction: Boolean.  If True apply background correction to the
    aggreagate Z-score

    ignore_external_boundary_reactions: Not yet implemented. Boolean.  If True
    do not count exchange reactions when calculating the score.

    Returns a dictionary with the keys 'scores' (metabolite: aggregate
    score), 'connections' (metabolite: number of reactions counted) and, when
    background_correction is True, 'corrections' (number of connections:
    [mean, standard deviation] of the randomly sampled scores).

    NOTE(review): the signature tail and a few statements had been lost from
    this copy and were restored to agree with the sibling copy of this
    function; reaction objects are assumed to expose ``_metabolites``.
    """
    # Add in a function to calculate based on correlation coefficients and to
    # deal with other multidimensional data.
    # list() so that indexing also works on dictionary views.
    the_reactions = list(reaction_scores_dict.keys())
    the_scores = list(reaction_scores_dict.values())
    if the_scores and hasattr(the_scores[0], '__iter__'):
        # In the case that the_scores is a list of lists, assume that each list
        # is the score for each reaction in the_reactions across all reactions.
        # Then for each metabolite, calculate the invnorm(|Pearson Correlation
        # Coefficient| for each reaction pair that it links.
        raise Exception("This isn't implemented yet")
    if score_type == 'p' and the_scores:
        # minimum and maximum p-values are used to prevent numerical problems.
        # haven't decided whether an arbitrary min / max 1e-15 is preferred to
        # blunting the ends based on the values closest to 0 or 1.
        the_scores = array(the_scores, dtype=float)
        minimum_p = min(the_scores[the_scores.nonzero()[0]])
        maximum_p = max(the_scores[where(the_scores < 1)[0]])
        the_scores[where(the_scores < minimum_p)] = minimum_p
        the_scores[where(the_scores > maximum_p)] = maximum_p
        the_scores = -norm.ppf(the_scores)
        # update the dictionary with the new scores
        reaction_scores_dict = dict(zip(the_reactions, the_scores))

    # Get the connectivity for each metabolite
    the_metabolites = set()
    for x in reaction_scores_dict:
        the_metabolites.update(x._metabolites)

    metabolite_scores = {}
    metabolite_connections = {}
    # Calculate the score for each metabolite
    for the_metabolite in the_metabolites:
        nonspontaneous_connections = [
            x for x in the_metabolite._reaction
            if x.gene_reaction_rule.lower() not in ['s0001', '']
        ]
        tmp_score = 0
        number_of_connections = len(nonspontaneous_connections)
        for the_reaction in nonspontaneous_connections:
            if the_reaction not in reaction_scores_dict:
                # Unscored reactions contribute nothing to the sum and, unless
                # the entire network is considered, are not counted either.
                if not entire_network:
                    number_of_connections -= 1
                continue
            tmp_score += reaction_scores_dict[the_reaction]
        metabolite_scores[the_metabolite] = tmp_score
        metabolite_connections[the_metabolite] = number_of_connections

    # NOTE: Doing the corrections based only on the significantly perturbed
    # scores is probably going to underestimate the significance.
    if background_correction:
        correction_dict = {}
        score_pool = array(the_scores)
        for i in set(metabolite_connections.values()):
            # if entire_network # add in a section to deal with the situation
            # where the entire network structure is considered by only have
            # p-values for a limited subset.
            # Basically, what we're doing here is that for each i we select i
            # scores number_of_randomizations times
            the_random_indices = randint.rvs(
                0, len(score_pool), size=(number_of_randomizations, i))
            random_score_distribution = array(
                [sum(score_pool[x])
                 for x in the_random_indices]) / i**0.5
            correction_dict[i] = [
                mean(random_score_distribution),
                std(random_score_distribution, ddof=1)
            ]

    # Only values of existing keys are replaced, so iterating over items()
    # while assigning is safe.
    for the_metabolite, the_score in metabolite_scores.items():
        number_of_connections = metabolite_connections[the_metabolite]
        if number_of_connections > 0:
            # Correct based on background distribution
            if background_correction:
                # if the list of scores is only for significant perturbations
                # then the background correction shouldn't be applied because
                # the current sampling method only takes into account
                # the_scores not the entire network.  It'd be more accurate to
                # assign unscored reactions a default score.
                the_score = ((the_score / number_of_connections**.5) -
                             correction_dict[number_of_connections][0]) / \
                    correction_dict[number_of_connections][1]
            else:
                the_score = the_score / number_of_connections**.5
            # Update the score
            metabolite_scores[the_metabolite] = the_score

    return_dictionary = {
        'scores': metabolite_scores,
        'connections': metabolite_connections
    }
    if background_correction:
        return_dictionary['corrections'] = correction_dict

    return return_dictionary
Exemplo n.º 39
def identify_reporter_metabolites(cobra_model, reaction_scores_dict,
                                  number_of_randomizations=1000, number_of_layers=1,
                                  scoring_metric='default', score_type='p',
                                  entire_network=False, background_correction=True,
                                  ignore_external_boundary_reactions=False):
    """Calculate the aggregate Z-score for the metabolites in the model.
    Ignore reactions that are solely spontaneous or orphan. Allow the scores to
    have multiple columns / experiments.   This will change the way the output
    is represented.

    cobra_model: A cobra.Model object.  Not read by the current
    implementation; the network is reached through the reaction objects.

    TODO: CHANGE TO USING DICTIONARIES for the_reactions: the_scores

    reaction_scores_dict:  A dictionary where the keys are reactions in cobra_model.reactions
    and the values are the scores.  Currently, only supports a single numeric value as
    the value; however, this will be updated to allow for lists

    number_of_randomizations: Integer.  Number of random shuffles of the
    scores to assess which are significant.

    number_of_layers: 1 is the only option supported

    scoring_metric:  default means divide by k**0.5

    score_type: 'p' Is the only option at the moment and indicates p-value.

    entire_network: Boolean.  Currently, only compares scores calculated from the_reactions.
    If True, connected reactions without a score still count towards a
    metabolite's number of connections.

    background_correction: Boolean.  If True apply background correction to the aggreagate
    Z-score

    ignore_external_boundary_reactions: Not yet implemented. Boolean.  If True do not count exchange reactions when
    calculating the score.

    Returns a dictionary with the keys 'scores' (metabolite: aggregate
    score), 'connections' (metabolite: number of reactions counted) and, when
    background_correction is True, 'corrections' (number of connections:
    [mean, standard deviation] of the randomly sampled scores).

    NOTE(review): the last signature line, the docstring terminator, a few
    statements and the final return had been lost from this copy and were
    restored; reaction objects are assumed to expose ``_metabolites``.
    """
    #Add in a function to calculate based on correlation coefficients and to
    #deal with other multidimensional data.
    #list() so that indexing also works on dictionary views.
    the_reactions = list(reaction_scores_dict.keys())
    the_scores = list(reaction_scores_dict.values())
    if the_scores and hasattr(the_scores[0], '__iter__'):
        #In the case that the_scores is a list of lists, assume that each list is
        #the score for each reaction in the_reactions across all reactions.  Then
        #for each metabolite, calculate the invnorm(|Pearson Correlation
        #Coefficient| for each reaction pair that it links.
        raise Exception("This isn't implemented yet")
    if score_type == 'p' and the_scores:
        #minimum and maximum p-values are used to prevent numerical problems.
        #haven't decided whether an arbitrary min / max 1e-15 is preferred to
        #blunting the ends based on the values closest to 0 or 1.
        the_scores = array(the_scores, dtype=float)
        minimum_p = min(the_scores[the_scores.nonzero()[0]])
        maximum_p = max(the_scores[where(the_scores < 1)[0]])
        the_scores[where(the_scores < minimum_p)] = minimum_p
        the_scores[where(the_scores > maximum_p)] = maximum_p
        the_scores = -norm.ppf(the_scores)
        #update the dictionary with the new scores
        reaction_scores_dict = dict(zip(the_reactions, the_scores))
    #Get the connectivity for each metabolite
    the_metabolites = set()
    for x in reaction_scores_dict:
        the_metabolites.update(x._metabolites)

    metabolite_scores = {}
    metabolite_connections = {}
    #Calculate the score for each metabolite
    for the_metabolite in the_metabolites:
        nonspontaneous_connections = [x for x in the_metabolite._reaction
                                      if x.gene_reaction_rule.lower() not in
                                      ['s0001', '']]
        tmp_score = 0
        number_of_connections = len(nonspontaneous_connections)
        for the_reaction in nonspontaneous_connections:
            if the_reaction not in reaction_scores_dict:
                #Unscored reactions contribute nothing to the sum and, unless
                #the entire network is considered, are not counted either.
                if not entire_network:
                    number_of_connections -= 1
                continue
            tmp_score += reaction_scores_dict[the_reaction]
        metabolite_scores[the_metabolite] = tmp_score
        metabolite_connections[the_metabolite] = number_of_connections

    #NOTE: Doing the corrections based only on the significantly perturbed scores
    #is probably going to underestimate the significance.
    if background_correction:
        correction_dict = {}
        score_pool = array(the_scores)
        for i in set(metabolite_connections.values()):
            #if entire_network # add in a section to deal with the situation where
            #the entire network structure is considered by only have p-values for
            #a limited subset.
            #Basically, what we're doing here is that for each i we select i
            #scores number_of_randomizations times
            the_random_indices = randint.rvs(0, len(score_pool), size=(number_of_randomizations, i))
            random_score_distribution = array([sum(score_pool[x]) for x in the_random_indices]) /i**0.5
            correction_dict[i] = [mean(random_score_distribution),
                                  std(random_score_distribution, ddof=1)]

    #Only values of existing keys are replaced, so iterating over items()
    #while assigning is safe.
    for the_metabolite, the_score in metabolite_scores.items():
        number_of_connections = metabolite_connections[the_metabolite]
        if number_of_connections > 0:
            #Correct based on background distribution
            if background_correction:
                #if the list of scores is only for significant perturbations then the
                #background correction shouldn't be applied because the current sampling
                #method only takes into account the_scores not the entire network.
                #It'd be more accurate to assign unscored reactions a default score.
                the_score = ((the_score / number_of_connections**.5) -
                             correction_dict[number_of_connections][0]) / \
                             correction_dict[number_of_connections][1]
            else:
                the_score = the_score / number_of_connections**.5
            #Update the score
            metabolite_scores[the_metabolite] = the_score

    return_dictionary = {'scores': metabolite_scores,
                         'connections': metabolite_connections}
    if background_correction:
        return_dictionary['corrections'] = correction_dict

    return return_dictionary

Exemplo n.º 40
 def _fit(self, X, y):
     # Stores in `self.majority_` the value that occurs most often among ten
     # randomly drawn integers in [0, len(y)).  `X` is not read, and `y` is
     # only used for its length.
     # NOTE(review): the stored value is the most frequently drawn *index*,
     # not an entry of `y` -- confirm this is the intended behaviour.
     from scipy.stats import randint
     # ten draws; the upper bound len(y) is exclusive
     randidx = randint.rvs(0, len(y), size=10)
     # counts[v] = how many times the integer v was drawn
     counts = np.bincount(randidx)
     # argmax returns the smallest value among ties
     self.majority_ = np.argmax(counts)