Example #1
# assumed imports for this snippet (not shown in the original excerpt):
from sklearn.neural_network import MLPClassifier
from scipy.stats import randint as sp_randint, expon as sp_expon


def train_MLP(min_size=5, max_size=100):
    model = MLPClassifier(max_iter=100)
    # note: this first grid is immediately replaced by the randomized grid below
    param_grid = {
        'alpha': [1],
        'max_iter': [1000],
        'solver': ['adam'],
        'activation': ['relu']
    }
    # candidate layer sizes are sampled once here; alpha and learning_rate_init stay as scipy
    # distributions so a randomized search can keep sampling them
    param_grid = {
        'hidden_layer_sizes': [
            (sp_randint.rvs(min_size, max_size), sp_randint.rvs(min_size, max_size),
             sp_randint.rvs(min_size, max_size), sp_randint.rvs(min_size, max_size)),
            (sp_randint.rvs(min_size, max_size), sp_randint.rvs(min_size, max_size),
             sp_randint.rvs(min_size, max_size)),
            (sp_randint.rvs(min_size, max_size), sp_randint.rvs(min_size, max_size)),
            (sp_randint.rvs(min_size, max_size), ),
        ],
        'activation': ['tanh', 'relu'],
        'solver': ['sgd', 'adam', 'lbfgs'],
        'alpha': sp_expon(scale=.01),
        'learning_rate': ['constant', 'adaptive'],
        'learning_rate_init': sp_expon(scale=.001),
    }
    return model, param_grid
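A minimal usage sketch (not part of the original snippet): assuming sp_randint and sp_expon alias scipy.stats.randint and scipy.stats.expon, the returned pair can feed RandomizedSearchCV, which keeps sampling the sp_expon entries and picks uniformly from the fixed lists. The toy dataset is only for illustration.

from sklearn.datasets import make_classification
from sklearn.model_selection import RandomizedSearchCV

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
model, param_grid = train_MLP()
search = RandomizedSearchCV(model, param_grid, n_iter=5, cv=3, random_state=0)
search.fit(X, y)
print(search.best_params_)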
def die_roll(loot_count):
    # roll two dice (scipy's randint upper bound is exclusive, so each die shows 1-3)
    roll = randint.rvs(1, 4) + randint.rvs(1, 4)

    # reduce the roll by the amount of loot carried, but not below zero
    roll = roll - loot_count
    if roll < 0:
        roll = 0

    # return the modified roll
    return roll
Example #3
 def draw_generation(self):
     self.draws = []
     for i in range(self.times):
         draw_dict = {}
         draw_dict['beta_draw'] = uniform.rvs(0,1,500)
         draw_dict['bernoulli_draw'] = uniform.rvs(0,1,[500,1000])
         draw_dict['uniform_draw'] = uniform.rvs(0,1,500)
         draw_dict['host_draw'] = uniform.rvs(0,1,1000)
         draw_dict['size_state'] = randint.rvs(0,2000)
         draw_dict['initial_state'] = randint.rvs(0,2000)
         self.draws.append(draw_dict)      
Example #4
def r_funct(current_key,str_values):
	df_train = pd.DataFrame.from_records(str_values,columns = df_columns)	
	'''
	#0. convert to proper dtypes
	for col,coltype in data_type_dict.iteritems():
		if coltype=='int64':
			df_train[col] = df_train[col].astype(int)
		if coltype=='float64':
			df_train[col] = df_train[col].astype(float)
	'''
	#1. remove constant columns
	remove = []
	for col in df_train.columns:
		if df_train[col].std() == 0:
			remove.append(col)
	df_train = df_train.drop(remove, axis=1)
	
	#2. remove duplicated columns
	remove = []
	c = df_train.columns
	for i in range(len(c)-1):
		v = df_train[c[i]].values
		for j in range(i+1,len(c)):
			if np.array_equal(v,df_train[c[j]].values):
				remove.append(c[j])
	df_train = df_train.drop(remove, axis=1)

	#REMOVE UNWANTED COLUMNS
	y_train = df_train['TARGET'].values
	X_train = df_train.drop(['ID','TARGET'], axis=1).values

	# params for this gradient boosting classifier
	len_train = len(X_train)
	learning_rate=random.choice([1,.5,.3,.2,.1,.03,.05,.01,.005,.001,.0005,.0001,.00001])
	n_estimators=sp_randint.rvs(100, 5000)
	subsample=random.choice([1,.95,.85,.90,.8])
	min_samples_split=sp_randint.rvs(2, 11)
	min_samples_leaf=sp_randint.rvs(1, 11)
	max_depth=sp_randint.rvs(2, 20)
	min_weight_fraction_leaf=0


	# k-fold cross-validation on the training data using gradient boosting
	clf = GradientBoostingClassifier(learning_rate=learning_rate,n_estimators=n_estimators,subsample=subsample,min_samples_split=min_samples_split,min_samples_leaf=min_samples_leaf,max_depth=max_depth)
	k_fold = cross_validation.KFold(len_train, 5)
	auc_scores_list = []
	for k, (train, test) in enumerate(k_fold):
		clf.fit(X_train[train], y_train[train])
		auc_scr = auc_score(y_train[test], clf.predict_proba(X_train[test])[:,1])
		auc_scores_list.append(auc_scr)
	mean = np.mean(auc_scores_list)
	std = np.std(auc_scores_list)

	print "GBT:learning_rate:%s,n_estimators:%s,subsample:%s,min_samples_split:%s,min_samples_leaf:%s,min_weight_fraction_leaf:%s,max_depth:%s,mean:%s,std:%s" %(learning_rate,n_estimators,subsample,min_samples_split,min_samples_leaf,min_weight_fraction_leaf,max_depth,mean,std)
Example #5
 def __benchmarkfun(self, talent, benchmarksize):
     R1 = randint.rvs(1, 20, size=benchmarksize)  # note: the upper bound is exclusive, so rolls are 1-19
     R2 = randint.rvs(1, 20, size=benchmarksize)
     R3 = randint.rvs(1, 20, size=benchmarksize)
     R = zip(R1, R2, R3)
     testcount = 1
     testresults = []
     for throw in R:
         testcount += 1
         testresults.append(self.test_silent(talent, throw[0], throw[1], throw[2]))
     return testresults
Example #6
def supermarket_log(starting_time, finish_time, warehouse,
                    file):  # one day operation
    """
    Simulate one day of restocking and sales in a supermarket. Events follow an exponential
    distribution with an average time between events of 5 minutes. Each time an event occurs, the next event
    (restock or sale) is chosen with a binomial distribution where a sale has probability 0.65 and a restock 0.35.
    When a client buys a product, that product is selected uniformly at random, whereas the quantity is drawn
    from a binomial with n=(max quantity of the chosen product) and p=0.15.
    Restocks happen one at a time; each time a restock is made the product is selected uniformly at random,
    while the quantity to restock is drawn from a binomial with n=(max quantity allowed on shelves) and p=0.65.

    Parameters
    ----------
    starting_time:
        supermarket opening time
    finish_time:
        supermarket closing time
    warehouse:
        Warehouse instance holding our product catalog; needed to know the products and their codes
        in the supermarket
    file:
        file path in which to save the daily log
    """

    log = []
    last_hour = starting_time
    while last_hour < finish_time:  # the loop finishes once the last transaction passes finish_time
        if binom.rvs(1, 0.65):
            product_chosen = list(
                warehouse.products.keys())[randint.rvs(1, 19) - 1]
            last_hour += timedelta(minutes=float(expon.rvs(scale=5, size=1)))
            aux = [
                'venta', last_hour, product_chosen,
                binom.rvs(n=warehouse[product_chosen][0], p=0.15, loc=1)
            ]
            log.append(aux)
        else:
            last_hour += timedelta(minutes=float(expon.rvs(scale=5, size=1)))
            product_chosen = list(warehouse.products.keys())[
                randint.rvs(0, len(warehouse.products) - 1)]
            log.append([
                'repo', last_hour, product_chosen,
                binom.rvs(n=warehouse[product_chosen][0], p=0.65, loc=1)
            ])
    with open(file, 'w') as f:
        text = ""
        for el in log:
            text += el[0] + ' ' + format_date(el[1]) + " " + el[2] + " " + str(
                el[3]) + "\n"
        f.write(text)
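A minimal, self-contained illustration of the distributional building blocks the docstring describes; the shelf capacity of 20 is an assumed placeholder rather than a value from the original project.

from scipy.stats import binom, expon, randint

gap_minutes = float(expon.rvs(scale=5, size=1))   # time to the next event, 5 minutes on average
is_sale = bool(binom.rvs(1, 0.65))                # sale with probability 0.65, restock otherwise
max_quantity = 20                                 # assumed shelf capacity for this illustration
quantity = binom.rvs(n=max_quantity, p=0.15 if is_sale else 0.65, loc=1)
product_index = randint.rvs(0, 19)                # uniform product pick (upper bound exclusive)
print(gap_minutes, is_sale, quantity, product_index)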
Example #7
 def rvs(self):
     if not self.size:
         self.size = randint.rvs(low = self.min_size, high = self.max_size, size = 1)
     if self.scale:
         return expon.rvs(loc = self.loc * 0.09, scale = self.scale, size = self.size)
     else:
         return expon.rvs(loc = self.loc * 0.09, scale = self.loc * 8.0, size = self.size)
def metadata_filename():
    columns = [
        "thermostat_id",
        "equipment_type",
        "zipcode",
        "utc_offset",
        "interval_data_filename",
    ]

    n_thermostats = 100
    thermostat_ids = [uuid4() for i in range(n_thermostats)]
    equipment_types = randint.rvs(0, 6, size=n_thermostats)
    zipcodes = [
        "70754", "70722", "70726", "70449", "70442", # "722312" 50
        "70443", "70441", "70446", "70447", "70444", # "722312"
        "70836", "70778", "70770", "70774", "70777", # "722312"
        "70433", "70437", "70436", "70435", "70438", # "722312"
        "70744", "70748", "70462", "70465", "70466", # "722312"
        "70791", "70714", "70711", "70451", "70450", # "722312"
        "70453", "70455", "70454", "70456", "70809", # "722312"
        "70806", "70807", "70805", "70769", "70761", # "722312"
        "70402", "70403", "70401", "70737", "70730", # "722312"
        "70733", "70739", "70785", "70789", "70706", # "722312"
        "45341", "45344", "45349", "45319", "45434", # "745700" 55
        "60018", "60191", "60193", "60195", "60194", # "725300" 60
        "97473", "97449", "97493", "97467", "97459", # "726917" 65
        "60421", "60544", "60404", "60408", "60481", # "725345" 70
        "36590", "36564", "36606", "36605", "36532", # "722235" 75
        "36541", "36544", "36568", "36608", "36609", # "722230" 80
        "23106", "23060", "23229", "23222", "23294", # "724029" 85
        "13674", "13601", "13606", "13605", "13682", # "726227" 90
        "12978", "12972", "12985", "12903", "12901", # "726225" 95
        "61051", # "725326" 96
        "76207", # "722589" 97
        "36362", # "722239" 98
        "57233", # "726546" 99
        "56289", # "726547" 100
    ]
    utc_offsets = [-7 for _ in range(n_thermostats)]
    interval_data_filenames = ["thermostat_{}.csv".format(i) for i in thermostat_ids]

    df = pd.DataFrame({
        "thermostat_id": thermostat_ids,
        "equipment_type": equipment_types,
        "zipcode": zipcodes,
        "utc_offset": utc_offsets,
        "interval_data_filename": interval_data_filenames,
    }, columns=columns)


    temp_dir = tempfile.mkdtemp()
    metadata_filename = os.path.join(temp_dir, "metadata.csv")
    df.to_csv(metadata_filename, index=False)

    for interval_data_filename in df.interval_data_filename:
        fname = os.path.join(temp_dir, interval_data_filename)
        with open(fname, 'w') as f :
            f.write("INTERVAL DATA FILE CONTENT")

    return metadata_filename
Example #9
 def draw(self, K = 10, N = 1*10**5, m = 3, gaussian = False):
     
     if self.seed is not None:
         np.random.seed(self.seed)
  
     alphas = gamma.rvs(5, size=m)               # shape parameter
     #print(sum(alphas))                              # equivalent sample size
     self.p = dirichlet.rvs(alpha = alphas, size = 1)[0]
     self.phi_is = multinomial.rvs(1, self.p, size=N)       # draw from categorical p.m.f
     
     self.x_draws = np.zeros((N,K))
     self.hyper_loc, self.hyper_scale, self.thetas, self.var, self.covs, self.rdraws = dict(), dict(), dict(), tuple(), tuple(), tuple()
     
     for i in range(m):
     
           self.hyper_loc["mean"+str(i+1)] = norm.rvs(size = 1, loc = 0, scale = 5)
           self.hyper_scale["scale"+str(i+1)] = 1/gamma.rvs(5, size=1)
           
           self.thetas["mean"+str(i+1)] = norm.rvs(size = K, loc = self.hyper_loc["mean"+str(i+1)], 
                       scale = self.hyper_scale["scale"+str(i+1)])
           self.thetas["Sigma"+str(i+1)] = np.eye(K)*(1/gamma.rvs(5, size=K))
           self.thetas["nu"+str(i+1)] = randint.rvs(K+2, K+10, size=1)[0]
     
           if gaussian:
              self.covs += (self.thetas['Sigma'+str(i+1)], )
           else:
              self.covs += (wishart.rvs(df = self.thetas['nu'+str(i+1)], scale = self.thetas['Sigma'+str(i+1)], size=1),)
              self.var += (self.thetas["nu"+str(i+1)]/(self.thetas["nu"+str(i+1)]-2)*self.covs[i],)       # variance covariance matrix of first Student-t component
           self.rdraws += (np.random.multivariate_normal(self.thetas["mean"+str(i+1)], self.covs[i], N),)
     
           self.Phi = np.tile(self.phi_is[:,i], K).reshape(K,N).T              # repeat phi vector to match with random matrix
           self.x_draws += np.multiply(self.Phi, self.rdraws[i])                
     return self.x_draws
Example #10
def death_drop(inventory):
    # create a vector of zeroes
    inventory_mask = np.zeros(len(inventory))

    # select a token at random
    inventory_mask[randint.rvs(0, len(inventory))] = 1

    return inventory_mask
Example #11
def main(N, fl):
    X = randint.rvs(2, 65536, size=N)
    print(X)
    #fl = "SeqS.in"
    fd = open(fl, 'w')
    for x in X:
        fd.write(f'{x}\n')
    fd.close()
Example #12
    def draw(self, K=10, N=1 * 10**5, m=3, gaussian=False):
        """
        Inputs:
        -------
        N: sample size
        K: Dimension of Normal/Student distr.
        m: number of mixture components
        """
        np.random.seed(self.seed)
        self.st0 = np.random.get_state()  # get initial state of RNG
        #np.random.set_state(self.st0)
        print("Drawing from", m, "component mixture distribution.")
        alphas = gamma.rvs(5, size=m)  # shape parameter
        #print(sum(alphas))                              # equivalent sample size
        self.p = dirichlet.rvs(alpha=alphas, size=1)[0]
        self.phi_is = multinomial.rvs(1, self.p,
                                      size=N)  # draw from categorical p.m.f

        self.x_draws = np.zeros((N, K))
        self.hyper_loc, self.hyper_scale, self.thetas = dict(), dict(), dict()
        self.var, self.covs, self.rdraws = tuple(), tuple(), tuple()

        for i in range(m):

            self.hyper_loc["mean" + str(i + 1)] = norm.rvs(size=1,
                                                           loc=0,
                                                           scale=5)
            self.hyper_scale["scale" + str(i + 1)] = 1 / gamma.rvs(5, size=1)

            self.thetas["mean" + str(i + 1)] = norm.rvs(
                size=K,
                loc=self.hyper_loc["mean" + str(i + 1)],
                scale=self.hyper_scale["scale" + str(i + 1)])
            self.thetas["Sigma" +
                        str(i + 1)] = np.eye(K) * (1 / gamma.rvs(5, size=K))
            self.thetas["nu" + str(i + 1)] = randint.rvs(K + 2, K + 10,
                                                         size=1)[0]

            if gaussian:
                self.covs += (self.thetas['Sigma' + str(i + 1)], )
            else:
                self.covs += (wishart.rvs(df=self.thetas['nu' + str(i + 1)],
                                          scale=self.thetas['Sigma' +
                                                            str(i + 1)],
                                          size=1), )
                self.var += (
                    self.thetas["nu" + str(i + 1)] /
                    (self.thetas["nu" + str(i + 1)] - 2) * self.covs[i],
                )  # variance covariance matrix of first Student-t component
            self.rdraws += (np.random.multivariate_normal(
                self.thetas["mean" + str(i + 1)], self.covs[i], N), )

            self.Phi = np.tile(self.phi_is[:, i], K).reshape(
                K, N).T  # repeat phi vector to match with random matrix
            self.x_draws += np.multiply(self.Phi, self.rdraws[i])

        return self.x_draws, np.argmax(self.phi_is, 1)  # X, latent
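A stripped-down sketch of the sampling pattern used in draw above (Dirichlet weights, one-hot multinomial component assignments, then per-component draws), with assumed toy sizes and means; it is independent of the class and only illustrates the pattern.

from scipy.stats import dirichlet, gamma, multinomial, norm

m, K, N = 3, 2, 5                                            # components, dimension, sample size (toy values)
alphas = gamma.rvs(5, size=m, random_state=0)                # Dirichlet concentration parameters
p = dirichlet.rvs(alpha=alphas, size=1, random_state=0)[0]   # mixture weights
phi = multinomial.rvs(1, p, size=N, random_state=0)          # one-hot component assignments, shape (N, m)
means = norm.rvs(size=(m, K), random_state=1)                # one mean vector per component
x = phi @ means + 0.1 * norm.rvs(size=(N, K), random_state=2)  # component mean plus noise
print(x)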
def compare_models(player_agent_1, player_agent_2, games=1000):
    # initiate counter to track win totals
    wins = np.zeros(2)

    for i in range(games):
        # initiate a new game state
        game = GameState(6)

        # start turn counter; declare a flag for game end
        turn = 0
        continue_game = True

        # randomly assign player_agent_1 and player_agent_2 to even/odd turns
        player_agent_1_turns = randint.rvs(0, 2)

        # take turns until the game is over
        while continue_game:
            # use player_agent_1 on half the turns
            if turn % 2 == player_agent_1_turns:
                # take a turn
                continue_game, turn_taken = take_turn(game,
                                                      player_agent_1)[0:2]

            # use player_agent_2 on the other half
            else:
                # take a turn
                continue_game, turn_taken = take_turn(game,
                                                      player_agent_2)[0:2]

            # skip to next turn if active player is already back at the sub
            if turn_taken == False:
                # update the active player
                if game.active_player < game.players - 1:
                    game.active_player += 1
                else:
                    game.active_player = 0

                # next turn
                continue

            # update the active player
            if game.active_player < game.players - 1:
                game.active_player += 1
            else:
                game.active_player = 0

            # increment turn counter
            turn += 1

        # document the winner
        if np.argmax(game.player_scores) % 2 == player_agent_1_turns:
            wins[0] += 1
        else:
            wins[1] += 1

    # return the win totals
    return wins
Example #14
 def test_gaussiankde_arguments(self):
     size = 1000
     low = 0
     high = 9
     data = randint.rvs(low, high, size=size) + norm.rvs(0, 0.1, size=size)
     dist = GaussianMultivariate(distribution=GaussianKDE(bw_method=0.01))
     dist.fit(data)
     samples = dist.sample(size).to_numpy()[0]
     d, p = ks_2samp(data, samples)
     assert p >= 0.05
Example #15
 def get_seed_value(self, new_seed=False):
     # returns the seed value specified in the text widget
     # if that value is zero, fall back to the seed stored in self.seed_value,
     # generating a new seed first when new_seed is True
     value = self.seed_text_widget.value
     if value == 0:
         if new_seed:
             self.seed_value = randint.rvs(100000, 999999)
         value = self.seed_value
     return value
    def treatment_effect(self, X=None, y=None, t=None):
        self.X_ = X
        self.y_ = y
        self.t_ = t

        fold_seeds = randint.rvs(0, 1000, size=self.n_splits,
                                 random_state=0).tolist()
        treatment_effect = self._dml_estimation(fold_seeds)
        self._is_estimated = True

        return treatment_effect
def pick_one_numbers_uniformly(low, high):
    """
    Retourne un entier précisé par size entre low et high
    :param int low: plus pétit élément probable
    :param int high: plus grand élément probable
    :param int length:  taille de l'échantillon
    :return int item:
    """
    item = low
    if low != high:
        item = list(randint.rvs(low, high, size=1))[0]
    return item
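Hypothetical usage (assuming from scipy.stats import randint is in scope for the function above), showing that the upper bound is excluded:

values = {pick_one_numbers_uniformly(1, 4) for _ in range(200)}
print(sorted(values))  # only 1, 2 and 3 can appear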
 def rvs(self):
     if not self.size:
         self.size = randint.rvs(low=self.min_size,
                                 high=self.max_size,
                                 size=1)
     if self.scale:
         return expon.rvs(loc=self.loc * 0.09,
                          scale=self.scale,
                          size=self.size)
     else:
         return expon.rvs(loc=self.loc * 0.09,
                          scale=self.loc * 8.0,
                          size=self.size)
Example #19
File: fit.py Project: antcc/proyecto-aa
    def rvs(self, random_state=42):
        if len(self.seen) < self.high - self.low - 1:
            while True:
                sample = randint.rvs(self.low,
                                     self.high,
                                     size=1,
                                     random_state=random_state)[0]

                if sample not in self.seen:
                    self.seen.append(sample)
                    return self.size * (sample, )

        return self.size * (0, )
Example #20
    def wild_bootstrap(self, beta_null, var):
        # beta_null is the null hypothesis for var
        X1 = self.df[var].to_numpy()
        # get a list of all variables that are not rel time 1 dummy
        Xvars_no1 = [v for v in self.xvars if v != var[0]]
        # perform a regression without rel time 1 dummy
        Y1 = self.Y - beta_null * X1
        Xno1 = self.df[Xvars_no1].to_numpy()
        beta1 = np.linalg.solve(Xno1.T.dot(Xno1), Xno1.T.dot(Y1))
        # use the beta to construct the Us
        U = Y1 - np.dot(Xno1, beta1)
        # bootstrap the Us
        rand_sign = 2 * randint.rvs(0, 2, size=self.N).reshape(self.N, 1) - 1
        newU = np.multiply(U, rand_sign)
        # construct the wild Y
        Ywild = np.dot(Xno1, beta1) + X1 * beta_null + newU
        # get the new beta from the wild Y
        beta_wild = np.dot(self.XpXi, np.dot(self.X.T, Ywild))

        error = Ywild - np.dot(self.X, beta_wild)
        # and now the cluster-robust std error, following a similar procedure
        # as above
        clustervars = ['index']
        newdf = copy.copy(self.df)
        predictedX = np.sum(np.multiply(self.theta.T, self.X), 1).to_frame()
        predictedX = predictedX.rename(columns={0: "Yhat"})
        # add the residual to the dataframe
        df_withresid = newdf
        newdf['e'] = error

        # .assign(e = lambda x: x[self.yvar] - \
        # 	predictedX.Yhat)
        df_withresid[clustervars] = self.df[clustervars]
        # group by the cluster
        groups = df_withresid.groupby(clustervars)
        G = len(groups)
        robust_sum = 0
        # cycle through each cluster and create cluster-specific "meat"
        for key, item in groups:
            Xgroup = item[self.xvars].to_numpy()
            egroup = item['e'].to_numpy()
            egroup = egroup.reshape(len(egroup), 1)
            cluster_sum = np.matmul(np.matmul(np.matmul(Xgroup.T, egroup), \
             egroup.T), Xgroup)
            robust_sum += cluster_sum
        # correct for degrees of freedom
        deg_freedom = (G / (G - 1)) * ((self.N - 1) / (self.N - self.k))
        # sandwich together with the bread defined in the class initialization
        V = deg_freedom * np.matmul(np.matmul(self.XpXi.T, robust_sum), \
         self.XpXi)
        return np.sqrt(np.diag(V))
Example #21
    def singlevisualize(self,result):
        pylab.rcParams['figure.figsize'] = (30.0, 20.0)
        for k in result.keys():
            avo=plt.subplot(131)
            fr=plt.subplot(132)
            con=plt.subplot(133)
           
            avo.plot(self.keywords[k],[x['AverageOpinion'] for x in result[k]],label='AverageOpinion',marker='o')
            avo.plot(self.keywords[k],[x['parameter']['bm'] for x in result[k]],label='host-m',marker='1')
            avo.plot(self.keywords[k],[x['parameter']['bl'] for x in result[k]],label='host-l',marker='2')
            avo.plot(self.keywords[k],[x['AverageHost'] for x in result[k]],label='host-ave',marker='3')
            
            con.plot(self.keywords[k],[x['ConversionRatio'][0] for x in result[k]],label='PosToNeg',marker='2')
            con.plot(self.keywords[k],[x['ConversionRatio'][1] for x in result[k]],label='NegToPos',marker='1')
            
            for i in result[k]:
                label=self.conversion[k]+'='+str(round(i['parameter'][k],2))
                xaxis=[(i['FinalRatio'][1][x]+i['FinalRatio'][1][x+1])/2 for x in range(0,20)]
                yaxis=[y/(500*self.times) for y in i['FinalRatio'][0]]
                fr.plot(xaxis,yaxis,label=label,marker=randint.rvs(0,10))
                #tempdf=pd.DataFrame({legend:i['FinalRatio']})
                #sns.kdeplot(tempdf[legend],ax=fr,bw=0.05)
                
            avo.set_xlim([min(self.keywords[k]),max(self.keywords[k])])
            avo.set_ylim([-1,1])
            fr.set_xlim([-1,1])
            #fr.set_ylim([0,1])
            con.set_xlim([min(self.keywords[k]),max(self.keywords[k])])
            con.set_ylim([0,1])
            
            avo.set_xlabel(self.conversion[k])
            avo.set_ylabel('Average Opinion')
            fr.set_xlabel('Opinion Value')
            fr.set_ylabel('Final Ratio')
            con.set_xlabel(self.conversion[k])
            con.set_ylabel('Conversion Ratio')
            
            avo.legend()
            fr.legend()
            con.legend()
            
            plt.show()
            

            result[k]=pool.map(self.replication,parameterpool[k])
            
        return result#result is a dictionary
Example #22
 def flow(self, member, queue_node):
     prob_staying_general = 1 - Network.probability_of_leaving
     prob_stay_level = (1 - self.proportion_leave[member.level])*prob_staying_general   # Added prob of leaving per level
     stay = bernoulli.rvs(prob_stay_level)
     if member.level == 8:   # There's also probability they will leave
         #self.exit(member)
         return
     elif not stay:
         #self.exit(member)
         return
     else:
         next_level = member.level + 1
         # Determine the next node to visit
         queue_choice = None
         min_cost = 0
         wait_cost = 0
         service_cost = 0
         if self.asn_policy == 'Deterministic Wait':
             edges = queue_node.outgoing_edges
             min_cost = edges[0].get_wait_cost(member) + 1   # At the least, queue 0 has lower cost
             for q in range(0, len(edges)):
                 if min_cost > edges[q].get_wait_cost(member):
                     min_cost = edges[q].get_wait_cost(member)
                     queue_choice = edges[q].exit
                     wait_cost = min_cost
                     service_cost = edges[q].get_service_cost(member)
         if self.asn_policy == 'Deterministic Service':
             edges = queue_node.outgoing_edges
             min_cost = edges[0].get_service_cost(member) + 1   # At the least, queue 0 has lower cost
             for q in range(0, len(edges)):
                 if min_cost > edges[q].get_service_cost(member):
                     min_cost = edges[q].get_service_cost(member)
                     queue_choice = edges[q].exit
                     service_cost = min_cost
                     wait_cost = edges[q].get_wait_cost(member)
         if self.asn_policy == 'Uniform':
             choice_total = len(self.network[next_level])
             queue_index = randint.rvs(1, choice_total, size=1)[0]
             queue_choice = self.network[next_level][queue_index]
         yield self.env.process(member.request(queue_choice, queue_node, wait_cost, service_cost, self.env))
         # if we are still in the network, schedule the next hop
         # (flow is a generator, so it must be wrapped in env.process to actually run)
         self.env.process(self.flow(member, queue_choice))
Example #23
    def push(self, member):
        arriving_level = member.level
        # For now, we choose any node in level N+1
        if member.level == 8:
            #self.exit(member)
            return
        p = 1 - self.proportion_leave[arriving_level]   # Probability they are assessed and leave
        stay = bernoulli.rvs(p)
        if not stay:
            #self.exit(member)
            return
        arrival_node = self.network[arriving_level][0]  # Arriving node
        queue_choice = None
        wait_cost = 0
        service_cost = 0
        min_cost = 0
        if self.asn_policy == 'Deterministic Wait':
            edges = arrival_node.edges
            min_cost = edges[0].get_wait_cost(member) + 1   # At the least, queue 0 has lower cost
            for q in range(0, len(edges)):
                if min_cost > edges[q].get_wait_cost(member):
                    min_cost = edges[q].get_wait_cost(member)
                    wait_cost = min_cost
                    service_cost = edges[q].get_service_cost(member)
                    queue_choice = edges[q].exit
        if self.asn_policy == 'Deterministic Service':
            edges = arrival_node.edges
            min_cost = edges[0].get_service_cost(member) + 1   # At the least, queue 0 has lower cost
            for q in range(0, len(edges)):
                if min_cost > edges[q].get_service_cost(member):
                    min_cost = edges[q].get_service_cost(member)
                    queue_choice = edges[q].exit
                    wait_cost = edges[q].get_wait_cost(member)
                    service_cost = min_cost
        if self.asn_policy == 'Uniform':
            choice_total = len(self.network[arriving_level+1])
            queue_index = randint.rvs(1, choice_total, size=1)[0]
            queue_choice = self.network[arriving_level+1][queue_index]

        yield self.env.process(member.request(queue_choice, arrival_node, wait_cost, service_cost, self.env))
        self.env.process(self.flow(member, queue_choice))
Example #24
 def tree_sim(self, cur_state, action):
     if cur_state is self.death:
         if action is self.cut:
             next_state = self.sappling_height
             reward = -self.replanting_cost
         else:
             next_state = self.death
             reward = 0
     else:
         if action is self.cut:
             next_state = self.sappling_height
             reward = self.linear_wood_value * cur_state - self.replanting_cost
         else:
             tree_is_dying = bernoulli.rvs(self.proba_of_dying)
             if tree_is_dying:
                 next_state = self.death
                 reward = -self.maintenance_cost
             else:
                 next_state = randint.rvs(cur_state, self.max_height + 1)
                 reward = -self.maintenance_cost
     return next_state, reward
Example #25
def compare_models(model_set_1, model_set_2, games=1000, noise=0.1):
    # initiate counter to track win totals
    wins = np.zeros(2)

    for i in range(games):
        # initiate a new game state
        game = GameState(6)

        # start turn counter; declare a flag for game end
        turn = 0
        continue_game = True

        # randomly assign model_set_1 and model_set_2 to even/odd turns
        model_set_1_turns = randint.rvs(0, 2)  # 0 or 1 (the upper bound is exclusive)

        # take turns until the game is over
        while continue_game:
            # use model_set_1 on half the turns
            if turn % 2 == model_set_1_turns:
                # take a turn
                continue_game, turn_taken, turn_around, pick_up, drop = take_turn(
                    game, *model_set_1, noise)

            # use model_set_2 on the other half
            else:
                # take a turn
                continue_game, turn_taken, turn_around, pick_up, drop = take_turn(
                    game, *model_set_2, noise)

            # increment turn counter
            turn += 1

        # document the winner
        if np.argmax(game.player_scores) % 2 == model_set_1_turns:
            wins[0] += 1
        else:
            wins[1] += 1

    # return the win totals
    return wins
Example #26
    def drop_decision(self, gamestate):
        # decide if the model output or a random guess will be used
        if uniform.rvs(0, 1) <= self.epsilon:
            # no drop if inventory is empty
            if sum(gamestate[1:33] != -1) == 0:
                drop = 0
            else:
                # randomly decide whether to drop a token from those available
                drop = randint.rvs(0, sum(gamestate[1:33] != -1) + 1)
        else:
            # generate a Q-table for the current gamestate
            selected_action = self.pick_up_model.predict(
                np.reshape(gamestate, (1, gamestate.shape[0])))

            # take the action with the highest Q-value
            drop = np.argmax(selected_action[0:(sum(gamestate[1:33] != -1) +
                                                1)])
            drop = int(drop)

        # return the decision as an integer
        # 1-33 mean drop the corresponding item
        # 0 means no drop
        return drop
])

preprocessor = ColumnTransformer([
    ('numeric_transformer', numeric_transformer, numeric_features),
    ('categorical_transformer', categorical_transformer, categorical_features)
])

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', classifier)
])

param_distributions = {
                        "classifier__learning_rate": uniform.rvs(0.0001, 0.1, size=x_rscv_n_iter),
                        "classifier__gamma" : uniform.rvs(0, 2, size=x_rscv_n_iter),
                        "classifier__max_depth": randint.rvs(2, 100, size=x_rscv_n_iter),
                        "classifier__colsample_bytree": uniform.rvs(0.1, 0.9, size=x_rscv_n_iter),
                        "classifier__subsample": uniform.rvs(0.1, 0.9, size=x_rscv_n_iter),
                        "classifier__reg_alpha": uniform.rvs(0, 0.9, size=x_rscv_n_iter),
                        "classifier__reg_lambda": uniform.rvs(0.0001, 5, size=x_rscv_n_iter),
                        "classifier__min_child_weight": randint.rvs(1, 7, size=x_rscv_n_iter),
                        "classifier__n_estimators": randint.rvs(100, 1000, size=x_rscv_n_iter)
                      }

search = RandomizedSearchCV(
    pipeline, param_distributions=param_distributions, n_iter=x_rscv_n_iter, scoring={'recall_score': recall_scorer, 'f1_score': f1_scorer}, 
    n_jobs=-1, cv=x_rscv, random_state=x_random_state, refit='f1_score', return_train_score=True)

search = search.fit(X, y)

print(datetime.now()," [3/4] Algorithmus hat zu Ende berechnet")
Example #28
def get_fake_output_df(n_columns):
    columns = [
        'sw_version',

        'ct_identifier',
        'equipment_type',
        'heating_or_cooling',
        'station',
        'zipcode',
        'climate_zone',

        'start_date',
        'end_date',

        'n_days_in_inputfile_date_range',
        'n_days_both_heating_and_cooling',
        'n_days_insufficient_data',
        'n_core_cooling_days',
        'n_core_heating_days',

        'baseline_percentile_core_cooling_comfort_temperature',
        'baseline_percentile_core_heating_comfort_temperature',
        'regional_average_baseline_cooling_comfort_temperature',
        'regional_average_baseline_heating_comfort_temperature',

        'percent_savings_baseline_percentile',
        'avoided_daily_mean_core_day_runtime_baseline_percentile',
        'avoided_total_core_day_runtime_baseline_percentile',
        'baseline_daily_mean_core_day_runtime_baseline_percentile',
        'baseline_total_core_day_runtime_baseline_percentile',
        '_daily_mean_core_day_demand_baseline_baseline_percentile',
        'percent_savings_baseline_regional',
        'avoided_daily_mean_core_day_runtime_baseline_regional',
        'avoided_total_core_day_runtime_baseline_regional',
        'baseline_daily_mean_core_day_runtime_baseline_regional',
        'baseline_total_core_day_runtime_baseline_regional',
        '_daily_mean_core_day_demand_baseline_baseline_regional',
        'mean_demand',
        'alpha',
        'tau',
        'mean_sq_err',
        'root_mean_sq_err',
        'cv_root_mean_sq_err',
        'mean_abs_err',
        'mean_abs_pct_err',

        'total_core_cooling_runtime',
        'total_core_heating_runtime',
        'total_auxiliary_heating_core_day_runtime',
        'total_emergency_heating_core_day_runtime',

        'daily_mean_core_cooling_runtime',
        'daily_mean_core_heating_runtime',

        'core_cooling_days_mean_indoor_temperature',
        'core_cooling_days_mean_outdoor_temperature',
        'core_heating_days_mean_indoor_temperature',
        'core_heating_days_mean_outdoor_temperature',
        'core_mean_indoor_temperature',
        'core_mean_outdoor_temperature',

        'rhu1_aux_duty_cycle',
        'rhu1_emg_duty_cycle',
        'rhu1_compressor_duty_cycle',

        'rhu1_00F_to_05F',
        'rhu1_05F_to_10F',
        'rhu1_10F_to_15F',
        'rhu1_15F_to_20F',
        'rhu1_20F_to_25F',
        'rhu1_25F_to_30F',
        'rhu1_30F_to_35F',
        'rhu1_35F_to_40F',
        'rhu1_40F_to_45F',
        'rhu1_45F_to_50F',
        'rhu1_50F_to_55F',
        'rhu1_55F_to_60F',

        'rhu1_less10F',
        'rhu1_10F_to_20F',
        'rhu1_20F_to_30F',
        'rhu1_30F_to_40F',
        'rhu1_40F_to_50F',
        'rhu1_50F_to_60F',

        'rhu1_00F_to_05F_aux_duty_cycle',
        'rhu1_05F_to_10F_aux_duty_cycle',
        'rhu1_10F_to_15F_aux_duty_cycle',
        'rhu1_15F_to_20F_aux_duty_cycle',
        'rhu1_20F_to_25F_aux_duty_cycle',
        'rhu1_25F_to_30F_aux_duty_cycle',
        'rhu1_30F_to_35F_aux_duty_cycle',
        'rhu1_35F_to_40F_aux_duty_cycle',
        'rhu1_40F_to_45F_aux_duty_cycle',
        'rhu1_45F_to_50F_aux_duty_cycle',
        'rhu1_50F_to_55F_aux_duty_cycle',
        'rhu1_55F_to_60F_aux_duty_cycle',

        'rhu1_less10F_aux_duty_cycle',
        'rhu1_10F_to_20F_aux_duty_cycle',
        'rhu1_20F_to_30F_aux_duty_cycle',
        'rhu1_30F_to_40F_aux_duty_cycle',
        'rhu1_40F_to_50F_aux_duty_cycle',
        'rhu1_50F_to_60F_aux_duty_cycle',

        'rhu1_00F_to_05F_emg_duty_cycle',
        'rhu1_05F_to_10F_emg_duty_cycle',
        'rhu1_10F_to_15F_emg_duty_cycle',
        'rhu1_15F_to_20F_emg_duty_cycle',
        'rhu1_20F_to_25F_emg_duty_cycle',
        'rhu1_25F_to_30F_emg_duty_cycle',
        'rhu1_30F_to_35F_emg_duty_cycle',
        'rhu1_35F_to_40F_emg_duty_cycle',
        'rhu1_40F_to_45F_emg_duty_cycle',
        'rhu1_45F_to_50F_emg_duty_cycle',
        'rhu1_50F_to_55F_emg_duty_cycle',
        'rhu1_55F_to_60F_emg_duty_cycle',

        'rhu1_less10F_emg_duty_cycle',
        'rhu1_10F_to_20F_emg_duty_cycle',
        'rhu1_20F_to_30F_emg_duty_cycle',
        'rhu1_30F_to_40F_emg_duty_cycle',
        'rhu1_40F_to_50F_emg_duty_cycle',
        'rhu1_50F_to_60F_emg_duty_cycle',

        'rhu1_00F_to_05F_compressor_duty_cycle',
        'rhu1_05F_to_10F_compressor_duty_cycle',
        'rhu1_10F_to_15F_compressor_duty_cycle',
        'rhu1_15F_to_20F_compressor_duty_cycle',
        'rhu1_20F_to_25F_compressor_duty_cycle',
        'rhu1_25F_to_30F_compressor_duty_cycle',
        'rhu1_30F_to_35F_compressor_duty_cycle',
        'rhu1_35F_to_40F_compressor_duty_cycle',
        'rhu1_40F_to_45F_compressor_duty_cycle',
        'rhu1_45F_to_50F_compressor_duty_cycle',
        'rhu1_50F_to_55F_compressor_duty_cycle',
        'rhu1_55F_to_60F_compressor_duty_cycle',

        'rhu1_less10F_compressor_duty_cycle',
        'rhu1_10F_to_20F_compressor_duty_cycle',
        'rhu1_20F_to_30F_compressor_duty_cycle',
        'rhu1_30F_to_40F_compressor_duty_cycle',
        'rhu1_40F_to_50F_compressor_duty_cycle',
        'rhu1_50F_to_60F_compressor_duty_cycle',

        'rhu2_aux_duty_cycle',
        'rhu2_emg_duty_cycle',
        'rhu2_compressor_duty_cycle',

        'rhu2_00F_to_05F',
        'rhu2_05F_to_10F',
        'rhu2_10F_to_15F',
        'rhu2_15F_to_20F',
        'rhu2_20F_to_25F',
        'rhu2_25F_to_30F',
        'rhu2_30F_to_35F',
        'rhu2_35F_to_40F',
        'rhu2_40F_to_45F',
        'rhu2_45F_to_50F',
        'rhu2_50F_to_55F',
        'rhu2_55F_to_60F',

        'rhu2_less10F',
        'rhu2_10F_to_20F',
        'rhu2_20F_to_30F',
        'rhu2_30F_to_40F',
        'rhu2_40F_to_50F',
        'rhu2_50F_to_60F',

        'rhu2_00F_to_05F_aux_duty_cycle',
        'rhu2_05F_to_10F_aux_duty_cycle',
        'rhu2_10F_to_15F_aux_duty_cycle',
        'rhu2_15F_to_20F_aux_duty_cycle',
        'rhu2_20F_to_25F_aux_duty_cycle',
        'rhu2_25F_to_30F_aux_duty_cycle',
        'rhu2_30F_to_35F_aux_duty_cycle',
        'rhu2_35F_to_40F_aux_duty_cycle',
        'rhu2_40F_to_45F_aux_duty_cycle',
        'rhu2_45F_to_50F_aux_duty_cycle',
        'rhu2_50F_to_55F_aux_duty_cycle',
        'rhu2_55F_to_60F_aux_duty_cycle',

        'rhu2_less10F_aux_duty_cycle',
        'rhu2_10F_to_20F_aux_duty_cycle',
        'rhu2_20F_to_30F_aux_duty_cycle',
        'rhu2_30F_to_40F_aux_duty_cycle',
        'rhu2_40F_to_50F_aux_duty_cycle',
        'rhu2_50F_to_60F_aux_duty_cycle',

        'rhu2_00F_to_05F_emg_duty_cycle',
        'rhu2_05F_to_10F_emg_duty_cycle',
        'rhu2_10F_to_15F_emg_duty_cycle',
        'rhu2_15F_to_20F_emg_duty_cycle',
        'rhu2_20F_to_25F_emg_duty_cycle',
        'rhu2_25F_to_30F_emg_duty_cycle',
        'rhu2_30F_to_35F_emg_duty_cycle',
        'rhu2_35F_to_40F_emg_duty_cycle',
        'rhu2_40F_to_45F_emg_duty_cycle',
        'rhu2_45F_to_50F_emg_duty_cycle',
        'rhu2_50F_to_55F_emg_duty_cycle',
        'rhu2_55F_to_60F_emg_duty_cycle',

        'rhu2_less10F_emg_duty_cycle',
        'rhu2_10F_to_20F_emg_duty_cycle',
        'rhu2_20F_to_30F_emg_duty_cycle',
        'rhu2_30F_to_40F_emg_duty_cycle',
        'rhu2_40F_to_50F_emg_duty_cycle',
        'rhu2_50F_to_60F_emg_duty_cycle',

        'rhu2_00F_to_05F_compressor_duty_cycle',
        'rhu2_05F_to_10F_compressor_duty_cycle',
        'rhu2_10F_to_15F_compressor_duty_cycle',
        'rhu2_15F_to_20F_compressor_duty_cycle',
        'rhu2_20F_to_25F_compressor_duty_cycle',
        'rhu2_25F_to_30F_compressor_duty_cycle',
        'rhu2_30F_to_35F_compressor_duty_cycle',
        'rhu2_35F_to_40F_compressor_duty_cycle',
        'rhu2_40F_to_45F_compressor_duty_cycle',
        'rhu2_45F_to_50F_compressor_duty_cycle',
        'rhu2_50F_to_55F_compressor_duty_cycle',
        'rhu2_55F_to_60F_compressor_duty_cycle',

        'rhu2_less10F_compressor_duty_cycle',
        'rhu2_10F_to_20F_compressor_duty_cycle',
        'rhu2_20F_to_30F_compressor_duty_cycle',
        'rhu2_30F_to_40F_compressor_duty_cycle',
        'rhu2_40F_to_50F_compressor_duty_cycle',
        'rhu2_50F_to_60F_compressor_duty_cycle',

    ]

    string_placeholder = ["PLACEHOLDER"] * n_columns
    zero_column = [0 if randint.rvs(0, 30) > 0 else  (None if randint.rvs(0, 2) > 0 else np.inf)
            for i in randint.rvs(0, 1, size=n_columns)]
    one_column = [1 if randint.rvs(0, 30) > 0 else  (None if randint.rvs(0, 2) > 0 else np.inf)
            for i in randint.rvs(0, 1, size=n_columns)]
    float_column = [i if randint.rvs(0, 30) > 0 else (None if randint.rvs(0, 2) > 0 else np.inf)
            for i in norm.rvs(size=n_columns)]
    zipcodes = ["01234", "12345", "23456", "34567", "43210", "54321", "65432", "76543"]
    zipcode_column = [i for i in islice(cycle(zipcodes), None, n_columns)]
    core_day_set_names = ["cooling_2012", "heating_2012-2013", "cooling_2013"]
    core_day_set_name_column = [i for i in islice(cycle(core_day_set_names), None, n_columns)]

    data = {
        'sw_version': string_placeholder,

        'ct_identifier': string_placeholder,
        'equipment_type': string_placeholder,
        'heating_or_cooling': core_day_set_name_column,
        'station': string_placeholder,
        'zipcode': zipcode_column,
        'climate_zone': string_placeholder,

        'start_date': datetime(2011, 1, 1),
        'end_date': datetime(2012, 1, 1),
        'n_days_both_heating_and_cooling': one_column,
        'n_days_in_inputfile_date_range': one_column,
        'n_days_insufficient_data': zero_column,
        'n_core_heating_days': one_column,

        'baseline_percentile_core_cooling_comfort_temperature': float_column,
        'baseline_percentile_core_heating_comfort_temperature': float_column,
        'regional_average_baseline_cooling_comfort_temperature': float_column,
        'regional_average_baseline_heating_comfort_temperature': float_column,

        'percent_savings_baseline_percentile': float_column,
        'avoided_daily_mean_core_day_runtime_baseline_percentile': float_column,
        'avoided_total_core_day_runtime_baseline_percentile': float_column,
        'baseline_daily_mean_core_day_runtime_baseline_percentile': float_column,
        'baseline_total_core_day_runtime_baseline_percentile': float_column,
        '_daily_mean_core_day_demand_baseline_baseline_percentile': float_column,
        'percent_savings_baseline_regional': float_column,
        'avoided_daily_mean_core_day_runtime_baseline_regional': float_column,
        'avoided_total_core_day_runtime_baseline_regional': float_column,
        'baseline_daily_mean_core_day_runtime_baseline_regional': float_column,
        'baseline_total_core_day_runtime_baseline_regional': float_column,
        '_daily_mean_core_day_demand_baseline_baseline_regional': float_column,
        'mean_demand': float_column,
        'alpha': float_column,
        'tau': float_column,
        'mean_sq_err': float_column,
        'root_mean_sq_err': float_column,
        'cv_root_mean_sq_err': float_column,
        'mean_abs_err': float_column,
        'mean_abs_pct_err': float_column,

        'total_core_cooling_runtime': float_column,
        'total_core_heating_runtime': float_column,
        'total_auxiliary_heating_core_day_runtime': float_column,
        'total_emergency_heating_core_day_runtime': float_column,

        'daily_mean_core_cooling_runtime': float_column,
        'daily_mean_core_heating_runtime': float_column,

        'core_cooling_days_mean_indoor_temperature': float_column,
        'core_cooling_days_mean_outdoor_temperature': float_column,
        'core_heating_days_mean_indoor_temperature': float_column,
        'core_heating_days_mean_outdoor_temperature': float_column,
        'core_mean_indoor_temperature': float_column,
        'core_mean_outdoor_temperature': float_column,

        'rhu1_aux_duty_cycle': float_column,
        'rhu1_emg_duty_cycle': float_column,
        'rhu1_compressor_duty_cycle': float_column,

        'rhu1_00F_to_05F': float_column,
        'rhu1_05F_to_10F': float_column,
        'rhu1_10F_to_15F': float_column,
        'rhu1_15F_to_20F': float_column,
        'rhu1_20F_to_25F': float_column,
        'rhu1_25F_to_30F': float_column,
        'rhu1_30F_to_35F': float_column,
        'rhu1_35F_to_40F': float_column,
        'rhu1_40F_to_45F': float_column,
        'rhu1_45F_to_50F': float_column,
        'rhu1_50F_to_55F': float_column,
        'rhu1_55F_to_60F': float_column,

        'rhu1_less10F': float_column,
        'rhu1_10F_to_20F': float_column,
        'rhu1_20F_to_30F': float_column,
        'rhu1_30F_to_40F': float_column,
        'rhu1_40F_to_50F': float_column,
        'rhu1_50F_to_60F': float_column,

        'rhu1_00F_to_05F_aux_duty_cycle': float_column,
        'rhu1_05F_to_10F_aux_duty_cycle': float_column,
        'rhu1_10F_to_15F_aux_duty_cycle': float_column,
        'rhu1_15F_to_20F_aux_duty_cycle': float_column,
        'rhu1_20F_to_25F_aux_duty_cycle': float_column,
        'rhu1_25F_to_30F_aux_duty_cycle': float_column,
        'rhu1_30F_to_35F_aux_duty_cycle': float_column,
        'rhu1_35F_to_40F_aux_duty_cycle': float_column,
        'rhu1_40F_to_45F_aux_duty_cycle': float_column,
        'rhu1_45F_to_50F_aux_duty_cycle': float_column,
        'rhu1_50F_to_55F_aux_duty_cycle': float_column,
        'rhu1_55F_to_60F_aux_duty_cycle': float_column,

        'rhu1_less10F_aux_duty_cycle': float_column,
        'rhu1_10F_to_20F_aux_duty_cycle': float_column,
        'rhu1_20F_to_30F_aux_duty_cycle': float_column,
        'rhu1_30F_to_40F_aux_duty_cycle': float_column,
        'rhu1_40F_to_50F_aux_duty_cycle': float_column,
        'rhu1_50F_to_60F_aux_duty_cycle': float_column,

        'rhu1_00F_to_05F_emg_duty_cycle': float_column,
        'rhu1_05F_to_10F_emg_duty_cycle': float_column,
        'rhu1_10F_to_15F_emg_duty_cycle': float_column,
        'rhu1_15F_to_20F_emg_duty_cycle': float_column,
        'rhu1_20F_to_25F_emg_duty_cycle': float_column,
        'rhu1_25F_to_30F_emg_duty_cycle': float_column,
        'rhu1_30F_to_35F_emg_duty_cycle': float_column,
        'rhu1_35F_to_40F_emg_duty_cycle': float_column,
        'rhu1_40F_to_45F_emg_duty_cycle': float_column,
        'rhu1_45F_to_50F_emg_duty_cycle': float_column,
        'rhu1_50F_to_55F_emg_duty_cycle': float_column,
        'rhu1_55F_to_60F_emg_duty_cycle': float_column,

        'rhu1_less10F_emg_duty_cycle': float_column,
        'rhu1_10F_to_20F_emg_duty_cycle': float_column,
        'rhu1_20F_to_30F_emg_duty_cycle': float_column,
        'rhu1_30F_to_40F_emg_duty_cycle': float_column,
        'rhu1_40F_to_50F_emg_duty_cycle': float_column,
        'rhu1_50F_to_60F_emg_duty_cycle': float_column,

        'rhu1_00F_to_05F_compressor_duty_cycle': float_column,
        'rhu1_05F_to_10F_compressor_duty_cycle': float_column,
        'rhu1_10F_to_15F_compressor_duty_cycle': float_column,
        'rhu1_15F_to_20F_compressor_duty_cycle': float_column,
        'rhu1_20F_to_25F_compressor_duty_cycle': float_column,
        'rhu1_25F_to_30F_compressor_duty_cycle': float_column,
        'rhu1_30F_to_35F_compressor_duty_cycle': float_column,
        'rhu1_35F_to_40F_compressor_duty_cycle': float_column,
        'rhu1_40F_to_45F_compressor_duty_cycle': float_column,
        'rhu1_45F_to_50F_compressor_duty_cycle': float_column,
        'rhu1_50F_to_55F_compressor_duty_cycle': float_column,
        'rhu1_55F_to_60F_compressor_duty_cycle': float_column,

        'rhu1_less10F_compressor_duty_cycle': float_column,
        'rhu1_10F_to_20F_compressor_duty_cycle': float_column,
        'rhu1_20F_to_30F_compressor_duty_cycle': float_column,
        'rhu1_30F_to_40F_compressor_duty_cycle': float_column,
        'rhu1_40F_to_50F_compressor_duty_cycle': float_column,
        'rhu1_50F_to_60F_compressor_duty_cycle': float_column,

        'rhu2_aux_duty_cycle': float_column,
        'rhu2_emg_duty_cycle': float_column,
        'rhu2_compressor_duty_cycle': float_column,

        'rhu2_00F_to_05F': float_column,
        'rhu2_05F_to_10F': float_column,
        'rhu2_10F_to_15F': float_column,
        'rhu2_15F_to_20F': float_column,
        'rhu2_20F_to_25F': float_column,
        'rhu2_25F_to_30F': float_column,
        'rhu2_30F_to_35F': float_column,
        'rhu2_35F_to_40F': float_column,
        'rhu2_40F_to_45F': float_column,
        'rhu2_45F_to_50F': float_column,
        'rhu2_50F_to_55F': float_column,
        'rhu2_55F_to_60F': float_column,

        'rhu2_less10F': float_column,
        'rhu2_10F_to_20F': float_column,
        'rhu2_20F_to_30F': float_column,
        'rhu2_30F_to_40F': float_column,
        'rhu2_40F_to_50F': float_column,
        'rhu2_50F_to_60F': float_column,

        'rhu2_00F_to_05F_aux_duty_cycle': float_column,
        'rhu2_05F_to_10F_aux_duty_cycle': float_column,
        'rhu2_10F_to_15F_aux_duty_cycle': float_column,
        'rhu2_15F_to_20F_aux_duty_cycle': float_column,
        'rhu2_20F_to_25F_aux_duty_cycle': float_column,
        'rhu2_25F_to_30F_aux_duty_cycle': float_column,
        'rhu2_30F_to_35F_aux_duty_cycle': float_column,
        'rhu2_35F_to_40F_aux_duty_cycle': float_column,
        'rhu2_40F_to_45F_aux_duty_cycle': float_column,
        'rhu2_45F_to_50F_aux_duty_cycle': float_column,
        'rhu2_50F_to_55F_aux_duty_cycle': float_column,
        'rhu2_55F_to_60F_aux_duty_cycle': float_column,

        'rhu2_less10F_aux_duty_cycle': float_column,
        'rhu2_10F_to_20F_aux_duty_cycle': float_column,
        'rhu2_20F_to_30F_aux_duty_cycle': float_column,
        'rhu2_30F_to_40F_aux_duty_cycle': float_column,
        'rhu2_40F_to_50F_aux_duty_cycle': float_column,
        'rhu2_50F_to_60F_aux_duty_cycle': float_column,

        'rhu2_00F_to_05F_emg_duty_cycle': float_column,
        'rhu2_05F_to_10F_emg_duty_cycle': float_column,
        'rhu2_10F_to_15F_emg_duty_cycle': float_column,
        'rhu2_15F_to_20F_emg_duty_cycle': float_column,
        'rhu2_20F_to_25F_emg_duty_cycle': float_column,
        'rhu2_25F_to_30F_emg_duty_cycle': float_column,
        'rhu2_30F_to_35F_emg_duty_cycle': float_column,
        'rhu2_35F_to_40F_emg_duty_cycle': float_column,
        'rhu2_40F_to_45F_emg_duty_cycle': float_column,
        'rhu2_45F_to_50F_emg_duty_cycle': float_column,
        'rhu2_50F_to_55F_emg_duty_cycle': float_column,
        'rhu2_55F_to_60F_emg_duty_cycle': float_column,

        'rhu2_less10F_emg_duty_cycle': float_column,
        'rhu2_10F_to_20F_emg_duty_cycle': float_column,
        'rhu2_20F_to_30F_emg_duty_cycle': float_column,
        'rhu2_30F_to_40F_emg_duty_cycle': float_column,
        'rhu2_40F_to_50F_emg_duty_cycle': float_column,
        'rhu2_50F_to_60F_emg_duty_cycle': float_column,

        'rhu2_00F_to_05F_compressor_duty_cycle': float_column,
        'rhu2_05F_to_10F_compressor_duty_cycle': float_column,
        'rhu2_10F_to_15F_compressor_duty_cycle': float_column,
        'rhu2_15F_to_20F_compressor_duty_cycle': float_column,
        'rhu2_20F_to_25F_compressor_duty_cycle': float_column,
        'rhu2_25F_to_30F_compressor_duty_cycle': float_column,
        'rhu2_30F_to_35F_compressor_duty_cycle': float_column,
        'rhu2_35F_to_40F_compressor_duty_cycle': float_column,
        'rhu2_40F_to_45F_compressor_duty_cycle': float_column,
        'rhu2_45F_to_50F_compressor_duty_cycle': float_column,
        'rhu2_50F_to_55F_compressor_duty_cycle': float_column,
        'rhu2_55F_to_60F_compressor_duty_cycle': float_column,

        'rhu2_less10F_compressor_duty_cycle': float_column,
        'rhu2_10F_to_20F_compressor_duty_cycle': float_column,
        'rhu2_20F_to_30F_compressor_duty_cycle': float_column,
        'rhu2_30F_to_40F_compressor_duty_cycle': float_column,
        'rhu2_40F_to_50F_compressor_duty_cycle': float_column,
        'rhu2_50F_to_60F_compressor_duty_cycle': float_column,
    }
    df = pd.DataFrame(data, columns=columns)
    return df
Example #29
File: base.py Project: mkhoin/sklearnlab
 def _fit(self, X, y):
     from scipy.stats import randint
     randidx = randint.rvs(0, len(y), size=10)
     counts = np.bincount(randidx)
     self.majority_ = np.argmax(counts)
Example #30
    def test_nchypergeom_wallenius_naive(self):
        # test against a very simple implementation

        np.random.seed(2)
        shape = (2, 4, 3)
        max_m = 100
        m1 = np.random.randint(1, max_m, size=shape)
        m2 = np.random.randint(1, max_m, size=shape)
        N = m1 + m2
        n = randint.rvs(0, N, size=N.shape)
        xl = np.maximum(0, n - m2)
        xu = np.minimum(n, m1)
        x = randint.rvs(xl, xu, size=xl.shape)
        w = np.random.rand(*x.shape) * 2

        def support(N, m1, n, w):
            m2 = N - m1
            xl = np.maximum(0, n - m2)
            xu = np.minimum(n, m1)
            return xl, xu

        @np.vectorize
        def mean(N, m1, n, w):
            m2 = N - m1
            xl, xu = support(N, m1, n, w)

            def fun(u):
                return u / m1 + (1 - (n - u) / m2)**w - 1

            return root_scalar(fun, bracket=(xl, xu)).root

        assert_allclose(nchypergeom_wallenius.mean(N, m1, n, w),
                        mean(N, m1, n, w),
                        rtol=2e-2)

        @np.vectorize
        def variance(N, m1, n, w):
            m2 = N - m1
            u = mean(N, m1, n, w)
            a = u * (m1 - u)
            b = (n - u) * (u + m2 - n)
            return N * a * b / ((N - 1) * (m1 * b + m2 * a))

        assert_allclose(nchypergeom_wallenius.stats(N, m1, n, w, moments='v'),
                        variance(N, m1, n, w),
                        rtol=5e-2)

        @np.vectorize
        def pmf(x, N, m1, n, w):
            m2 = N - m1
            xl, xu = support(N, m1, n, w)

            def integrand(t):
                D = w * (m1 - x) + (m2 - (n - x))
                res = (1 - t**(w / D))**x * (1 - t**(1 / D))**(n - x)
                return res

            def f(x):
                t1 = special_binom(m1, x)
                t2 = special_binom(m2, n - x)
                the_integral = quad(integrand,
                                    0,
                                    1,
                                    epsrel=1e-16,
                                    epsabs=1e-16)
                return t1 * t2 * the_integral[0]

            return f(x)

        pmf0 = pmf(x, N, m1, n, w)
        pmf1 = nchypergeom_wallenius.pmf(x, N, m1, n, w)

        atol, rtol = 1e-6, 1e-6
        i = np.abs(pmf1 - pmf0) < atol + rtol * np.abs(pmf0)
        assert (i.sum() > np.prod(shape) / 2)  # works at least half the time

        # for those that fail, discredit the naive implementation
        for N, m1, n, w in zip(N[~i], m1[~i], n[~i], w[~i]):
            # get the support
            m2 = N - m1
            xl, xu = support(N, m1, n, w)
            x = np.arange(xl, xu + 1)

            # calculate sum of pmf over the support
            # the naive implementation is very wrong in these cases
            assert pmf(x, N, m1, n, w).sum() < .5
            assert_allclose(nchypergeom_wallenius.pmf(x, N, m1, n, w).sum(), 1)
Example #31
class TestNCH():
    np.random.seed(2)  # seeds 0 and 1 had some xl = xu; randint failed
    shape = (2, 4, 3)
    max_m = 100
    m1 = np.random.randint(1, max_m, size=shape)  # red balls
    m2 = np.random.randint(1, max_m, size=shape)  # white balls
    N = m1 + m2  # total balls
    n = randint.rvs(0, N, size=N.shape)  # number of draws
    xl = np.maximum(0, n - m2)  # lower bound of support
    xu = np.minimum(n, m1)  # upper bound of support
    x = randint.rvs(xl, xu, size=xl.shape)
    odds = np.random.rand(*x.shape) * 2

    # test output is more readable when function names (strings) are passed
    @pytest.mark.parametrize('dist_name',
                             ['nchypergeom_fisher', 'nchypergeom_wallenius'])
    def test_nch_hypergeom(self, dist_name):
        # Both noncentral hypergeometric distributions reduce to the
        # hypergeometric distribution when odds = 1
        dists = {
            'nchypergeom_fisher': nchypergeom_fisher,
            'nchypergeom_wallenius': nchypergeom_wallenius
        }
        dist = dists[dist_name]
        x, N, m1, n = self.x, self.N, self.m1, self.n
        assert_allclose(dist.pmf(x, N, m1, n, odds=1),
                        hypergeom.pmf(x, N, m1, n))

    def test_nchypergeom_fisher_naive(self):
        # test against a very simple implementation
        x, N, m1, n, odds = self.x, self.N, self.m1, self.n, self.odds

        @np.vectorize
        def pmf_mean_var(x, N, m1, n, w):
            # simple implementation of nchypergeom_fisher pmf
            m2 = N - m1
            xl = np.maximum(0, n - m2)
            xu = np.minimum(n, m1)

            def f(x):
                t1 = special_binom(m1, x)
                t2 = special_binom(m2, n - x)
                return t1 * t2 * w**x

            def P(k):
                return sum((f(y) * y**k for y in range(xl, xu + 1)))

            P0 = P(0)
            P1 = P(1)
            P2 = P(2)
            pmf = f(x) / P0
            mean = P1 / P0
            var = P2 / P0 - (P1 / P0)**2
            return pmf, mean, var

        pmf, mean, var = pmf_mean_var(x, N, m1, n, odds)
        assert_allclose(nchypergeom_fisher.pmf(x, N, m1, n, odds), pmf)
        assert_allclose(nchypergeom_fisher.stats(N, m1, n, odds, moments='m'),
                        mean)
        assert_allclose(nchypergeom_fisher.stats(N, m1, n, odds, moments='v'),
                        var)

    def test_nchypergeom_wallenius_naive(self):
        # test against a very simple implementation

        np.random.seed(2)
        shape = (2, 4, 3)
        max_m = 100
        m1 = np.random.randint(1, max_m, size=shape)
        m2 = np.random.randint(1, max_m, size=shape)
        N = m1 + m2
        n = randint.rvs(0, N, size=N.shape)
        xl = np.maximum(0, n - m2)
        xu = np.minimum(n, m1)
        x = randint.rvs(xl, xu, size=xl.shape)
        w = np.random.rand(*x.shape) * 2

        def support(N, m1, n, w):
            m2 = N - m1
            xl = np.maximum(0, n - m2)
            xu = np.minimum(n, m1)
            return xl, xu

        @np.vectorize
        def mean(N, m1, n, w):
            m2 = N - m1
            xl, xu = support(N, m1, n, w)

            def fun(u):
                return u / m1 + (1 - (n - u) / m2)**w - 1

            return root_scalar(fun, bracket=(xl, xu)).root

        assert_allclose(nchypergeom_wallenius.mean(N, m1, n, w),
                        mean(N, m1, n, w),
                        rtol=2e-2)

        @np.vectorize
        def variance(N, m1, n, w):
            m2 = N - m1
            u = mean(N, m1, n, w)
            a = u * (m1 - u)
            b = (n - u) * (u + m2 - n)
            return N * a * b / ((N - 1) * (m1 * b + m2 * a))

        assert_allclose(nchypergeom_wallenius.stats(N, m1, n, w, moments='v'),
                        variance(N, m1, n, w),
                        rtol=5e-2)

        @np.vectorize
        def pmf(x, N, m1, n, w):
            m2 = N - m1
            xl, xu = support(N, m1, n, w)

            def integrand(t):
                D = w * (m1 - x) + (m2 - (n - x))
                res = (1 - t**(w / D))**x * (1 - t**(1 / D))**(n - x)
                return res

            def f(x):
                t1 = special_binom(m1, x)
                t2 = special_binom(m2, n - x)
                the_integral = quad(integrand,
                                    0,
                                    1,
                                    epsrel=1e-16,
                                    epsabs=1e-16)
                return t1 * t2 * the_integral[0]

            return f(x)

        pmf0 = pmf(x, N, m1, n, w)
        pmf1 = nchypergeom_wallenius.pmf(x, N, m1, n, w)

        atol, rtol = 1e-6, 1e-6
        i = np.abs(pmf1 - pmf0) < atol + rtol * np.abs(pmf0)
        assert (i.sum() > np.prod(shape) / 2)  # works at least half the time

        # for those that fail, discredit the naive implementation
        for N, m1, n, w in zip(N[~i], m1[~i], n[~i], w[~i]):
            # get the support
            m2 = N - m1
            xl, xu = support(N, m1, n, w)
            x = np.arange(xl, xu + 1)

            # calculate sum of pmf over the support
            # the naive implementation is very wrong in these cases
            assert pmf(x, N, m1, n, w).sum() < .5
            assert_allclose(nchypergeom_wallenius.pmf(x, N, m1, n, w).sum(), 1)

    def test_wallenius_against_mpmath(self):
        # precompute data with mpmath since naive implementation above
        # is not reliable. See source code in gh-13330.
        M = 50
        n = 30
        N = 20
        odds = 2.25
        # Expected results, computed with mpmath.
        sup = np.arange(21)
        pmf = np.array([
            3.699003068656875e-20, 5.89398584245431e-17,
            2.1594437742911123e-14, 3.221458044649955e-12,
            2.4658279241205077e-10, 1.0965862603981212e-08,
            3.057890479665704e-07, 5.622818831643761e-06,
            7.056482841531681e-05, 0.000618899425358671, 0.003854172932571669,
            0.01720592676256026, 0.05528844897093792, 0.12772363313574242,
            0.21065898367825722, 0.24465958845359234, 0.1955114898110033,
            0.10355390084949237, 0.03414490375225675, 0.006231989845775931,
            0.0004715577304677075
        ])
        mean = 14.808018384813426
        var = 2.6085975877923717

        # nchypergeom_wallenius.pmf returns 0 for pmf(0) and pmf(1), and pmf(2)
        # has only three digits of accuracy (~ 2.1511e-14).
        assert_allclose(nchypergeom_wallenius.pmf(sup, M, n, N, odds),
                        pmf,
                        rtol=1e-13,
                        atol=1e-13)
        assert_allclose(nchypergeom_wallenius.mean(M, n, N, odds),
                        mean,
                        rtol=1e-13)
        assert_allclose(nchypergeom_wallenius.var(M, n, N, odds),
                        var,
                        rtol=1e-11)

    @pytest.mark.parametrize('dist_name',
                             ['nchypergeom_fisher', 'nchypergeom_wallenius'])
    def test_rvs_shape(self, dist_name):
        # Check that when given a size with more dimensions than the
        # dimensions of the broadcast parameters, rvs returns an array
        # with the correct shape.
        dists = {
            'nchypergeom_fisher': nchypergeom_fisher,
            'nchypergeom_wallenius': nchypergeom_wallenius
        }
        dist = dists[dist_name]
        x = dist.rvs(50, 30, [[10], [20]], [0.5, 1.0, 2.0], size=(5, 1, 2, 3))
        assert x.shape == (5, 1, 2, 3)
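
# Added, hedged sketch (not the actual gh-13330 script referenced in
# test_wallenius_against_mpmath above): one plausible way to generate
# high-precision reference pmf values with mpmath, by evaluating the same
# integral form of the Wallenius pmf that the naive implementation uses,
# but with arbitrary-precision quadrature.  Parameter order follows scipy's
# nchypergeom_wallenius: M = total objects, n = type-1 objects, N = draws.
def _wallenius_pmf_mpmath_sketch(x, M, n, N, odds, dps=50):
    import mpmath as mp

    mp.mp.dps = dps
    m1, m2 = n, M - n
    D = odds * (m1 - x) + (m2 - (N - x))
    integrand = lambda t: (1 - t**(odds / D))**x * (1 - t**(1 / D))**(N - x)
    integral = mp.quad(integrand, [0, 1])
    return mp.binomial(m1, x) * mp.binomial(m2, N - x) * integral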
Example #32
from scipy.stats import randint, beta
import numpy as np
import matplotlib.pyplot as plt

def plot_1D_function(x, y, y_name='y'):
    ax = plt.subplot(111)
    ax.plot(x, y, label=y_name)
    plt.legend(loc='best')
    plt.show()

a_1 = np.linspace(0, 10, 100)
a_2 = np.linspace(0, 10, 100)
b_1 = np.linspace(0, 10, 100)
b_2 = np.linspace(0, 10, 100)
pi = np.linspace(0, 1, 10)
input_space = np.linspace(0, 1, 1000)
for i in range(5):
    # draw random indices into the parameter grids
    pi_rvs = randint.rvs(0, 10)
    a_1_rvs = randint.rvs(0, 100)
    a_2_rvs = randint.rvs(0, 100)
    b_1_rvs = randint.rvs(0, 100)
    b_2_rvs = randint.rvs(0, 100)
    # two-component beta mixture pdf and cdf
    bibeta_example_pdf = pi[pi_rvs]*beta.pdf(input_space, a_1[a_1_rvs], b_1[b_1_rvs]) + (1-pi[pi_rvs])*beta.pdf(input_space, a_2[a_2_rvs], b_2[b_2_rvs])
    bibeta_example_cdf = pi[pi_rvs]*beta.cdf(input_space, a_1[a_1_rvs], b_1[b_1_rvs]) + (1-pi[pi_rvs])*beta.cdf(input_space, a_2[a_2_rvs], b_2[b_2_rvs])
    # plot the two weighted components and their mixture (pdf)
    ax = plt.subplot(111)
    ax.plot(input_space, pi[pi_rvs]*beta.pdf(input_space, a_1[a_1_rvs], b_1[b_1_rvs]), label="1 comp pdf")
    ax.plot(input_space, (1-pi[pi_rvs])*beta.pdf(input_space, a_2[a_2_rvs], b_2[b_2_rvs]), label="2 comp pdf")
    ax.plot(input_space, bibeta_example_pdf, label="Mix pdf")
    plt.legend(loc='best')
    plt.show()
    # plot the two weighted components and their mixture (cdf)
    ax = plt.subplot(111)
    ax.plot(input_space, pi[pi_rvs]*beta.cdf(input_space, a_1[a_1_rvs], b_1[b_1_rvs]), label="1 comp cdf")
    ax.plot(input_space, (1-pi[pi_rvs])*beta.cdf(input_space, a_2[a_2_rvs], b_2[b_2_rvs]), label="2 comp cdf")
    ax.plot(input_space, bibeta_example_cdf, label="Mix cdf")
    plt.legend(loc='best')
    plt.show()
Example #33
def get_fake_output_df(n_columns):
    columns = [
        'sw_version',

        'ct_identifier',
        'equipment_type',
        'heating_or_cooling',
        'station',
        'zipcode',
        'climate_zone',

        'start_date',
        'end_date',

        'n_days_in_inputfile_date_range',
        'n_days_both_heating_and_cooling',
        'n_days_insufficient_data',
        'n_core_cooling_days',
        'n_core_heating_days',

        'baseline_percentile_core_cooling_comfort_temperature',
        'baseline_percentile_core_heating_comfort_temperature',
        'regional_average_baseline_cooling_comfort_temperature',
        'regional_average_baseline_heating_comfort_temperature',

        'percent_savings_baseline_percentile',
        'avoided_daily_mean_core_day_runtime_baseline_percentile',
        'avoided_total_core_day_runtime_baseline_percentile',
        'baseline_daily_mean_core_day_runtime_baseline_percentile',
        'baseline_total_core_day_runtime_baseline_percentile',
        '_daily_mean_core_day_demand_baseline_baseline_percentile',
        'percent_savings_baseline_regional',
        'avoided_daily_mean_core_day_runtime_baseline_regional',
        'avoided_total_core_day_runtime_baseline_regional',
        'baseline_daily_mean_core_day_runtime_baseline_regional',
        'baseline_total_core_day_runtime_baseline_regional',
        '_daily_mean_core_day_demand_baseline_baseline_regional',
        'mean_demand',
        'alpha',
        'tau',
        'mean_sq_err',
        'root_mean_sq_err',
        'cv_root_mean_sq_err',
        'mean_abs_err',
        'mean_abs_pct_err',

        'total_core_cooling_runtime',
        'total_core_heating_runtime',
        'total_auxiliary_heating_core_day_runtime',
        'total_emergency_heating_core_day_runtime',

        'daily_mean_core_cooling_runtime',
        'daily_mean_core_heating_runtime',

        'rhu_00F_to_05F',
        'rhu_05F_to_10F',
        'rhu_10F_to_15F',
        'rhu_15F_to_20F',
        'rhu_20F_to_25F',
        'rhu_25F_to_30F',
        'rhu_30F_to_35F',
        'rhu_35F_to_40F',
        'rhu_40F_to_45F',
        'rhu_45F_to_50F',
        'rhu_50F_to_55F',
        'rhu_55F_to_60F',
    ]

    string_placeholder = ["PLACEHOLDER"] * n_columns
    zero_column = [0 if randint.rvs(0, 30) > 0 else  (None if randint.rvs(0, 2) > 0 else np.inf)
                  for i in randint.rvs(0, 1, size=n_columns)]
    one_column = [1 if randint.rvs(0, 30) > 0 else  (None if randint.rvs(0, 2) > 0 else np.inf)
                  for i in randint.rvs(0, 1, size=n_columns)]
    float_column = [i if randint.rvs(0, 30) > 0 else (None if randint.rvs(0, 2) > 0 else np.inf)
                    for i in norm.rvs(size=n_columns)]
    zipcodes = ["01234", "12345", "23456", "34567", "43210", "54321", "65432", "76543"]
    zipcode_column = [i for i in islice(cycle(zipcodes), None, n_columns)]
    core_day_set_names = ["cooling_2012", "heating_2012-2013", "cooling_2013"]
    core_day_set_name_column = [i for i in islice(cycle(core_day_set_names), None, n_columns)]

    data = {
        'sw_version': string_placeholder,

        'ct_identifier': string_placeholder,
        'equipment_type': string_placeholder,
        'heating_or_cooling': core_day_set_name_column,
        'station': string_placeholder,
        'zipcode': zipcode_column,
        'climate_zone': string_placeholder,

        'start_date': datetime(2011, 1, 1),
        'end_date': datetime(2012, 1, 1),
        'n_days_both_heating_and_cooling': one_column,
        'n_days_in_inputfile_date_range': one_column,
        'n_days_insufficient_data': zero_column,
        'n_core_heating_days': one_column,

        'baseline_percentile_core_cooling_comfort_temperature': float_column,
        'baseline_percentile_core_heating_comfort_temperature': float_column,
        'regional_average_baseline_cooling_comfort_temperature': float_column,
        'regional_average_baseline_heating_comfort_temperature': float_column,

        'percent_savings_baseline_percentile': float_column,
        'avoided_daily_mean_core_day_runtime_baseline_percentile': float_column,
        'avoided_total_core_day_runtime_baseline_percentile': float_column,
        'baseline_daily_mean_core_day_runtime_baseline_percentile': float_column,
        'baseline_total_core_day_runtime_baseline_percentile': float_column,
        '_daily_mean_core_day_demand_baseline_baseline_percentile': float_column,
        'percent_savings_baseline_regional': float_column,
        'avoided_daily_mean_core_day_runtime_baseline_regional': float_column,
        'avoided_total_core_day_runtime_baseline_regional': float_column,
        'baseline_daily_mean_core_day_runtime_baseline_regional': float_column,
        'baseline_total_core_day_runtime_baseline_regional': float_column,
        '_daily_mean_core_day_demand_baseline_baseline_regional': float_column,
        'mean_demand': float_column,
        'alpha': float_column,
        'tau': float_column,
        'mean_sq_err': float_column,
        'root_mean_sq_err': float_column,
        'cv_root_mean_sq_err': float_column,
        'mean_abs_err': float_column,
        'mean_abs_pct_err': float_column,

        'total_core_cooling_runtime': float_column,
        'total_core_heating_runtime': float_column,
        'total_auxiliary_heating_core_day_runtime': float_column,
        'total_emergency_heating_core_day_runtime': float_column,

        'daily_mean_core_cooling_runtime': float_column,
        'daily_mean_core_heating_runtime': float_column,

        'rhu_00F_to_05F': float_column,
        'rhu_05F_to_10F': float_column,
        'rhu_10F_to_15F': float_column,
        'rhu_15F_to_20F': float_column,
        'rhu_20F_to_25F': float_column,
        'rhu_25F_to_30F': float_column,
        'rhu_30F_to_35F': float_column,
        'rhu_35F_to_40F': float_column,
        'rhu_40F_to_45F': float_column,
        'rhu_45F_to_50F': float_column,
        'rhu_50F_to_55F': float_column,
        'rhu_55F_to_60F': float_column,
    }
    df = pd.DataFrame(data, columns=columns)
    return df
Example #34
def get_fake_output_df(n_columns):
    columns = [
        'ct_identifier',
        'equipment_type',
        'season_name',
        'station',
        'zipcode',

        'n_days_in_season_range',
        'n_days_in_season',
        'n_days_both_heating_and_cooling',
        'n_days_insufficient_data',

        'seasonal_savings_dailyavgCDD',
        'seasonal_savings_dailyavgHDD',
        'seasonal_savings_deltaT',
        'seasonal_savings_hourlyavgCDD',
        'seasonal_savings_hourlyavgHDD',

        'seasonal_avoided_runtime_dailyavgCDD',
        'seasonal_avoided_runtime_dailyavgHDD',
        'seasonal_avoided_runtime_deltaT',
        'seasonal_avoided_runtime_hourlyavgCDD',
        'seasonal_avoided_runtime_hourlyavgHDD',

        'total_auxiliary_heating_runtime',
        'total_cooling_runtime',
        'total_emergency_heating_runtime',
        'total_heating_runtime',

        'actual_daily_runtime',
        'actual_seasonal_runtime',

        'baseline_comfort_temperature',

        'baseline_daily_runtime_dailyavgCDD',
        'baseline_daily_runtime_dailyavgHDD',
        'baseline_daily_runtime_deltaT',
        'baseline_daily_runtime_hourlyavgCDD',
        'baseline_daily_runtime_hourlyavgHDD',

        'baseline_seasonal_runtime_dailyavgCDD',
        'baseline_seasonal_runtime_dailyavgHDD',
        'baseline_seasonal_runtime_deltaT',
        'baseline_seasonal_runtime_hourlyavgCDD',
        'baseline_seasonal_runtime_hourlyavgHDD',

        'mean_demand_dailyavgCDD',
        'mean_demand_dailyavgHDD',
        'mean_demand_deltaT',
        'mean_demand_hourlyavgCDD',
        'mean_demand_hourlyavgHDD',

        'mean_demand_baseline_dailyavgCDD',
        'mean_demand_baseline_dailyavgHDD',
        'mean_demand_baseline_deltaT',
        'mean_demand_baseline_hourlyavgCDD',
        'mean_demand_baseline_hourlyavgHDD',

        'rhu_00F_to_05F',
        'rhu_05F_to_10F',
        'rhu_10F_to_15F',
        'rhu_15F_to_20F',
        'rhu_20F_to_25F',
        'rhu_25F_to_30F',
        'rhu_30F_to_35F',
        'rhu_35F_to_40F',
        'rhu_40F_to_45F',
        'rhu_45F_to_50F',
        'rhu_50F_to_55F',
        'rhu_55F_to_60F',

        'slope_deltaT',
        'alpha_est_dailyavgCDD',
        'alpha_est_dailyavgHDD',
        'alpha_est_hourlyavgCDD',
        'alpha_est_hourlyavgHDD',

        'intercept_deltaT',
        'deltaT_base_est_dailyavgCDD',
        'deltaT_base_est_dailyavgHDD',
        'deltaT_base_est_hourlyavgCDD',
        'deltaT_base_est_hourlyavgHDD',

        'mean_sq_err_dailyavgCDD',
        'mean_sq_err_dailyavgHDD',
        'mean_sq_err_deltaT',
        'mean_sq_err_hourlyavgCDD',
        'mean_sq_err_hourlyavgHDD',

        'root_mean_sq_err_dailyavgCDD',
        'root_mean_sq_err_dailyavgHDD',
        'root_mean_sq_err_deltaT',
        'root_mean_sq_err_hourlyavgCDD',
        'root_mean_sq_err_hourlyavgHDD',

        'cv_root_mean_sq_err_dailyavgCDD',
        'cv_root_mean_sq_err_dailyavgHDD',
        'cv_root_mean_sq_err_deltaT',
        'cv_root_mean_sq_err_hourlyavgCDD',
        'cv_root_mean_sq_err_hourlyavgHDD',

        'mean_abs_err_dailyavgCDD',
        'mean_abs_err_dailyavgHDD',
        'mean_abs_err_deltaT',
        'mean_abs_err_hourlyavgCDD',
        'mean_abs_err_hourlyavgHDD',

        'mean_abs_pct_err_dailyavgCDD',
        'mean_abs_pct_err_dailyavgHDD',
        'mean_abs_pct_err_deltaT',
        'mean_abs_pct_err_hourlyavgCDD',
        'mean_abs_pct_err_hourlyavgHDD',
    ]

    string_placeholder = ["PLACEHOLDER"] * n_columns
    zero_column = [0 if randint.rvs(0, 30) > 0 else  (None if randint.rvs(0, 2) > 0 else np.inf)
                  for i in randint.rvs(0, 1, size=n_columns)]
    one_column = [1 if randint.rvs(0, 30) > 0 else  (None if randint.rvs(0, 2) > 0 else np.inf)
                  for i in randint.rvs(0, 1, size=n_columns)]
    float_column = [i if randint.rvs(0, 30) > 0 else (None if randint.rvs(0, 2) > 0 else np.inf)
                    for i in norm.rvs(size=n_columns)]
    zipcodes = ["01234", "12345", "23456", "34567", "43210", "54321", "65432", "76543"]
    zipcode_column = [i for i in islice(cycle(zipcodes), None, n_columns)]
    season_names = ["Cooling 2012", "Heating 2012-2013", "Cooling 2013"]
    season_name_column = [i for i in islice(cycle(season_names), None, n_columns)]
    data = {
        "ct_identifier": string_placeholder,
        "equipment_type": string_placeholder,
        "season_name": season_name_column,
        "station": string_placeholder,
        "zipcode": zipcode_column,

        "n_days_both_heating_and_cooling": one_column,
        "n_days_in_season": one_column,
        "n_days_in_season_range": one_column,
        "n_days_insufficient_data": zero_column,

        "seasonal_savings_deltaT": float_column,
        "seasonal_savings_dailyavgCDD": float_column,
        "seasonal_savings_dailyavgHDD": float_column,
        "seasonal_savings_hourlyavgCDD": float_column,
        "seasonal_savings_hourlyavgHDD": float_column,

        "seasonal_avoided_runtime_deltaT": float_column,
        "seasonal_avoided_runtime_dailyavgCDD": float_column,
        "seasonal_avoided_runtime_dailyavgHDD": float_column,
        "seasonal_avoided_runtime_hourlyavgCDD": float_column,
        "seasonal_avoided_runtime_hourlyavgHDD": float_column,

        "total_heating_runtime": float_column,
        "total_cooling_runtime": float_column,
        "total_auxiliary_heating_runtime": float_column,
        "total_emergency_heating_runtime": float_column,

        "actual_daily_runtime": float_column,
        "actual_seasonal_runtime": float_column,

        "baseline_comfort_temperature": float_column,

        "baseline_daily_runtime_deltaT": float_column,
        "baseline_daily_runtime_dailyavgCDD": float_column,
        "baseline_daily_runtime_dailyavgHDD": float_column,
        "baseline_daily_runtime_hourlyavgCDD": float_column,
        "baseline_daily_runtime_hourlyavgHDD": float_column,

        "baseline_seasonal_runtime_deltaT": float_column,
        "baseline_seasonal_runtime_dailyavgCDD": float_column,
        "baseline_seasonal_runtime_dailyavgHDD": float_column,
        "baseline_seasonal_runtime_hourlyavgCDD": float_column,
        "baseline_seasonal_runtime_hourlyavgHDD": float_column,

        "mean_demand_dailyavgCDD": float_column,
        "mean_demand_dailyavgHDD": float_column,
        "mean_demand_deltaT": float_column,
        "mean_demand_hourlyavgCDD": float_column,
        "mean_demand_hourlyavgHDD": float_column,

        "mean_demand_baseline_dailyavgCDD": float_column,
        "mean_demand_baseline_dailyavgHDD": float_column,
        "mean_demand_baseline_deltaT": float_column,
        "mean_demand_baseline_hourlyavgCDD": float_column,
        "mean_demand_baseline_hourlyavgHDD": float_column,

        "rhu_00F_to_05F": float_column,
        "rhu_05F_to_10F": float_column,
        "rhu_10F_to_15F": float_column,
        "rhu_15F_to_20F": float_column,
        "rhu_20F_to_25F": float_column,
        "rhu_25F_to_30F": float_column,
        "rhu_30F_to_35F": float_column,
        "rhu_35F_to_40F": float_column,
        "rhu_40F_to_45F": float_column,
        "rhu_45F_to_50F": float_column,
        "rhu_50F_to_55F": float_column,
        "rhu_55F_to_60F": float_column,

        "slope_deltaT": float_column,
        "alpha_est_dailyavgCDD": float_column,
        "alpha_est_dailyavgHDD": float_column,
        "alpha_est_hourlyavgCDD": float_column,
        "alpha_est_hourlyavgHDD": float_column,

        "intercept_deltaT": float_column,
        "deltaT_base_est_dailyavgCDD": float_column,
        "deltaT_base_est_dailyavgHDD": float_column,
        "deltaT_base_est_hourlyavgCDD": float_column,
        "deltaT_base_est_hourlyavgHDD": float_column,

        "mean_sq_err_dailyavgCDD": float_column,
        "mean_sq_err_dailyavgHDD": float_column,
        "mean_sq_err_deltaT": float_column,
        "mean_sq_err_hourlyavgCDD": float_column,
        "mean_sq_err_hourlyavgHDD": float_column,

        "root_mean_sq_err_dailyavgCDD": float_column,
        "root_mean_sq_err_dailyavgHDD": float_column,
        "root_mean_sq_err_deltaT": float_column,
        "root_mean_sq_err_hourlyavgCDD": float_column,
        "root_mean_sq_err_hourlyavgHDD": float_column,

        "cv_root_mean_sq_err_dailyavgCDD": float_column,
        "cv_root_mean_sq_err_dailyavgHDD": float_column,
        "cv_root_mean_sq_err_deltaT": float_column,
        "cv_root_mean_sq_err_hourlyavgCDD": float_column,
        "cv_root_mean_sq_err_hourlyavgHDD": float_column,

        "mean_abs_err_dailyavgCDD": float_column,
        "mean_abs_err_dailyavgHDD": float_column,
        "mean_abs_err_deltaT": float_column,
        "mean_abs_err_hourlyavgCDD": float_column,
        "mean_abs_err_hourlyavgHDD": float_column,

        "mean_abs_pct_err_dailyavgCDD": float_column,
        "mean_abs_pct_err_dailyavgHDD": float_column,
        "mean_abs_pct_err_deltaT": float_column,
        "mean_abs_pct_err_hourlyavgCDD": float_column,
        "mean_abs_pct_err_hourlyavgHDD": float_column,

    }
    df = pd.DataFrame(data, columns=columns)
    return df
Example #35
import matplotlib.pyplot as plt

import scipy.stats
from scipy.stats import randint
import numpy as np


# normal
distribution = scipy.stats.norm(loc=100, scale=5)
print(distribution.stats("mvsk"))
# skewed
distribution = scipy.stats.gengamma(100, 90, loc=50, scale=10)
print(distribution.stats("mvsk"))

sample = distribution.rvs(size=10000)

sample = randint.rvs(0, 208, size=1000)


pers = np.arange(1, 101, 1)

# Make each of the last 41 elements 5x more likely
prob = [1.0] * (len(pers) - 41) + [5.0] * 41

# Normalising to 1.0
prob /= np.sum(prob)

sample = np.random.choice(pers, 1000, p=prob)

plt.hist(sample)
plt.show()
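
# The weighted-choice demo above can also be expressed with scipy.stats.rv_discrete,
# which builds a discrete distribution from (values, probabilities).  This is a
# minimal added sketch; `weighted` is an illustrative name, not part of the
# original snippet, and it reuses `pers` and `prob` defined above.
from scipy.stats import rv_discrete

weighted = rv_discrete(name='weighted', values=(pers, prob))
weighted_sample = weighted.rvs(size=1000)

plt.hist(weighted_sample)
plt.show()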
def z_enrichment_test(node_pvals_dict, the_grouping_dict, **kwargs):
    """ Perform enrichment analysis on the groupings in the_grouping_dict, using
     statistical aggregation based on the z-scores.  Note this is similar to
     the statistical subnetwork scoring system used in:

     Ideker, T., Ozier, O., Schwikowski, B., & Siegel, A. (2002). 
     Discovering regulatory and signalling circuits in molecular interaction networks. 
     Bioinformatics, 18, 233–240.

     Except here we use the pre-defined groupings in the_grouping_dict rather than
     scanning for novel subnetworks.

    Arguments:
     node_pvals_dict: must have a subdictionary of node_id:
      'p_uncorrected': (or 'p').  Note the p-value should be from
                      a two-tailed test for changes.
      't': optional, to deal with two-tailedness if doing signed z aggregation)
      p-values are assumed to result from two-tail tests and span [0 - 1].
      'z' is optional, if 'use_type' = z then it is needed
     the_grouping_dict: a dict of subsystem_id: [node_id_1, node_id_2, ...] 

    kwargs:
     navg_node_sample: default is 100.0.  The groupings are randomized 
      such that each gene is sampled an
      average of 100 times. 
     diagnostic: [False (default), True]: if True, a list of the randomly 
                 generated p-values will also be returned.
     aggregation_type: options are 'signed' or 'unsigned'
      'unsigned': the z-value ranges from 0 to +inf.  This method
                  picks out gross changes in subsystems and ignores
                  whether they are increasing or decreasing.
      'signed': the z-value ranges from -inf to +inf.  This method picks out
                coordinated changes in subsystems.

    Returns: 
     grouping_scores_dict with keys
      agg_z = the aggregated z-value without background correction
      agg_adj_z = the aggregated z-value with background correction
      agg_p = a p-value resulting from a two-tail test for changes 
       (e.g. near 0 is more significant) assuming the agg_adj_z is truly
       normally distributed.
     if diagnostic is True, random_scores_to_return is also returned.
    """
    
    from numpy import nan, sign, mean, std, array, inf, zeros
    from numpy.random import rand
    from copy import deepcopy
    from random import sample, shuffle
    from scipy.stats import norm, randint
    
    diagnostic = test_kwarg('diagnostic', kwargs, [False, True])
    aggregation_type = test_kwarg('aggregation_type', kwargs, ['unsigned', 'signed'])

    if 'navg_node_sample' in kwargs: 
        navg_node_sample = kwargs['navg_node_sample']
    else:
        navg_node_sample = 100.0

    grouping_scores_dict = {}
        
    node_pvals_dictl = deepcopy(node_pvals_dict)
    random_scores_to_return = {}
    # First get the subsystems
    for subsystem in the_grouping_dict.keys():
        # Enforce lower case to avoid duplication
        subsystem = subsystem.lower()
        if subsystem not in grouping_scores_dict:
            grouping_scores_dict[subsystem] = {}
            grouping_scores_dict[subsystem]['ind_p'] = []
            if aggregation_type == 'signed':
                grouping_scores_dict[subsystem]['ind_t'] = []
            grouping_scores_dict[subsystem]['agg_p'] = nan
            grouping_scores_dict[subsystem]['agg_z'] = nan
            grouping_scores_dict[subsystem]['agg_adj_z'] = nan
            grouping_scores_dict[subsystem]['ind_node'] = []
    # Now add in pvals
    # Give preference to uncorrected p-values since Bonferroni corrected values
    # are truncated at 1.  Don't need a multiple testing correction
    # since we are looking across the subsystem and correcting
    # would potentially lose information in detected differences.
    # We re-normalize p-values for subnetworks according to an 
    # empirical null distribution at the end.            
    test_node = list(node_pvals_dictl.keys())[0]
    if 'p_uncorrected' in node_pvals_dictl[test_node]:
        p_key = 'p_uncorrected'
    else:
        p_key = 'p'

    for subsystem in the_grouping_dict.keys():
        subsystem_lower = subsystem.lower()
        for the_node in the_grouping_dict[subsystem]:
            if the_node in node_pvals_dictl:
                grouping_scores_dict[subsystem_lower]['ind_p'].append(node_pvals_dictl[the_node][p_key])
                grouping_scores_dict[subsystem_lower]['ind_node'].append(the_node)
                if aggregation_type == 'signed':
                    grouping_scores_dict[subsystem_lower]['ind_t'].append(node_pvals_dictl[the_node]['t'])

    # Now aggregate.  Make a lookuptable of size vs p values
    maxk = 0
    for subsystem in grouping_scores_dict:
        if len(grouping_scores_dict[subsystem]['ind_p']) > maxk:
            maxk = len(grouping_scores_dict[subsystem]['ind_p'])
    meanlookup = []
    sdlookup = []
    # Haven't pressure tested the window with even numbers, 
    # but odd values make more sense anyway.
    windowsize = 5
    maxsize =  int(maxk + 1 + int(round((windowsize-1)/2)))
    # It can be slow to compute system statistics, 
    # so we need to be selective and just evaluate
    # around the sample sizes of interest
    k_to_evaluate = []
    for subsystem in grouping_scores_dict:
        k = len(grouping_scores_dict[subsystem]['ind_p'])
        if k > 0:
            k = list(range(max(1,(k-(windowsize-1)/2)),(k+((windowsize-1)/2)+1)))
            k_to_evaluate.extend(deepcopy(k))
            k_to_evaluate = list(set(k_to_evaluate))
            k_to_evaluate.sort()

    # to speed calculations pre-convert to a z-score
    node_list = node_pvals_dictl.keys()
    pval_list = [node_pvals_dictl[curnode][p_key] for curnode in node_list]

    # Want to replace 0 or 1 pvals, 
    # use the next nearest value
    filter_pval_list = [x for x in pval_list if ((x > 0) & (x < 1))]
    min_val = min(filter_pval_list)
    max_val = max(filter_pval_list)
    for i, x in enumerate(pval_list):
        if x >= 1:
            pval_list[i] = max_val
        if x <= 0:
            pval_list[i] = min_val

    if aggregation_type == 'signed':
        # Here, we aggregate using p-values
        # resulting from one-tailed tests
        # where p = 0.5 means no change and
        # decreases in expression imply negative z
        # when aggregating.
        # Convert our z first then take the sign
        # to minimize numerical issues.
        # This method is equivalent to
        # Stouffer's method.
        print("Warning, verify assumptions for signed averaging, this has not been done in a while.")
        zval_list = norm.ppf(pval_list)
        tval_list = [node_pvals_dictl[curnode]['t'] for curnode in node_list]    
        for index, t_val in enumerate(tval_list):
            if t_val > 0:
                zval_list[index] = -1 * zval_list[index]
    else:
        # Ideker 2002 and also Patil 2005 use an
        # undirected p-value when aggregating
        # Z-scores for p-values - e.g. they
        # use the "significance of the change"
        # where more negative z corresponds to 
		# p ~ 1 and little change.
        zval_list = -1 * norm.ppf(pval_list)        
    
    for k in k_to_evaluate:
        print('Simulating measures for subsystem number '+ str(k_to_evaluate.index(k)+1) + ' of '+ str(len(k_to_evaluate)) + '.')
        r_z_values = []
        # Should need more trials with small k.
        # Rule of thumb: set size so all model 
        # genes are sampled on the average > 7x
        # This and windowsize = 5 seem to result in
        # fairly stable statistics from trial-and-error.
        ntrials = int(round(navg_node_sample*float(len(node_pvals_dictl))/float(k)))

        # Generate random indices between 0 and nmeasures - 1, as an array of size ntrials rows and k columns
        the_random_indices = randint.rvs(0, len(node_list), size=(ntrials, k))
        # Faster to do this here as an array operation than call stouffer_z_agg
        random_score_distribution = array([sum(zval_list[x]) for x in list(the_random_indices)]) /(k**0.5)
        meanlookup.append(mean(random_score_distribution))
        # Note SD's defined by this method are approximately size-independent
        sdlookup.append(std(random_score_distribution))
        if diagnostic:
            random_score_distribution.sort()
            random_scores_to_return[k] = random_score_distribution

    # The SD as assessed here should be independent of size
    # apply a smoothing filter here first
    sdlookup = list(smooth(array(sdlookup), window_len=windowsize))

    # The mean will be dependent on k**.5; To avoid edge effects
    # of the window first normalize then apply the smoothing filter
    for k in k_to_evaluate:
        meanlookup[k_to_evaluate.index(k)] = meanlookup[k_to_evaluate.index(k)] / (k ** 0.5)

    meanlookup = list(smooth(array(meanlookup), window_len=windowsize))

    for k in k_to_evaluate:
        meanlookup[k_to_evaluate.index(k)] = meanlookup[k_to_evaluate.index(k)] * (k ** 0.5)

    # Re-normalize the mean before averaging    
    for subsystem in grouping_scores_dict:
        k = len(grouping_scores_dict[subsystem]['ind_p'])
        if k > 0:
            node_indices = [index for index, node in enumerate(node_list) if node in grouping_scores_dict[subsystem]['ind_node']]
            grouping_scores_dict[subsystem]['agg_z'] = sum([zval_list[index] for index in node_indices]) / (k**0.5)
            grouping_scores_dict[subsystem]['agg_adj_z'] = (grouping_scores_dict[subsystem]['agg_z'] - meanlookup[k_to_evaluate.index(k)] )/ sdlookup[k_to_evaluate.index(k)]
            grouping_scores_dict[subsystem]['agg_p'] = norm.cdf(grouping_scores_dict[subsystem]['agg_adj_z'])
            # Convert back to a two-sided p value, this is twice the one-sided value
            if aggregation_type == 'signed':
                if grouping_scores_dict[subsystem]['agg_p'] < .5:
                    grouping_scores_dict[subsystem]['agg_p'] = 2*grouping_scores_dict[subsystem]['agg_p']
                else:
                    grouping_scores_dict[subsystem]['agg_p'] = 2*(1-grouping_scores_dict[subsystem]['agg_p'])
            else:
                grouping_scores_dict[subsystem]['agg_p'] = 1- grouping_scores_dict[subsystem]['agg_p']
            if diagnostic:
                grouping_scores_dict[subsystem]['z'] = norm.cdf(k_to_evaluate.index(k))

    if not diagnostic:
        return grouping_scores_dict
    else:
        return grouping_scores_dict, random_scores_to_return
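
# Minimal, self-contained sketch (added for illustration; all names below are
# hypothetical) of the unsigned aggregation used by z_enrichment_test above:
# convert two-tailed p-values to z-scores, aggregate a grouping of size k with
# Stouffer's method (sum / k**0.5), and correct against an empirical null built
# by drawing random index sets with randint.rvs.
def _stouffer_aggregation_sketch():
    import numpy as np
    from scipy.stats import norm, randint

    p_values = np.random.rand(200)          # hypothetical per-node p-values
    z = -norm.ppf(p_values)                 # unsigned z-scores
    group = np.arange(10)                   # hypothetical grouping of k = 10 nodes
    k = len(group)
    agg_z = z[group].sum() / k ** 0.5

    # empirical null: aggregate many random k-subsets of the node z-scores
    idx = randint.rvs(0, len(z), size=(1000, k))
    null = z[idx].sum(axis=1) / k ** 0.5
    agg_adj_z = (agg_z - null.mean()) / null.std()
    agg_p = 1 - norm.cdf(agg_adj_z)         # one-sided p, as in the unsigned branch
    return agg_adj_z, agg_p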
def distrib(param1,param2,param3,param4,egg):
	#param1 : list of elements
	#param2 : configuration of the property
	#param3 : name of the property
	#param4 : snapshot id
	#egg : data structure being updated (element -> property -> list of values per snapshot)




	############################	
	# qualitative: begin
	############################

	if param2['domain']['type']=="qualitatif":
		

		############################	
		# qualitative, unordered: begin
		############################	
		

		if param2['domain']['order']=="false":

			no_succ_elements=list()
			succ_elements=list()

			for  succ_key in param2["evolution"]["succesors"]:
				
				succ_elements.append(list())

			######################### split the elements into several sets according to the succession rule
				
			for param1_element in param1:

				succ_index=0
				bool_succ=False

				for succ_key in param2["evolution"]["succesors"]:

					value_pr=egg[param1_element][param3][param4-1]

					if succ_key == value_pr :

						succ_elements[succ_index].append(param1_element)

						bool_succ=True

					succ_index=succ_index+1		

				if bool_succ == False:

					no_succ_elements.append(param1_element)

			######################### end

			######################### assign values to the elements with no successor rule
			try:
				random =  list(randint.rvs(0, len(param2['domain']['values']), size=len(no_succ_elements)))
			except:
				print '\n***************\n*\n*	Error: Check the configuration of property: ',param3,'\n*\n***************'
			for elementId in no_succ_elements:

				egg[elementId][param3].insert(param4,param2['domain']['values'][random.pop()])

			######################### end

			######################### assign values to the elements in succ_elements

			succ_index =0

			for  succ_key in param2["evolution"]["succesors"]:

				succ_list=succ_elements[succ_index]
				try:
					random =  list(randint.rvs(0, len(param2['evolution']['succesors'][succ_key]), size=len(succ_list)))
				except:
					print '\n***************\n*\n*	Error: Check the configuration of property: ',param3,'\n*\n***************'
				for elementId in succ_list:

					egg[elementId][param3].insert(param4,param2['evolution']['succesors'][succ_key][random.pop()])

				succ_index=succ_index+1

			######################### end

		############################	
		# qualitative, unordered: end
		############################



		############################	
		# qualitative, ordered: begin
		############################
		
		else:


			offset_list=list()
			try:
				for m in range(0,param2['evolution']['offset']['max']-param2['evolution']['offset']['min']+1):
					### without the +1 in the range, m would not reach the largest offset
					offset_list.append(param2['evolution']['offset']['min'] + m)
			except:
				print '\n***************\n*\n*	Error: Check the configuration of property: ',param3,'\n*\n***************'
			############################	
			# uniform:begin
			############################
	
			if param2['evolution']['offset']['distribution']['type']=="uniform":

				try:
					random =  randint.rvs(0, len(offset_list), size=len(param1))
				except:
					print '\n***************\n*\n*	Error: Check the configuration of property: ',param3,'\n*\n***************'
				i=0
				for param1_element in param1 :

					value_pr=egg[param1_element][param3][param4-1] # value of the previous element, needed for offset
					indice=param2['domain']['values'].index(value_pr)

					if len(param2['domain']['values'])-1<indice+offset_list[random[i]]:########## enter here only when indice+offset_list[random[0]] is bigger than the biggest index
						egg[param1_element][param3].insert(param4,param2['domain']['values'][len(param2['domain']['values'])-1])##### we take the last value
					elif indice+offset_list[random[i]]<0:
						egg[param1_element][param3].insert(param4,param2['domain']['values'][0]) ######### we take the first value
					else:
						egg[param1_element][param3].insert(param4,param2['domain']['values'][indice+offset_list[random[i]]])	
					i=i+1
			############################	
			# uniform:end
			############################


			############################	
			# binom:begin
			############################
			if param2['evolution']['offset']['distribution']['type']=="binom":
				try:
					random =  binom.rvs(len(offset_list)-1,param2['evolution']['offset']['distribution']["p"] , size=len(param1))
				except:
					print '\n***************\n*\n*	Error: Check the configuration of property: ',param3,'\n*\n***************'
				i=0
				for param1_element in param1 :

					value_pr=egg[param1_element][param3][param4-1] # value of the previous element, needed for offset
					indice=param2['domain']['values'].index(value_pr)


					#logging.info( param3+param1+str(param4)+str(indice)+str(offset_list)+str(random[0]))
					if len(param2['domain']['values'])-1<indice+offset_list[random[i]]:########## enter here only when indice+offset_list[random[0]] is bigger than the biggest index
						egg[param1_element][param3].insert(param4,param2['domain']['values'][len(param2['domain']['values'])-1]) ##### we take the last value
					elif indice+offset_list[random[i]]<0:
						egg[param1_element][param3].insert(param4,param2['domain']['values'][0]) ######### we take the first value
					else:
						egg[param1_element][param3].insert(param4,param2['domain']['values'][indice+offset_list[random[i]]])
					i=i+1

			############################	
			# binom:end
			############################

		############################	
		# qualitative, ordered: end
		############################



	############################	
	# qualitative: end
	############################


	############################	
	# quantitative (discrete): begin
	############################

	if param2['domain']['type']=="quantitatif:dis":
		

		offset_list=list()
		try:
			for m in range(0,param2['evolution']['offset']['max']-param2['evolution']['offset']['min']+1):
				### the +1 was added because m otherwise did not reach the largest offset value
				offset_list.append(param2['evolution']['offset']['min'] + m)
		except:
			print '\n***************\n*\n*	Error: Check the configuration of property: ',param3,'\n*\n***************'

		try:
			random =  binom.rvs(len(offset_list)-1,param2['evolution']['offset']['distribution']["p"] , size=len(param1))
		except:
			print '\n***************\n*\n*	Error: Check the configuration of property: ',param3,'\n*\n***************'
		i=0
		for param1_element in param1 :

			value_pr=egg[param1_element][param3][param4-1] # value of the previous element, needed for offset
				
			### the -1 keeps random between 0 and the largest index of offset_list (its length - 1)
			
			previous_value=value_pr
			next_value=previous_value+offset_list[random[i]]
			if next_value < param2["domain"]["values"]["min"]:
				egg[param1_element][param3].insert(param4,param2["domain"]["values"]["min"])
			elif next_value >param2["domain"]["values"]["max"]: 
				egg[param1_element][param3].insert(param4,param2["domain"]["values"]["max"])
			else:
				egg[param1_element][param3].insert(param4,next_value)

			i=i+1



	############################	
	# quantitative (discrete): end
	############################








	############################	
	# quantitative (continuous): begin
	############################

	if param2['domain']['type']=="quantitatif:con":
		
		
		random = norm.rvs(size=len(param1))

		i=0
		for param1_element in param1 :

			value_pr=egg[param1_element][param3][param4-1] # value of the previous element, needed for offset

			
			previous_value=value_pr
			try:
				offset = (random[i]*param2['evolution']['offset']['distribution']['sigma'])+param2['evolution']['offset']['distribution']['mean']
			except:
				print '\n***************\n*\n*	Error: Check the configuration of property: ',param3,'\n*\n***************'
			
			if offset > param2['evolution']['offset']['max']:
				offset = param2['evolution']['offset']['max']
			elif offset < param2['evolution']['offset']['min']:
				offset = param2['evolution']['offset']['min']

			next_value=round(previous_value+offset,1)
			
			if next_value < param2["domain"]["values"]["min"]:
				egg[param1_element][param3].insert(param4,param2["domain"]["values"]["min"])
			elif next_value >param2["domain"]["values"]["max"]: 
				egg[param1_element][param3].insert(param4,param2["domain"]["values"]["max"])
			else:
				egg[param1_element][param3].insert(param4,next_value)

			i=i+1
	############################	
	# quantitative (continuous): end
	############################

	return egg
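
# Minimal illustration (added; hypothetical names) of the ordered-offset update
# used above: an offset index is drawn from a binomial distribution over the
# allowed offsets, added to the index of the previous value, and clamped to the
# domain boundaries.
def _offset_step_sketch(previous_value, domain_values, offset_min=-1, offset_max=1, p=0.5):
    from scipy.stats import binom

    offset_list = list(range(offset_min, offset_max + 1))
    draw = binom.rvs(len(offset_list) - 1, p)        # index into offset_list
    # assumes previous_value is one of domain_values
    idx = domain_values.index(previous_value) + offset_list[draw]
    idx = min(max(idx, 0), len(domain_values) - 1)   # clamp to the domain
    return domain_values[idx]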
Example #38
def identify_reporter_metabolites(cobra_model,
                                  reaction_scores_dict,
                                  number_of_randomizations=1000,
                                  scoring_metric='default',
                                  score_type='p',
                                  entire_network=False,
                                  background_correction=True,
                                  ignore_external_boundary_reactions=False):
    """Calculate the aggregate Z-score for the metabolites in the model.
    Ignore reactions that are solely spontaneous or orphan. Allow the scores to
    have multiple columns / experiments.   This will change the way the output
    is represented.

    cobra_model: A cobra.Model object

    TODO: CHANGE TO USING DICTIONARIES for the_reactions: the_scores

    reaction_scores_dict:  A dictionary where the keys are reactions in
    cobra_model.reactions and the values are the scores.  Currently, only
    supports a single numeric value as the value; however, this will be updated
    to allow for lists

    number_of_randomizations: Integer.  Number of random shuffles of the
    scores to assess which are significant.

    scoring_metric: default means divide by k**0.5

    score_type: 'p' Is the only option at the moment and indicates p-value.

    entire_network: Boolean. Currently, only compares scores calculated from
    the_reactions

    background_correction: Boolean.  If True apply background correction to the
    aggregate Z-score

    ignore_external_boundary_reactions: Not yet implemented. Boolean.  If True
    do not count exchange reactions when calculating the score.
    """

    # Add in a function to calculate based on correlation coefficients and to
    # deal with other multidimensional data.
    the_reactions = list(reaction_scores_dict.keys())
    the_scores = list(reaction_scores_dict.values())
    if score_type == 'p' and not hasattr(the_scores[0], '__iter__'):
        # minimum and maximum p-values are used to prevent numerical problems.
        # haven't decided whether an arbitrary min / max 1e-15 is preferred to
        # blunting the ends based on the values closest to 0 or 1.
        the_reactions = list(reaction_scores_dict.keys())
        the_scores = array(list(reaction_scores_dict.values()))
        minimum_p = min(the_scores[the_scores.nonzero()[0]])
        maximum_p = max(the_scores[where(the_scores < 1)[0]])
        the_scores[where(the_scores < minimum_p)] = minimum_p
        the_scores[where(the_scores > maximum_p)] = maximum_p
        the_scores = -norm.ppf(the_scores)
        # update the dictionary with the new scores
        reaction_scores_dict = dict(zip(the_reactions, the_scores))
    elif hasattr(the_scores[0], '__iter__'):
        # In the case that the_scores is a list of lists, assume that each list
        # is the score for each reaction in the_reactions across all reactions.
        # Then for each metabolite, calculate the invnorm(|Pearson Correlation
        # Coefficient| for each reaction pair that it links.
        raise Exception("This isn't implemented yet")

    # Get the connectivity for each metabolite
    the_metabolites = set()
    for x in reaction_scores_dict:
        the_metabolites.update(x._metabolites)

    metabolite_scores = {}
    metabolite_connections = {}
    # Calculate the score for each metabolite
    for the_metabolite in the_metabolites:
        nonspontaneous_connections = [
            x for x in the_metabolite._reaction
            if x.gene_reaction_rule.lower() not in ['s0001', '']
        ]
        tmp_score = 0
        number_of_connections = len(nonspontaneous_connections)
        for the_reaction in nonspontaneous_connections:
            if the_reaction not in reaction_scores_dict:
                if not entire_network:
                    number_of_connections -= 1
                continue
            else:
                tmp_score += reaction_scores_dict[the_reaction]
        metabolite_scores[the_metabolite] = tmp_score
        metabolite_connections[the_metabolite] = number_of_connections

    # NOTE: Doing the corrections based only on the significantly perturbed
    # scores is probably going to underestimate the significance.
    if background_correction:
        correction_dict = {}
        for i in set(metabolite_connections.values()):
            # if entire_network # add in a section to deal with the situation
            # where the entire network structure is considered by only have
            # p-values for a limited subset.
            #
            # Basically, what we're doing here is that for each i we select i
            # scores number_of_randomizations times
            the_random_indices = randint.rvs(0,
                                             len(the_scores),
                                             size=(number_of_randomizations,
                                                   i))
            random_score_distribution = array(
                [sum(the_scores[x])
                 for x in list(the_random_indices)]) / i**0.5
            correction_dict[i] = [
                mean(random_score_distribution),
                std(random_score_distribution, ddof=1)
            ]

    for the_metabolite, the_score in iteritems(metabolite_scores):
        number_of_connections = metabolite_connections[the_metabolite]
        if number_of_connections > 0:
            # Correct based on background distribution
            if background_correction:
                # if the list of scores is only for significant perturbations
                # then the background correction shouldn't be applied because
                # the current sampling method only takes into account
                # the_scores not the entire network.  It'd be more accurate to
                # assign unscored reactions a default score.
                the_score = ((the_score / number_of_connections**.5) -
                             correction_dict[number_of_connections][0]) / \
                    correction_dict[number_of_connections][1]
            else:
                the_score = the_score / number_of_connections**.5
            # Update the score
            metabolite_scores[the_metabolite] = the_score

    return_dictionary = {
        'scores': metabolite_scores,
        'connections': metabolite_connections
    }
    if background_correction:
        return_dictionary['corrections'] = correction_dict

    return return_dictionary
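
# Small self-contained sketch (added for illustration) of the p-value guard
# described in the docstring above: p-values of exactly 0 or 1 are clipped to
# the nearest interior values before conversion, so that -norm.ppf never
# returns +/- inf.
def _p_to_z_sketch(p_values):
    import numpy as np
    from scipy.stats import norm

    p = np.asarray(p_values, dtype=float)
    interior = p[(p > 0) & (p < 1)]
    p = np.clip(p, interior.min(), interior.max())   # blunt the 0 / 1 ends
    return -norm.ppf(p)                              # larger z = more significant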
Example #39
def identify_reporter_metabolites(cobra_model, reaction_scores_dict,
                                  number_of_randomizations=1000, number_of_layers=1,
                                  scoring_metric='default', score_type='p',
                                  entire_network=False, background_correction=True,
                                  ignore_external_boundary_reactions=False):
    """Calculate the aggregate Z-score for the metabolites in the model.
    Ignore reactions that are solely spontaneous or orphan. Allow the scores to
    have multiple columns / experiments.   This will change the way the output
    is represented.

    cobra_model: A cobra.Model object

    TODO: CHANGE TO USING DICTIONARIES for the_reactions: the_scores

    reaction_scores_dict:  A dictionary where the keys are reactions in cobra_model.reactions
    and the values are the scores.  Currently, only supports a single numeric value as
    the value; however, this will be updated to allow for lists

    number_of_randomizations: Integer.  Number of random shuffles of the
    scores to assess which are significant.

    number_of_layers: 1 is the only option supported
    
    scoring_metric:  default means divide by k**0.5

    score_type: 'p' Is the only option at the moment and indicates p-value.

    entire_network: Boolean.  Currently, only compares scores calculated from the_reactions

    background_correction: Boolean.  If True apply background correction to the aggregate
    Z-score

    ignore_external_boundary_reactions: Not yet implemented. Boolean.  If True do not count exchange reactions when
    calculating the score.
    """
    #Add in a function to calculate based on correlation coefficients and to
    #deal with other multidimensional data. 
    the_reactions = reaction_scores_dict.keys()
    the_scores = reaction_scores_dict.values()
    if score_type == 'p' and not hasattr(the_scores[0], '__iter__'):
        #minimum and maximum p-values are used to prevent numerical problems.
        #haven't decided whether an arbitrary min / max 1e-15 is preferred to
        #blunting the ends based on the values closest to 0 or 1.
        the_reactions = reaction_scores_dict.keys()
        the_scores = array(reaction_scores_dict.values())
        minimum_p = min(the_scores[the_scores.nonzero()[0]])
        maximum_p = max(the_scores[where(the_scores < 1)[0]])
        the_scores[where(the_scores < minimum_p)] = minimum_p
        the_scores[where(the_scores > maximum_p)] = maximum_p
        the_scores = -norm.ppf(the_scores)
        #update the dictionary with the new scores
        reaction_scores_dict = dict(zip(the_reactions, the_scores))
    elif hasattr(the_scores[0], '__iter__'):
        #In the case that the_scores is a list of lists, assume that each list is
        #the score for each reaction in the_reactions across all reactions.  Then
        #for each metabolite, calculate the invnorm(|Pearson Correlation
        #Coefficient| for each reaction pair that it links.
        raise Exception("This isn't implemented yet")
    
    #Get the connectivity for each metabolite
    the_metabolites = set()
    for x in reaction_scores_dict:
        the_metabolites.update(x._metabolites)

    metabolite_scores = {}
    metabolite_connections = {}
    #Calculate the score for each metabolite
    for the_metabolite in the_metabolites:
        nonspontaneous_connections = [x for x in the_metabolite._reaction
                                      if x.gene_reaction_rule.lower() not in
                                      ['s0001', '']]
        tmp_score = 0
        number_of_connections = len(nonspontaneous_connections)
        for the_reaction in nonspontaneous_connections:
            if the_reaction not in reaction_scores_dict:
                if not entire_network:
                    number_of_connections -= 1
                continue
            else:
                tmp_score += reaction_scores_dict[the_reaction]
        metabolite_scores[the_metabolite] = tmp_score
        metabolite_connections[the_metabolite] = number_of_connections

    #NOTE: Doing the corrections based only on the significantly perturbed scores
    #is probably going to underestimate the significance.
    if background_correction:
        correction_dict = {}
        for i in set(metabolite_connections.values()):
            #if entire_network # add in a section to deal with the situation where
            #the entire network structure is considered by only have p-values for
            #a limited subset.
            #
            #Basically, what we're doing here is that for each i we select i
            #scores number_of_randomizations times
            the_random_indices = randint.rvs(0,len(the_scores), size=(number_of_randomizations, i))
            random_score_distribution = array([sum(the_scores[x]) for x in list(the_random_indices)]) /i**0.5
            correction_dict[i] = [mean(random_score_distribution),
                                      std(random_score_distribution,ddof=1)] 

    for the_metabolite, the_score in metabolite_scores.iteritems():
        number_of_connections = metabolite_connections[the_metabolite]
        if number_of_connections > 0:
            #Correct based on background distribution
            if background_correction:
                #if the list of scores is only for significant perturbations then the
                #background correction shouldn't be applied because the current sampling
                #method only takes into account the_scores not the entire network.
                #It'd be more accurate to assign unscored reactions a default score.
                the_score = ((the_score / number_of_connections**.5) -
                             correction_dict[number_of_connections][0]) / \
                             correction_dict[number_of_connections][1]
            else:
                the_score = the_score / number_of_connections**.5
            #Update the score
            metabolite_scores[the_metabolite] = the_score



    return_dictionary = {'scores': metabolite_scores,
                         'connections': metabolite_connections}
    if background_correction:
        return_dictionary['corrections'] = correction_dict

    return(return_dictionary)
Example #40
    def _fit(self, X, y):
        from scipy.stats import randint
        # draw 10 random row indices from y
        randidx = randint.rvs(0, len(y), size=10)
        # count how often each index was drawn and keep the most frequently drawn one
        counts = np.bincount(randidx)
        self.majority_ = np.argmax(counts)
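
# For comparison, a self-contained sketch (hypothetical, not part of the original
# snippet) of a majority-label baseline: here the bincount is taken over the
# labels at the sampled indices rather than over the indices themselves.
# Assumes non-negative integer class labels.
import numpy as np
from scipy.stats import randint

def majority_label_sketch(y, n_samples=10):
    y = np.asarray(y)
    randidx = randint.rvs(0, len(y), size=n_samples)   # random row indices
    counts = np.bincount(y[randidx])                    # count the sampled labels
    return np.argmax(counts)                            # most frequent label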