def __init__(self, X=None, Y=None, min_parent=2, max_depth=np.inf, min_score=-1, n_features=None):
    """
    Constructor for TreeRegressor (decision tree regression model).

    Parameters
    ----------
    X : numpy array
        N x M numpy array which contains N data points with M features.
    Y : numpy array
        1 x N numpy array that contains values that relate to the data points in X.
    min_parent : int
        Minimum number of data required to split a node.
    max_depth : int
        Maximum depth of the decision tree.
    min_score : int
        Minimum value of score improvement to split a node.
    n_features : int
        Number of available features for splitting at each node.
    """
    self.L = arr([0])           # indices of left children
    self.R = arr([0])           # indices of right children
    self.F = arr([0])           # feature to split on (-1 = leaf = predict)
    self.T = arr([0])           # threshold to split on (prediction value if leaf)

    if type(X) is np.ndarray and type(Y) is np.ndarray:
        self.train(X, Y, min_parent, max_depth, min_score, n_features)      # train if data is provided
def drawing_housing_units_nogqs(db, frequencies, weights, index_matrix, sp_matrix, pumano = 0):

    dbc = db.cursor()
    dbc.execute('select hhlduniqueid from hhld_sample group by hhlduniqueid')
    hhld_colno = dbc.rowcount

    hh_colno = hhld_colno
    synthetic_population=[]
    j = 0
    for i in index_matrix[:hh_colno,:]:
        if i[1] == i[2] and frequencies[j]>0:
            synthetic_population.append([sp_matrix[i[1]-1, 2] , frequencies[j], i[0]])
        else:
            cumulative_weights = weights[sp_matrix[i[1]-1:i[2], 2]].cumsum()
            probability_distribution = cumulative_weights / cumulative_weights[-1]
            probability_lower_limit = probability_distribution.tolist()
            probability_lower_limit.insert(0,0)
            probability_lower_limit = arr(probability_lower_limit)
            random_numbers = random.rand(frequencies[j])
            freq, probability_lower_limit = histogram(random_numbers, probability_lower_limit)
            hhldid_by_type = sp_matrix[i[1]-1:i[2],2]

            for k in range(len(freq)):
                if freq[k] != 0:
                    #hhid = hhidRowDict[hhldid_by_type[k]]
                    # storing the matrix row no, freq, type
                    synthetic_population.append([hhldid_by_type[k], freq[k], i[0]])
        j = j + 1

    dbc.close()
    db.commit()
    return arr(synthetic_population, int)
def __min_weighted_var(self, tsorted, can_split, n): """ This is a helper method that finds the minimum weighted variance among all split points. Used in: __dectree_train """ # compute mean up to and past position j (for j = 0..n) y_cum_to = np.cumsum(tsorted, axis=0) y_cum_pa = y_cum_to[-1] - y_cum_to mean_to = y_cum_to / arr(range(1, n + 1)) mean_pa = y_cum_pa / arr(list(range(n - 1, 0, -1)) + [1]) # compute variance up to, and past position j (for j = 0..n) y2_cum_to = np.cumsum(np.power(tsorted, 2), axis=0) y2_cum_pa = y2_cum_to[-1] - y2_cum_to var_to = (y2_cum_to - 2 * mean_to * y_cum_to + list(range(1, n + 1)) * np.power(mean_to, 2)) / list(range(1, n + 1)) var_pa = (y2_cum_pa - 2 * mean_pa * y_cum_pa + list(range(n - 1, -1, -1)) * np.power(mean_pa, 2)) / arr(list(range(n - 1, 0, -1)) + [1]) var_pa[-1] = np.inf # find minimum weighted variance among all split points weighted_variance = arr(range(1, n + 1)) / n * var_to + arr(range(n - 1, -1, -1)) / n * var_pa val = np.nanmin((weighted_variance + 1) / (can_split + 1e-100)) # nan versions of min functions must be used to ignore nans idx = np.nanargmin((weighted_variance + 1) / (can_split + 1e-100)) # find only splittable points return (val,idx)
def predict(self, X):
    """
    This method makes a nearest neighbor prediction on test data X.

    Parameters
    ----------
    X : numpy array
        N x M numpy array that contains N data points with M features.
    """
    n_tr,m_tr = arr(self.X_train).shape             # get size of training data
    n_te,m_te = arr(X).shape                        # get size of test data

    if m_tr != m_te:
        raise ValueError('knnRegress.predict: training and prediction data must have the same number of features')

    Y_te = np.tile(self.Y_train[0], (n_te, 1))      # make Y_te the same data type as Y_train
    K = min(self.K, n_tr)                           # can't have more than n_tr neighbors

    for i in range(n_te):
        dist = np.sum(np.power((self.X_train - X[i]), 2), axis=1)   # compute sum of squared differences
        sorted_dist = np.sort(dist, axis=0)[:K]                     # find nearest neighbors over X_train and...
        sorted_idx = np.argsort(dist, axis=0)[:K]                   # ...keep nearest K data points
        wts = np.exp(-self.alpha * sorted_dist)
        Y_te[i] = arr(wts).dot(arr(self.Y_train[sorted_idx]).T) / np.sum(wts)   # weighted average

    return Y_te
def __init__(self, *args, **kwargs): """ Constructor for treeRegress (decision tree regression model) Parameters: see "train" function; calls "train" if arguments passed Properties (internal use only) L,R : indices of left & right child nodes in the tree F,T : feature index & threshold for decision (left/right) at this node for leaf nodes, T[n] holds the prediction for leaf node n """ self.L = arr([0]) # indices of left children self.R = arr([0]) # indices of right children self.F = arr([0]) # feature to split on (-1 = leaf = predict) self.T = arr([0]) # threshold to split on (prediction value if leaf) self.information_gain = dict() self.nX = dict() #keeps track of remaining data on that branch self.nY = dict() #left branch and right branch # self.bestval = dict() self.div = defaultdict(list) # [best_feat,best_thresh] self.gain = defaultdict(int) # best_val if len(args) or len(kwargs): # if we were given optional arguments, self.train(*args, **kwargs) # just pass them through to "train"
def predictSoft(self, X):
    """
    This method makes a "soft" nearest-neighbor prediction on test data.

    Parameters
    ----------
    X : M x N numpy array
        M = number of testing instances; N = number of features.
    """
    mtr,ntr = arr(self.X_train).shape      # get size of training data
    mte,nte = arr(X).shape                 # get size of test data
    if nte != ntr:
        raise ValueError('Training and prediction data must have same number of features')

    num_classes = len(self.classes)
    prob = np.zeros((mte,num_classes))     # allocate memory for class probabilities
    K = min(self.K, mtr)                   # (can't use more neighbors than training data points)
    for i in range(mte):                   # for each test example...
        # ...compute sum of squared differences...
        dist = np.sum(np.power(self.X_train - arr(X)[i,:], 2), axis=1)
        # ...find nearest neighbors over training data and keep nearest K data points
        sorted_dist = np.sort(dist, axis=0)[0:K]
        indices = np.argsort(dist, axis=0)[0:K]
        wts = np.exp(-self.alpha * sorted_dist)
        count = []
        for c in range(len(self.classes)):   # total weight of instances of that class
            count.append(np.sum(wts[self.Y_train[indices] == self.classes[c]]))
        count = np.asarray(count)
        prob[i,:] = np.divide(count, np.sum(count))   # save (soft) results
    return prob
def test_bothModels(self): fun1 = functions.DistanceToCircle(arr([ 10, 10]), .5) fun2 = functions.DistanceToCircle(arr([-10, -10]), 5) set = dfo_model.MultiFunctionModel([fun1, fun2], self.b, self.center, self.radius) set.improve(None) center = arr([3,4]) for i in range(50): print("testing " + str(i) + " of " + str(50)) rFactor = self.getRFactor() newRadius = set.modelRadius * rFactor center = center + set.modelRadius / newRadius set.testNewModelCenter(center) set.setNewModelCenter(center) set.multiplyRadius(rFactor) set.improve('images/test_both_%04d_improve.png' % i) quadmod1 = set.getQuadraticModels(arr([0, 1], int)) quadmod2 = set.getQuadraticModels2(arr([0, 1], int)) for j in range(10): x = center + 10 * (2 * random.random(2) - 1) y1 = quadmod1.evaluate(x) y2 = quadmod2.evaluate(x) self.assertTrue(norm(y1 - y2) < self.tolerance) y1 = quadmod1.jacobian(x) y2 = quadmod2.jacobian(x) self.assertTrue(norm(y1 - y2) < self.tolerance)
def plot_lum(): clf() j_3min = [8052.06, 3050.04, 324.251, 20082.0, 1443.05, 1070.26, 1879.54, 3210.33, 312.932, 233.877, 714.423, 112.846, 126.616] j_3min2 = [8052.06, 3050.04, 324.251, 1443.05, 1070.26, 1879.54, 3210.33, 312.932, 233.877, 714.423, 112.846, 126.616] j_3min3 = [3050.04, 324.251, 1443.05, 1070.26, 1879.54, 3210.33, 312.932, 233.877, 714.423, 112.846, 126.616] j_3min = [8052.06, 3050.04, 324.251, 20082.0, 1443.05, 1070.26, 1879.54, 3210.33, 312.932, 233.877, 714.423, 188.211, 1594, 57.29, 833466.82317] #convert to cgs from microjansky: j_3min = arr(j_3min)*10**(-29) #convert to AB magnitude: j_3min = -2.5*numpy.log10(j_3min) - 48.60 hist(j_3min,13) xlabel('$m_j$', fontsize=28) ylabel('Number', fontsize=28) yticks(arr([0, 1., 2., 3., 4.])) ax = matplotlib.pyplot.gca() ax.set_xlim(ax.get_xlim()[::-1]) # reversing the xlimits savefig('Lum_dist.eps') clf() # hist(j_3min,20,cumulative=True, histtype='step') # hist(j_3min2,20,cumulative=True, histtype='step') # hist(j_3min3,20,cumulative=True, histtype='step') #ylim(0,14) #xlim(-1000,22000) #xlabel('J Flux at 3 Minutes (Micro Jansky)') # savefig('lum_dist.eps') return j_3min
def drawing_housing_units(db, frequencies, weights, index_matrix, sp_matrix, pumano=0):

    dbc = db.cursor()
    dbc.execute("select hhlduniqueid from hhld_pums group by hhlduniqueid")
    hhld_colno = dbc.rowcount
    dbc.execute("select gquniqueid from gq_pums group by gquniqueid")
    gq_colno = dbc.rowcount

    hh_colno = hhld_colno + gq_colno
    synthetic_population = []
    j = 0
    for i in index_matrix[:hh_colno, :]:
        if i[1] == i[2] and frequencies[j] > 0:
            synthetic_population.append([sp_matrix[i[1] - 1, 2] + 1, frequencies[j], i[0]])
            print "hhid single", sp_matrix[i[1] - 1, 2]
        else:
            cumulative_weights = weights[sp_matrix[i[1] - 1 : i[2], 2]].cumsum()
            probability_distribution = cumulative_weights / cumulative_weights[-1]
            probability_lower_limit = probability_distribution[:-1].tolist()
            probability_lower_limit.insert(0, 0)
            probability_lower_limit = arr(probability_lower_limit)
            random_numbers = random.rand(frequencies[j])
            freq, probability_lower_limit = histogram(random_numbers, probability_lower_limit)
            hhldid_by_type = sp_matrix[i[1] - 1 : i[2], 2]

            for k in range(len(freq)):
                if freq[k] != 0:
                    synthetic_population.append([hhldid_by_type[k] + 1, freq[k], i[0]])
        j = j + 1

    dbc.close()
    db.commit()
    return arr(synthetic_population)
def addVars(self): bus,branch,_,_, n,nl,_,_,_,_,gens = self.data + self.aux if self.verbose: print 'defining variables' INF = 1e100 if self.solver == 'cplex': p = ['p_%d'%i for i in gens] a = ['a_%d'%i for i in gens] D = ['D_%d'%i for i in bus] t = ['t_%d'%i for i in bus] m = ['m{}'.format(i['id']) for i in branch] s = ['s{}'.format(i['id']) for i in branch] self.M.variables.add(names = p + a) self.M.variables.add(names = D + t, lb = [-INF]*2*n) #self.M.variables.add(names = m, lb = [-INF]*nl) #self.M.variables.add(names = s) self.M.variables.add(names = m + s, lb = [-INF]*2*nl) D, t = arr(D), arr(t) self.var = (p, a, D, t, m, s) else: p = {i: self.M.addVar(name='pbar_%d'%i) for i in gens} a = {i: self.M.addVar(name='alpha_%d'%i) for i in gens} D = {i: self.M.addVar(lb=-INF, name='delta_%d'%i) for i in bus} t = {i: self.M.addVar(lb=-INF, name='theta_%d'%i) for i in bus} m = {i['id']: self.M.addVar(lb=-INF, name='fbar{}'.format(i['id'])) for i in branch} s = {i['id']: self.M.addVar(lb=-INF, name='std{}'.format(i['id'])) for i in branch} self.var = (p, a, D, t, m, s) self.M.update()
def init_weights(self, sizes, init='zeros', X=None, Y=None): """ This method initializes the weights of the neural network. Set layer sizes to S = [Ninput, N1, N2, ... Noutput] and set using 'fast' method ('none', 'random', 'zeros'). Refer to constructor doc string for argument descriptions. TODO: implement autoenc implement regress """ init = init.lower() if init == 'none': pass # no init: do nothing elif init == 'zeros': self.wts = arr([np.zeros((sizes[i + 1], sizes[i] + 1)) for i in range(len(sizes) - 1)], dtype=object) elif init == 'random': self.wts = arr([.25 * np.random.randn(sizes[i + 1], sizes[i] + 1) for i in range(len(sizes) - 1)], dtype=object) elif init == 'autoenc': pass elif init == 'regress': pass else: raise ValueError('NNetRegress.init_weights: \'' + init + '\' is not a valid argument for init')
def MixedN(ls): """ ls: a list of either lists or dictionaries. """ if (len(ls)==1): if type(ls[0])==list: return [item/float(sum(ls[0])) for item in ls[0]] elif type(ls[0])==dict: return {key:value/float(sum(ls[0].values())) for key, value in ls[0].items()} lamb = 1.0/len(ls) if (sum([type(it)==list for it in ls])==len(ls)): total=arr([0]*len(ls[0])); for it in ls: total= total + arr([n/float(sum(it)) for n in it]) mix = total*lamb return mix elif (sum([type(it)==dict for it in ls])==len(ls)): keys=set([]) for it in ls: keys.update(set(it.keys())) mix={key:sum([(float(1)/sum(it.values()))*it.get(key, 0)*lamb for it in ls]) for key in keys} return mix
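# Illustrative usage sketch (not part of the original source): MixedN normalizes
# each input distribution to sum to 1 and returns their equal-weight mixture.
# Assumes `arr` is numpy.array, as elsewhere in these snippets; the demo
# function name and values are hypothetical.
def _mixedn_demo():
    lists_mix = MixedN([[1, 3], [2, 2]])                      # -> array([0.375, 0.625])
    dicts_mix = MixedN([{'a': 1, 'b': 1}, {'a': 3, 'b': 1}])  # -> {'a': 0.625, 'b': 0.375}
    return lists_mix, dicts_mix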
def create_adjusted_frequencies(db, synthesis_type, control_variables, pumano, tract= 0, bg= 0):
    dbc = db.cursor()
    dummy_order_string = create_aggregation_string(control_variables)
    puma_table = ('%s_%s_joint_dist'%(synthesis_type, pumano))
    pums_table = ('%s_%s_joint_dist'%(synthesis_type, 0))

    dbc.execute('select * from %s where tract = %s and bg = %s order by %s' %(puma_table, tract, bg, dummy_order_string))
    puma_joint = arr(dbc.fetchall(), float)
    puma_prob = puma_joint[:,-2] / sum(puma_joint[:,-2])
    upper_prob_bound = 0.5 / sum(puma_joint[:,-2])

    dbc.execute('select * from %s order by %s' %(pums_table, dummy_order_string))
    pums_joint = arr(dbc.fetchall(), float)
    pums_prob = pums_joint[:,-2] / sum(pums_joint[:,-2])

    puma_adjustment = (pums_prob <= upper_prob_bound) * pums_prob + (pums_prob > upper_prob_bound) * upper_prob_bound
    correction = 1 - sum((puma_prob == 0) * puma_adjustment)
    puma_prob = ((puma_prob != 0) * correction * puma_prob + (puma_prob == 0) * puma_adjustment)
    puma_joint[:,-2] = sum(puma_joint[:,-2]) * puma_prob

    dbc.execute('delete from %s where tract = %s and bg = %s'%(puma_table, tract, bg))
    puma_joint_dummy = str([tuple(i) for i in puma_joint])
    dbc.execute('insert into %s values %s' %(puma_table, puma_joint_dummy[1:-1]))
    dbc.close()
    db.commit()
def __dectree_train(self, X, Y, L, R, F, T, next, depth, minParent, maxDepth, minScore, nFeatures):
    """
    This is a recursive helper method that trains the decision tree.
    Used in: train

    TODO: compare for numerical tolerance
    """
    n,d = mat(X).shape

    # check leaf conditions...
    if n < minParent or depth >= maxDepth or np.var(Y) < minScore:
        assert n != 0, ('TreeRegress.__dectree_train: tried to create size zero node')
        return self.__output_leaf(Y, n, L, R, F, T, next)

    best_val = np.inf
    best_feat = -1
    try_feat = np.random.permutation(d)

    # ...otherwise, search over (allowed) features
    for i_feat in try_feat[0:nFeatures]:
        dsorted = arr(np.sort(X[:,i_feat].T)).ravel()               # sort data...
        pi = np.argsort(X[:,i_feat].T)                              # ...get sorted indices...
        tsorted = Y[pi].ravel()                                     # ...and sort targets by feature ID
        can_split = np.append(arr(dsorted[:-1] != dsorted[1:]), 0)  # which indices are valid split points?

        if not np.any(can_split):          # no way to split on this feature?
            continue

        # find min weighted variance among split points
        val,idx = self.__min_weighted_var(tsorted, can_split, n)

        # save best feature and split point found so far
        if val < best_val:
            best_val = val
            best_feat = i_feat
            best_thresh = (dsorted[idx] + dsorted[idx + 1]) / 2

    # if no split possible, output leaf (prediction) node
    if best_feat == -1:
        return self.__output_leaf(Y, n, L, R, F, T, next)

    # split data on feature best_feat at threshold best_thresh = (dsorted[idx] + dsorted[idx + 1]) / 2
    F[next] = best_feat
    T[next] = best_thresh
    go_left = X[:,F[next]] < T[next]
    my_idx = next
    next += 1

    # recur left
    L[my_idx] = next
    L,R,F,T,next = self.__dectree_train(X[go_left,:], Y[go_left], L, R, F, T,
        next, depth + 1, minParent, maxDepth, minScore, nFeatures)

    # recur right
    R[my_idx] = next
    L,R,F,T,next = self.__dectree_train(X[np.logical_not(go_left),:], Y[np.logical_not(go_left)], L, R, F, T,
        next, depth + 1, minParent, maxDepth, minScore, nFeatures)

    return (L,R,F,T,next)
def load_data_from_csv(csv_path, label_index, trans_func=lambda x: x):
    """
    Function that loads from a CSV into main memory.

    Parameters
    ----------
    csv_path : str
        Path to CSV file that contains data.
    label_index : int
        The index in the CSV rows that contains the label for each data point.
    trans_func : function object
        Function that transforms values in the CSV, e.g.: str -> int.

    Returns
    -------
    data,labels : (list)
        Tuple that contains a list of data points (index 0) and a list of
        labels corresponding to those data points (index 1).
    """
    data = []
    labels = []

    with open(csv_path) as f:
        csv_data = reader(f)
        for row in csv_data:
            row = list(map(trans_func, row))
            labels.append(row.pop(label_index))
            data.append(row)

    return arr(data),arr(labels)
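# Illustrative usage sketch (not from the original project): load a CSV whose
# last column holds the class label, converting every field to float.  The
# file name 'data/iris.csv' is a placeholder, and the demo function name is
# hypothetical; `reader` is assumed to be csv.reader, as the function body requires.
def _load_csv_demo():
    X, Y = load_data_from_csv('data/iris.csv', label_index=-1, trans_func=float)
    return X.shape, Y.shape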
def prepare_control_marginals(db, synthesis_type, control_variables, varCorrDict, controlAdjDict, state, county, tract, bg, hhldsizeMargsMod=False):
    dbc = db.cursor()
    marginals = database(db, '%s_marginals'%synthesis_type)
    variable_names = marginals.variables()
    control_marginals = []
    #control_marginals_sum = []
    for dummy in control_variables:
        dbc.execute('select %s from %s_sample group by %s' %(dummy, synthesis_type, dummy))
        cats = arr(dbc.fetchall(), float)
        #print dummy, cats
        selVar = dummy
        selGeography = "%s,%s,%s,%s" %(state, county, tract, bg)
        variable_marginals1=[]
        try:
            #print hhldsizeMargsMod
            if (not hhldsizeMargsMod and synthesis_type == 'hhld') or synthesis_type != 'hhld':
                #print 'household not modified in correspondence'
                variable_marginals_adj = controlAdjDict[selGeography][selVar]
                #print 'adjustment', variable_marginals_adj[0], variable_marginals_adj[1]
                for i in variable_marginals_adj[1]:
                    if i>0:
                        variable_marginals1.append(i)
                    else:
                        variable_marginals1.append(0.1)
                #check_marginal_sum = sum(variable_marginals1)
            else:
                raise Exception('Household marginal distributions modified to account for person total inconsistency')
        except Exception as e:
            #print 'Exception: %s' %e
            #check_marginal_sum = 0
            for i in cats:
                corrVar = varCorrDict['%s%s' %(dummy, int(i[0]))]
                dbc.execute('select %s from %s_marginals where county = %s and tract = %s and bg = %s' %(corrVar, synthesis_type, county, tract, bg))
                result = arr(dbc.fetchall(), float)
                #check_marginal_sum = result[0][0] + check_marginal_sum
                if result[0][0] > 0:
                    variable_marginals1.append(result[0][0])
                else:
                    variable_marginals1.append(0.1)

        #exceptionStatus = False
        #if check_marginal_sum == 0 and (synthesis_type == 'hhld'):
        #    exceptionStatus = True
        #if check_marginal_sum == 0 and (synthesis_type == 'person'):
        #    exceptionStatus = True
        #if check_marginal_sum == 0 and (synthesis_type == 'hhld' or synthesis_type == 'person'):
        #    print 'Exception: The given marginal distribution for a control variable sums to zero.'
            #raise Exception, 'The given marginal distribution for a control variable sums to zero.'

        control_marginals.append(variable_marginals1)

    return control_marginals
def populate_master_matrix(db, pumano, hhld_units, gq_units, hhld_dimensions, gq_dimensions): # First we create an empty matrix based on the dimensions of the hhhld, gq control variables hhld_types = arr(hhld_dimensions).prod() gq_types = arr(gq_dimensions).prod() # We add 2 more columns to also store the puma id, and housing pums id. Also note that the matrix indices start from 0 # Layout of the master matrix is as follows - puma id (0 th column), housing pums id, hhld types frequency, # gq types frequency total_cols = 4 + hhld_types + gq_types total_rows = hhld_units + gq_units matrix = sparse.lil_matrix((total_rows, total_cols)) # In this part we populate the matrix dbc = db.cursor() rowHhidDict = {} row = 0 for control_type in ['hhld', 'gq']: # Here we determine the starting column in the master matrix for the hhld types, gq types frequency within each home if control_type == 'hhld': start = 3 elif control_type == 'gq': start = 3 + arr(hhld_dimensions).prod() # Read the pums data from the mysql files to if pumano == 0 or pumano == 99999: dbc.execute('Select state, pumano, hhid, serialno, %suniqueid from %s_sample order by hhid' %(control_type, control_type)) else: dbc.execute('Select state, pumano, hhid, serialno, %suniqueid from %s_sample where pumano = %s order by hhid' %(control_type, control_type, pumano)) result = arr(dbc.fetchall(), int64) # Master Matrix is populated here if control_type == 'hhld': for i in result[:,2]: # Storing the pumano, housing puma id for all housing units rowHhidDict[i] = row matrix[row,:4] = result[row,:4] row = row + 1 if control_type == 'gq': for i in result[:,2]: rowHhidDict[i] = row matrix[row,:4] = result[(row - hhld_units), :4] row = row + 1 # Populating the household type, gq type for i in range(dbc.rowcount): matRow = rowHhidDict[result[i, 2]] matrix[matRow, start+result[i, -1]] = matrix[matRow, start+result[i, -1]] + 1 dbc.close() db.commit() return matrix
def create_joint_dist(db, synthesis_type, control_variables, dimensions, pumano = 0, tract = 0, bg = 0): dbc = db.cursor() pums = database(db, '%s_pums'%synthesis_type) dummy = create_aggregation_string(control_variables) table_rows = dimensions.cumprod()[-1] table_cols = len(dimensions) + 4 dummy_table = zeros((table_rows, table_cols), dtype =int) index_array = num_breakdown(dimensions) try: dbc.execute('create table %s_%s_joint_dist select %s from %s_pums where 0 '%(synthesis_type, pumano, dummy, synthesis_type)) dbc.execute('alter table %s_%s_joint_dist add pumano int first'%(synthesis_type, pumano)) dbc.execute('alter table %s_%s_joint_dist add tract int after pumano'%(synthesis_type, pumano)) dbc.execute('alter table %s_%s_joint_dist add bg int after tract'%(synthesis_type, pumano)) dbc.execute('alter table %s_%s_joint_dist add frequency float(27)'%(synthesis_type, pumano)) dbc.execute('alter table %s_%s_joint_dist add index(tract, bg)'%(synthesis_type, pumano)) except: # print 'Table %s_%s_joint_dist present' %(synthesis_type, pumano) pass variable_list = 'pumano, tract, bg, ' for i in control_variables: variable_list = variable_list + i + ', ' variable_list = variable_list + 'frequency' if pumano ==0: dbc.execute('select %s, count(*), %suniqueid from %s_pums group by %s '%(dummy, synthesis_type, synthesis_type, dummy)) #print ('select %s, count(*), %suniqueid from %s_pums group by %s '%(dummy, synthesis_type, synthesis_type, dummy)) result = arr(dbc.fetchall()) dummy_table[:,:3] = [pumano, tract, bg] dummy_table[:,3:-1] = index_array dummy_table[result[:,-1]-1,-1] = result[:,-2] else: dbc.execute('select %s, count(*), %suniqueid from %s_pums where pumano = %s group by %s '%(dummy, synthesis_type, synthesis_type, pumano, dummy)) result = arr(dbc.fetchall()) dummy_table[:,:3] = [pumano, tract, bg] dummy_table[:,3:-1] = index_array dummy_table[result[:,-1]-1,-1] = result[:,-2] dbc.execute('delete from %s_%s_joint_dist where tract = %s and bg = %s' %(synthesis_type, pumano, tract, bg)) dummy_table = str([tuple(i) for i in dummy_table]) #try: # dbc.execute('alter table %s_%s_joint_dist drop column %suniqueid' %(synthesis_type, pumano, synthesis_type)) #except: # pass dbc.execute('insert into %s_%s_joint_dist (%s) values %s' %(synthesis_type, pumano, variable_list, dummy_table[1:-1])) dbc.close() update_string = create_update_string(db, control_variables, dimensions) add_unique_id(db, '%s_%s_joint_dist' %(synthesis_type, pumano), synthesis_type, update_string) db.commit()
def create_whole_frequencies(db, synthesis_type, order_string, pumano = 0, tract = 0, bg = 0): dbc = db.cursor() table_name = ('%s_%s_ipf'%(synthesis_type, pumano)) try: dbc.execute('create table %s select pumano, tract, bg, frequency from hhld_%s_joint_dist where 0;' %(table_name, pumano)) dbc.execute('alter table %s change frequency marginal float(27)'%(table_name)) dbc.execute('alter table %s add prior int default 0' %(table_name)) dbc.execute('alter table %s add r_marginal int default 0'%(table_name)) dbc.execute('alter table %s add diff_marginals float(27) default 0'%(table_name)) dbc.execute('alter table %s add %suniqueid int'%(table_name, synthesis_type)) dbc.execute('alter table %s add index(tract, bg)'%(table_name)) except: pass dbc.execute('select frequency from %s_%s_joint_dist where tract = %s and bg = %s order by %s;' %(synthesis_type, pumano, tract, bg, order_string)) frequency = arr(dbc.fetchall()) dbc.execute('select frequency from %s_0_joint_dist order by %s' %(synthesis_type, order_string)) prior = arr(dbc.fetchall()) rowcount = dbc.rowcount dummy_table = zeros((rowcount, 6)) dummy_table[:,:-3] = [pumano, tract, bg] dummy_table[:,-3] = frequency[:,0] dummy_table[:,-2] = prior[:,0] dummy_table[:,-1] = (arange(rowcount)+1) dbc.execute('delete from %s where tract = %s and bg = %s' %(table_name, tract, bg)) dummy_table = str([tuple(i) for i in dummy_table]) dbc.execute('insert into %s (pumano, tract, bg, marginal, prior, %suniqueid) values %s;' %(table_name, synthesis_type, dummy_table[1:-1])) dbc.execute('update %s set r_marginal = marginal where tract = %s and bg = %s'%(table_name, tract, bg)) dbc.execute('update %s set diff_marginals = (marginal - r_marginal) * marginal where tract = %s and bg = %s'%(table_name, tract, bg)) dbc.execute('select sum(marginal) - sum(r_marginal) from %s where tract = %s and bg = %s'%(table_name, tract, bg)) result = dbc.fetchall() diff_total = round(result[0][0]) if diff_total < 0: dbc.execute('select %suniqueid from %s where r_marginal <>0 and tract = %s and bg = %s order by diff_marginals '%(synthesis_type, table_name, tract, bg)) else: dbc.execute('select %suniqueid from %s where marginal <>0 and tract = %s and bg = %s order by diff_marginals desc'%(synthesis_type, table_name, tract, bg)) result = dbc.fetchall() # print 'The marginals corresponding to the following hhldtypes were changed by the given amount' for i in range(int(abs(diff_total))): # print 'record - %s changed by %s' %(result[i][0], diff_total / abs(diff_total)) dbc.execute('update %s set r_marginal = r_marginal + %s where %suniqueid = %s and tract = %s and bg = %s' %(table_name, diff_total / abs(diff_total), synthesis_type, result[i][0], tract, bg)) dbc.execute('select r_marginal from %s where prior <> 0 and tract = %s and bg = %s order by %suniqueid'%(table_name, tract, bg, synthesis_type)) marginals = arr(dbc.fetchall()) dbc.close() db.commit() return marginals
def tolerance (adjustment_all, adjustment_old, iteration, parameters): adjustment_all = arr(adjustment_all) adjustment_old = arr(adjustment_old) adjustment_difference = abs(adjustment_all - adjustment_old) adjustment_convergence_characteristic = adjustment_difference.cumsum()[-1] if adjustment_convergence_characteristic > parameters.ipfTol: return 1 else: # print "Convergence Criterion - %s" %adjustment_convergence_characteristic return 0
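# Illustrative usage sketch (not from the original source): `parameters` only
# needs an `ipfTol` attribute here, so a SimpleNamespace stands in for the
# project's parameter object; that substitution and the demo name are assumptions.
def _tolerance_demo():
    from types import SimpleNamespace
    params = SimpleNamespace(ipfTol=1e-4)
    # identical adjustment vectors -> difference below ipfTol -> converged -> returns 0
    return tolerance([1.0, 1.0], [1.0, 1.0], iteration=2, parameters=params)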
def constaint_weights(self):
    """Rescales the weights so that the maximum element of the matrix is 1.
    """
    max_values = self.weight_constaint_function()
    for i in range(len(self.weights)):
        if arr(max_values[i]) > 1.0:
            print "Constraining weights No " + str(i) + ": Divide by " + str(arr(max_values[i]))
            self.weights[i].set_value(np.float32(self.weights[i].get_value() / arr(max_values[i])))
            print "New max value: " + str(arr(self.weight_constaint_function()[i]))
def __init__( self, result_dir, data_X, data_y, data_t, valid_X, valid_y, valid_t, test_X, layers, weight_init_scheme, cost_function, hyperparams, training_sequence, ): """Class that takes layers and combines them into a neural network. Theano functions are created and the given data is then trained with the GPU on the constructed neural network through the train method. """ self.cost_function = cost_function self.H = hyperparams if training_sequence: self.training_sequence = training_sequence else: self.training_sequence = self.default_training_sequence self.layers = layers self.rng = np.random.RandomState(1234) self.time1 = time.time() self.ensemble = None self.ensemble_MNIST = None self.ensemble_softmax = None self.hook_functions = None self.weight_init_scheme = weight_init_scheme self.init_weight_and_data_variables(data_X, data_y, data_t, valid_X, valid_y, valid_t, test_X) self.init_theano_variables() self.train_history = np.float32(arr(range(self.H.L.epochs))) self.valid_history = np.float32(arr(range(self.H.L.epochs))) self.result_dir = result_dir self.hook_functions_batch = None self.hook_functions_crossvalid = None self.hook_functions_crossvalid_epoch = None pass
def drawing_with_replacement(db, frequencies, weights, index_matrix, sp_matrix, pumano = 0, seed=0, iteration=0):
    if seed == 0:
        seed = int(frequencies.sum())
    random.seed(seed+iteration)

    dbc = db.cursor()
    dbc.execute('select hhlduniqueid from hhld_sample group by hhlduniqueid')
    hhld_colno = dbc.rowcount
    dbc.execute('select gquniqueid from gq_sample group by gquniqueid')
    gq_colno = dbc.rowcount

    hh_colno = hhld_colno + gq_colno
    synthetic_population=[]
    j = 0
    for i in index_matrix[:hh_colno,:]:
        if i[1] == i[2] and frequencies[j]>0:
            synthetic_population.append([sp_matrix[i[1]-1, 2] , frequencies[j], i[0]])
        else:
            cumulative_weights = weights[sp_matrix[i[1]-1:i[2], 2]].cumsum()
            probability_distribution = cumulative_weights / cumulative_weights[-1]
            ti = time.time()
            #print probability_distribution, type(probability_distribution)
            probability_lower_limit = probability_distribution.tolist()
            probability_lower_limit.insert(0,0)
            probability_lower_limit = arr(probability_lower_limit)
            #print 'after insertion and conversion - ', probability_lower_limit, type(probability_lower_limit)
            #print 'time taken - %.4f' %(time.time()-ti)
            ti = time.time()
            random_numbers = random.rand(frequencies[j])
            freq, probability_lower_limit = histogram(random_numbers, probability_lower_limit)
            #print 'time taken for random number generation and histogram - %.4f' %(time.time()-ti)
            ti = time.time()
            hhldid_by_type = sp_matrix[i[1]-1:i[2],2]

            freqValid = freq[freq != 0]
            hhldid_by_typeValid = hhldid_by_type[freq != 0]
            ti = time.time()
            for k in range(len(freqValid)):
                synthetic_population.append([hhldid_by_typeValid[k], freqValid[k], i[0]])
            #print 'Old implementation - %.4f' %(time.time()-ti)
        j = j + 1

    dbc.close()
    db.commit()
    return arr(synthetic_population, int)
def adjust_weights(db, synthesis_type, control_variables, varCorrDict, controlAdjDict, state, county, pumano=0, tract=0, bg=0, parameters=0, hhldsizeMargsMod=False): dbc = db.cursor() control_marginals = prepare_control_marginals (db, synthesis_type, control_variables, varCorrDict, controlAdjDict, state, county, tract, bg, hhldsizeMargsMod) tol = 1 iteration = 0 adjustment_old = [] target_adjustment = [] while (tol): iteration = iteration +1 adjustment_all = [] for i in range(len(control_variables)): adjusted_marginals = marginals(db, synthesis_type, control_variables[i], pumano, tract, bg) for j in range(len(adjusted_marginals)): if adjusted_marginals[j] == 0: adjusted_marginals[j] = 1 adjustment = arr(control_marginals[i]) / arr(adjusted_marginals) update_weights(db, synthesis_type, control_variables, control_variables[i], pumano, tract, bg, adjustment) for k in adjustment: adjustment_all.append(k) if iteration == 1: if k == 0: adjustment_old.append(0) else: adjustment_old.append(k/k) target_adjustment = [adjustment_old] tol = tolerance(adjustment_all, adjustment_old, iteration, parameters) adjustment_old = adjustment_all adjustment_characteristic = abs(arr(adjustment_all) - arr(target_adjustment)).sum() / len(adjustment_all) if not tol: print control_variables[i], control_marginals[i], adjusted_marginals if (iteration>=parameters.ipfIter): pass # print "Maximum iterations reached\n" else: # print "Convergence Achieved in iterations - %s\n" %iteration pass # print "Marginals off by - %s" %adjustment_characteristic dbc.close() db.commit()
def err(self, X, Y): """ This method computes the error rate on test data. Parameters --------- X : M x N numpy array M = number of data points; N = number of features. Y : M x 1 numpy array Array of classes (targets) corresponding to the data points in X. """ Y = arr( Y ) Yhat = arr( self.predict(X) ) return np.mean(Yhat.reshape(Y.shape) != Y)
def __dectree_train(self, X, Y, L, R, F, T, next, minParent, minScore, nFeatures): """ Zach, Sharon, and Janice's decision tree training function: based on handling complexity through the maximum number of leaves. TODO: 1) Create a structure that holds the [decision and information gain (from that decision)] for each possible node 2) Iterate through and create tree: // within a while loop (while leaves != maxLeaves) ROOT: (when leaves == 0). choose the one with most(???) entropy from all possible take ROOT out of a. At the creation of each new tree node (or leaf), calculate the new [decision and info gain] pairs that become available b. construct tree """ n, d = mat(X).shape if n < minParent or np.var(Y) < minScore: assert n != 0, ('TreeRegress.__dectree_train: tried to create size zero node') # TODO: return something. maybe get rid of this whole conditional since it seems to be only used # for recursion halting. best_val = np.inf best_feat = -1 try_feat = np.random.permutation(d) # ...otherwise, search over (allowed) features for i_feat in try_feat[0:nFeatures-1]: dsorted = arr(np.sort(X[:,i_feat].T)).ravel() # sort data... pi = np.argsort(X[:,i_feat].T) # ...get sorted indices... tsorted = Y[pi].ravel() # ...and sort targets by feature ID can_split = np.append(arr(dsorted[:-1] != dsorted[1:]), 0) # which indices are valid split points? if not np.any(can_split): # no way to split on this feature? continue # find min weighted variance among split points val,idx = self.__min_weighted_var(tsorted, can_split, n) # save best feature and split point found so far if val < best_val: best_val = val best_feat = i_feat best_thresh = (dsorted[idx] + dsorted[idx + 1]) / 2 return best_feat, best_thresh, best_val
def plot_lum_rest(): '''f_{rest,V} = f_{rest_corr}*[nu_V/ ((1+z)nu_J)]^beta for flux \propto nu^beta and beta negative values''' clf() f_rest_corr = [2252.14, 1626.48, 403.717, 11783.2, 913.329, 549.616, 286.863, 990.110, 14.7689, 174.540, 1419.79, 149309.80115] beta = [-1.35, -0.8, -0.96, -0.22, -1.73, -0.84, -3.48, -0.42, -3.81, -0.3, -1.7, -0.47] z_list_limits = [1.1588, 2.4274, 1.51, 0.54, 1.95, 1.6, 3.036, 2.346, 3.5, 1.165, 4.8, 0.9382] arrf = arr(f_rest_corr) arrb = arr(beta) arrz = arr(z_list_limits) nu_V = 5.444646098003629764065335753176043557e+14 nu_J = 2.398339664e+14 f_rest_V = arrf * (nu_V/ ((1+arrz)*nu_J))**(beta) print 'f_rest_V in microjansky:' print f_rest_V #convert to cgs from microjansky: f_rest_V = f_rest_V*10**(-29) print 'f_rest_V in cgs:' print f_rest_V #get luminosity distance from cosmocalc (lambdaCDM: omega_M = 0.27 and omega_lambda=0.73) dist = [] for redshift in z_list_limits: dist += [cosmocalc.cosmocalc(z=redshift)['DL_cm']] arrd = arr(dist) print 'dist:' print arrd L_rest_V = f_rest_V*4*numpy.pi*arrd**2./(1.+arrz) print 'L_rest_V:' print L_rest_V #convert to ABSOLUTE AB magnitude: parsec = 3.085677581e18 # cm F_10pc = L_rest_V/(4 * numpy.pi * (10*parsec)**2) #flux density at 10 parsecs Absol_Mag = -2.5*numpy.log10(F_10pc) - 48.60 #Absolute mag in AB mag hist(Absol_Mag,6) xlabel('$M_v$', fontsize=27) ylabel('Number', fontsize=28) yticks(arr([0, 1., 2., 3., 4.])) ax = matplotlib.pyplot.gca() ax.set_xlim(ax.get_xlim()[::-1]) # reversing the xlimits savefig('Lum_dist_rest.eps') print 'Done' return Absol_Mag
def train(self, X, Y, minParent=2, maxDepth=np.inf, nFeatures=None): """ Trains a random forest classification tree. Parameters ---------- X : M x N numpy array of M data points with N features each. Y : M x 1 numpy array containing class labels for each data point in X. minParent : (int) The minimum number of data required to split a node. maxDepth : (int) The maximum depth of the decision tree. nFeatures : (int) The number of available features for splitting at each node. """ n,d = arr(X).shape nFeatures = d if nFeatures is None else min(nFeatures,d) minScore = -1 self.classes = list(np.unique(Y)) if len(self.classes) == 0 else self.classes Y = toIndex(Y) sz = min(2 * n, 2**(maxDepth + 1)) # pre-allocate storage for tree: L, R, F, T = np.zeros((sz,)), np.zeros((sz,)), np.zeros((sz,)), np.zeros((sz,)) L, R, F, T, last = self.__dectree_train(X, Y, L, R, F, T, 0, 0, minParent, maxDepth, minScore, nFeatures) self.L = L[0:last] self.R = R[0:last] self.F = F[0:last] self.T = T[0:last]
def shuffle_data(X, Y):
    """
    Shuffle data in X and Y.

    Parameters
    ----------
    X : numpy array
        N x M array of data to shuffle.
    Y : numpy array
        1 x N array of labels that correspond to data in X.

    Returns
    -------
    X or (X,Y) : numpy array or tuple of arrays
        Shuffled data (only returns X and Y if Y contains data).

    TODO: test more
    """
    nx,dx = twod(X).shape
    Y = arr(Y).flatten()
    ny = len(Y)

    pi = np.random.permutation(nx)
    X = X[pi,:]

    if ny > 0:
        assert ny == nx, 'shuffle_data: X and Y must have the same length'
        Y = Y[pi]
        return X,Y

    return X
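# Illustrative usage sketch (not from the original source): shuffle a small
# paired dataset; X rows and Y labels stay aligned under the same permutation.
# Assumes numpy is imported as np, as elsewhere in these snippets.
def _shuffle_demo():
    X = np.arange(12).reshape(6, 2)
    Y = np.arange(6)
    Xs, Ys = shuffle_data(X, Y)   # rows of Xs still match the entries of Ys
    return Xs, Ys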
def __init__(self, X=None, Y=None, stepsize=.01, tolerance=1e-4, max_steps=5000, init='zeros'): """ Constructor for LogisticMSEClassifier (logistic classifier with MSE loss function.). Parameters ---------- X : N x M numpy array N = number of data points; M = number of features. Y : 1 x N numpy array Class labels that relate to the data points in X. stepsize : scalar Step size for gradient descent (decreases as 1/iter). tolerance : scalar Tolerance for stopping criterion. max_steps : int Max number of steps to take before training stops. init : str Initialization method; one of the following strings: 'keep' (to keep current value), 'zeros' (init to all-zeros), 'randn' (init at random), and 'linreg' (init w/ small linear regression). """ self.wts = [] # linear weights on features (1st is constant) self.classes = arr([-1, 1]) # list of class values used in input if type(X) is np.ndarray and type(Y) is np.ndarray: self.train(X, Y, stepsize, tolerance, max_steps, init.lower())
def createBasePlotAt(self, centerX, r, title='Current Step', mf=None): fig = plt.figure() fig.set_size_inches(sys_utils.get_plot_size(), sys_utils.get_plot_size()) ax1 = fig.add_subplot(111) matplotlib.rcParams['xtick.direction'] = 'out' matplotlib.rcParams['ytick.direction'] = 'out' x = linspace(centerX[0] - r, centerX[0] + r, num=100) y = linspace(centerX[1] - r, centerX[1] + r, num=100) X, Y = meshgrid(x, y) Z = empty((len(y), len(x))) plt.title(title) for i in range(0, len(x)): for j in range(0, len(y)): Z[j, i] = self.objective(arr([x[i], y[j]])) CS = plt.contour(X, Y, Z, 6, colors='k') plt.clabel(CS, fontsize=9, inline=1) if mf is not None: for i in range(0, len(x)): for j in range(0, len(y)): Z[j, i] = mf(arr([x[i], y[j]])) CS = plt.contour(X, Y, Z, 6, colors='y') plt.clabel(CS, fontsize=9, inline=1) for idx in range(0, self.getNumEqualityConstraints()): for i in range(0, len(x)): for j in range(0, len(y)): Z[j, i] = self.equalityConstraints(arr([x[i], y[j]]))[idx] CS = plt.contour(X, Y, Z, 6, colors='r') plt.clabel(CS, fontsize=9, inline=1) for idx in range(0, self.getNumInequalityConstraints()): for i in range(0, len(x)): for j in range(0, len(y)): Z[j, i] = self.inequalityConstraints(arr([x[i], y[j]]))[idx] CS = plt.contour(X, Y, Z, 6, colors='b') plt.clabel(CS, fontsize=9, inline=1) return ax1
def load_dataset(self): cfg = self.cfg file_name = os.path.join(self.cfg.project_path, cfg.dataset) # Load Matlab file dataset annotation mlab = sio.loadmat(file_name) self.raw_data = mlab mlab = mlab["dataset"] num_images = mlab.shape[1] # print('Dataset has {} images'.format(num_images)) data = [] has_gt = True for i in range(num_images): sample = mlab[0, i] item = DataItem() item.image_id = i base = str(self.cfg["project_path"]) im_path = os.path.join(base, sample[0][0]) item.im_path = im_path item.im_size = sample[1][0] if len(sample) >= 3: joints = sample[2][0][0] # print(sample) joint_id = joints[:, 0] # make sure joint ids are 0-indexed if joint_id.size != 0: assert (joint_id < cfg.num_joints).any() joints[:, 0] = joint_id coords = [joint[1:] for joint in joints] coords = arr(coords) item.coords = coords item.joints = [joints] item.joint_id = [arr(joint_id)] # print(item.joints) else: has_gt = False # if cfg.crop: # crop = sample[3][0] - 1 # item.crop = extend_crop(crop, cfg.crop_pad, item.im_size) data.append(item) self.has_gt = has_gt return data
def read_single_2d_data(data: pd.DataFrame): length = len(data.index) index = arr(data.index) bp_interested = get_bp_interested(data) #bp_interested=['snout', 'leftear', 'rightear', 'tailbase'] coords = np.zeros((length, len(bp_interested), 2)) scores = np.zeros((length, len(bp_interested))) for bp_idx, bp in enumerate(bp_interested): bp_coords = arr(data[bp]) coords[index, bp_idx, :] = bp_coords[:, :2] scores[index, bp_idx] = bp_coords[:, 2] return {'length': length, 'coords': coords, 'scores': scores}
def __call__(self, W, X): for i, x in zip(self.inputs, X): i.val = x for layer in self.layers: for node in layer: node(W) return arr([out(W) for out in self.outputs])
def make_batch(self, data_item, scale, mirror): im_file = data_item.im_path logging.debug('image %s', im_file) logging.debug('mirror %r', mirror) image = imread(im_file, mode='RGB') if self.has_gt: joints = np.copy(data_item.joints) if self.cfg.crop: crop = data_item.crop image = image[crop[1]:crop[3] + 1, crop[0]:crop[2] + 1, :] if self.has_gt: joints[:, 1:3] -= crop[0:2].astype(joints.dtype) img = imresize(image, scale) if scale != 1 else image scaled_img_size = arr(img.shape[0:2]) if mirror: img = np.fliplr(img) batch = {Batch.inputs: img} if self.has_gt: stride = self.cfg.stride if mirror: joints = [ self.mirror_joints(person_joints, self.symmetric_joints, image.shape[1]) for person_joints in joints ] sm_size = np.ceil(scaled_img_size / (stride * 2)).astype(int) * 2 scaled_joints = [ person_joints[:, 1:3] * scale for person_joints in joints ] joint_id = [ person_joints[:, 0].astype(int) for person_joints in joints ] part_score_targets, part_score_weights, locref_targets, locref_mask = self.compute_target_part_scoremap( joint_id, scaled_joints, data_item, sm_size, scale) batch.update({ Batch.part_score_targets: part_score_targets, Batch.part_score_weights: part_score_weights, Batch.locref_targets: locref_targets, Batch.locref_mask: locref_mask }) batch = {key: data_to_input(data) for (key, data) in batch.items()} batch[Batch.data_item] = data_item return batch
def __init__(self, *args, **kwargs): """ Constructor for treeRegress (decision tree regression model) Parameters: see "train" function; calls "train" if arguments passed Properties (internal use only) L,R : indices of left & right child nodes in the tree F,T : feature index & threshold for decision (left/right) at this node for leaf nodes, T[n] holds the prediction for leaf node n """ self.L = arr([0]) # indices of left children self.R = arr([0]) # indices of right children self.F = arr([0]) # feature to split on (-1 = leaf = predict) self.T = arr([0]) # threshold to split on (prediction value if leaf) if len(args) or len(kwargs): # if we were given optional arguments, self.train(*args, **kwargs) # just pass them through to "train"
def data_gauss(N0, N1=None, mu0=arr([0, 0]), mu1=arr([1, 1]), sig0=np.eye(2), sig1=np.eye(2)):
    """Sample data from a two-component Gaussian mixture model.

    Args:
        N0 (int): Number of data to sample for class -1.
        N1 (int): Number of data to sample for class 1.
        mu0 (arr): numpy array
        mu1 (arr): numpy array
        sig0 (arr): numpy array
        sig1 (arr): numpy array

    Returns:
        X (array): Array of sampled data
        Y (array): Array of class values that correspond to the data points in X.

    TODO: test more
    """
    # ALT: return data_GMM_new(N0, ((1.,[0,0],[1.]))
    # return data_GMM_new(N0+N1, ((.5,[0,0],[1.]),(.5,[1,1],[1.])))
    if not N1:
        N1 = N0

    d1, d2 = twod(mu0).shape[1], twod(mu1).shape[1]
    if d1 != d2 or np.any(twod(sig0).shape != arr([d1, d1])) or np.any(
            twod(sig1).shape != arr([d1, d1])):
        raise ValueError('data_gauss: dimensions should agree')

    X0 = np.dot(np.random.randn(N0, d1), sqrtm(sig0))
    X0 += np.ones((N0, 1)) * mu0
    Y0 = -np.ones(N0)

    X1 = np.dot(np.random.randn(N1, d1), sqrtm(sig1))
    X1 += np.ones((N1, 1)) * mu1
    Y1 = np.ones(N1)

    X = np.row_stack((X0, X1))
    Y = np.concatenate((Y0, Y1))

    return X, Y
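# Illustrative usage sketch (not from the original source): draw a small
# two-class Gaussian sample with the default means/covariances and check the
# returned shapes.  The demo function name is hypothetical.
def _data_gauss_demo():
    X, Y = data_gauss(100)                # 100 points per class with the defaults
    assert X.shape == (200, 2) and Y.shape == (200,)
    return X, Y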
def compute_target_part_scoremap_numpy( self, joint_id, coords, data_item, size, scale ): dist_thresh = float(self.cfg.pos_dist_thresh * scale) dist_thresh_sq = dist_thresh ** 2 num_joints = self.cfg.num_joints scmap = np.zeros(cat([size, arr([num_joints])])) locref_size = cat([size, arr([num_joints * 2])]) locref_mask = np.zeros(locref_size) locref_map = np.zeros(locref_size) width = size[1] height = size[0] grid = np.mgrid[:height, :width].transpose((1, 2, 0)) for person_id in range(len(coords)): for k, j_id in enumerate(joint_id[person_id]): joint_pt = coords[person_id][k, :] j_x = np.asscalar(joint_pt[0]) j_x_sm = round((j_x - self.half_stride) / self.stride) j_y = np.asscalar(joint_pt[1]) j_y_sm = round((j_y - self.half_stride) / self.stride) min_x = round(max(j_x_sm - dist_thresh - 1, 0)) max_x = round(min(j_x_sm + dist_thresh + 1, width - 1)) min_y = round(max(j_y_sm - dist_thresh - 1, 0)) max_y = round(min(j_y_sm + dist_thresh + 1, height - 1)) x = grid.copy()[:, :, 1] y = grid.copy()[:, :, 0] dx = j_x - x * self.stride - self.half_stride dy = j_y - y * self.stride - self.half_stride dist = dx ** 2 + dy ** 2 mask1 = dist <= dist_thresh_sq mask2 = (x >= min_x) & (x <= max_x) mask3 = (y >= min_y) & (y <= max_y) mask = mask1 & mask2 & mask3 scmap[mask, j_id] = 1 locref_mask[mask, j_id * 2 + 0] = 1 locref_mask[mask, j_id * 2 + 1] = 1 locref_map[mask, j_id * 2 + 0] = (dx * self.locref_scale)[mask] locref_map[mask, j_id * 2 + 1] = (dy * self.locref_scale)[mask] weights = self.compute_scmap_weights(scmap.shape, joint_id, data_item) return scmap, weights, locref_map, locref_mask
def drawing_housing_units(db, frequencies, weights, index_matrix, sp_matrix, pumano=0):

    dbc = db.cursor()
    dbc.execute('select hhlduniqueid from hhld_sample group by hhlduniqueid')
    hhld_colno = dbc.rowcount
    dbc.execute('select gquniqueid from gq_sample group by gquniqueid')
    gq_colno = dbc.rowcount

    hh_colno = hhld_colno + gq_colno
    synthetic_population = []
    j = 0
    for i in index_matrix[:hh_colno, :]:
        if i[1] == i[2] and frequencies[j] > 0:
            synthetic_population.append(
                [sp_matrix[i[1] - 1, 2], frequencies[j], i[0]])
        else:
            cumulative_weights = weights[sp_matrix[i[1] - 1:i[2], 2]].cumsum()
            probability_distribution = cumulative_weights / cumulative_weights[-1]
            probability_lower_limit = probability_distribution.tolist()
            probability_lower_limit.insert(0, 0)
            probability_lower_limit = arr(probability_lower_limit)
            random_numbers = random.rand(frequencies[j])
            freq, probability_lower_limit = histogram(random_numbers,
                                                      probability_lower_limit)
            hhldid_by_type = sp_matrix[i[1] - 1:i[2], 2]

            for k in range(len(freq)):
                if freq[k] != 0:
                    #hhid = hhidRowDict[hhldid_by_type[k]]
                    # storing the matrix row no, freq, type
                    synthetic_population.append(
                        [hhldid_by_type[k], freq[k], i[0]])
        j = j + 1

    dbc.close()
    db.commit()
    return arr(synthetic_population, int)
def train(self, X, Y, init='zeros', stepsize=.01, tolerance=1e-4, max_steps=5000):
    """
    This method trains the neural network. Refer to constructor
    doc string for descriptions of arguments.
    """
    if self.wts[0].shape[1] - 1 != len(X[0]):
        raise ValueError('NNetClassify.__init__: sizes[0] must == len(X) (number of features)')

    if len(np.unique(Y)) != self.wts[-1].shape[0]:
        raise ValueError('NNetClassify.__init__: sizes[-1] must == the number of classes in Y')

    self.classes = self.classes if self.classes else np.unique(Y)

    # convert Y to 1-of-K format
    Y_tr_k = to_1_of_k(Y)

    n,d = mat(X).shape                              # d = dim of data, n = number of data points
    nc = len(self.classes)                          # number of classes
    L = len(self.wts)                               # get number of layers

    # define desired activation function and its derivative (for training)
    sig,d_sig = self.sig, self.d_sig
    sig_0,d_sig_0 = self.sig_0, self.d_sig_0

    # outer loop of stochastic gradient descent
    iter = 1                                        # iteration number
    done = 0                                        # end of loop flag
    surr = np.zeros((1, max_steps + 1)).ravel()     # surrogate loss values
    err = np.zeros((1, max_steps + 1)).ravel()      # misclassification rate values

    while not done:
        step_i = stepsize / iter                    # step size evolution; classic 1/t decrease

        # stochastic gradient update (one pass)
        for i in range(n):
            A,Z = self.__responses(self.wts, X[i,:], sig, sig_0)        # compute all layers' responses, then backprop
            delta = (Z[L] - Y_tr_k[i,:]) * arr(d_sig_0(Z[L]))           # take derivative of output layer

            for l in range(L - 1, -1, -1):
                grad = mat(delta).T * mat(Z[l])                         # compute gradient on current layer wts
                delta = np.multiply(delta.dot(self.wts[l]), d_sig(Z[l]))    # propagate gradient downwards
                delta = delta[:,1:]                                     # discard constant feature
                self.wts[l] = self.wts[l] - step_i * grad               # take gradient step on current layer wts

        err[iter] = self.err_k(X, Y_tr_k)           # error rate (classification)
        surr[iter] = self.mse_k(X, Y_tr_k)          # surrogate (mse on output)

        print('surr[iter]')
        print(surr[iter])
        print('iter')
        print(iter)

        # check if finished
        done = (iter > 1) and (np.abs(surr[iter] - surr[iter - 1]) < tolerance) or iter >= max_steps
        iter += 1
def updateAcceleration(n, x, y, xmin, ymin, nxCell, dxCell, dyCell, linkHead, linkNext, h, h_sqr, spiky_gradientFac, viscosity_laplacianFac, mass, P, rho, eta, gravity, vx, vy, ax, ay): for iP in prange(n): ax[iP] = 0.0 ay[iP] = 0.0 xi = x[iP] yi = y[iP] axi = ax[iP] ayi = ay[iP] vxi = vx[iP] vyi = vy[iP] Pi = P[iP] # Get the index of the node closest to particle[iP] Ix = np.int(np.round((xi - xmin) / dxCell)) Iy = np.int(np.round((yi - ymin) / dyCell)) # Loop through neighboring cells for Ineigh in [ Ix + Iy * nxCell, Ix + 1 + Iy * nxCell, Ix + (Iy + 1) * nxCell, Ix + 1 + (Iy + 1) * nxCell ]: jP = linkHead[Ineigh] while (jP >= 0): # Negative value = Null if jP == iP: jP = linkNext[jP] continue r_sqr = (xi - x[jP])**2 + (yi - y[jP])**2 if r_sqr < h_sqr: r = np.sqrt(r_sqr) R = arr([(xi - x[jP]) / r, (yi - y[jP]) / r]) # Compute pressure force gradW = spiky_gradientFac * (h - r)**2 * R Fac = -mass[jP] * (Pi + P[jP]) / (2.0 * rho[jP]) axi += Fac * gradW[0] ayi += Fac * gradW[1] # Compute viscous force laplacianW = viscosity_laplacianFac * (h - r) Fac = eta * mass[jP] / rho[jP] * laplacianW axi += Fac * (vx[jP] - vxi) ayi += Fac * (vy[jP] - vyi) # end if dist jP = linkNext[jP] # end jP # end Ineigh axi += rho[iP] * gravity[0] ayi += rho[iP] * gravity[1] ax[iP] = axi / rho[iP] ay[iP] = ayi / rho[iP]
def multi_forward_pass_epoch():
    '''Use validation data to predict output by averaging predictions with
       active dropout.
    '''
    np.set_printoptions(suppress=True)
    classifications = np.zeros((4200,10))
    for i in range(50):
        classifications = np.add(classifications,(np.float64(arr(nn.feedforward_valid_drop_function()))))
    #print 'Mean ' + str(np.mean(classifications,axis=0))
    # average over the 50 forward passes accumulated above
    return nn.cross_validation_function_dropout(np.float32(classifications/50.))[0]
def multi_forward_pass(batch): '''Use training data to predict output by averaging prediction with active dropout. ''' np.set_printoptions(suppress=True) classifications = np.zeros((150,10)) for i in range(15): classifications = np.add(classifications,(np.float64(arr(nn.feedforward_function(batch))))) #print 'Mean ' + str(np.mean(classifications,axis=0)) return nn.train_error_function_dropout(batch,np.float32(classifications/15.))[0]
def fromIndex(Y, values): """ Convert index-valued Y into discrete representation specified by values in values. Parameters ---------- Y : numpy array 1 x N (or N x 1) numpy array of indices. values : numpy array 1 x max(Y) array of values for conversion. Returns ------- discrete_Y : numpy array 1 x N (or N x 1) numpy array of discrete values. """ discrete_Y = arr(values)[arr(Y)] return discrete_Y
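# Illustrative usage sketch (not from the original source): map index-valued
# predictions back to their original class labels.  The demo name and values
# are hypothetical; `arr` is numpy.array as elsewhere in these snippets.
def _from_index_demo():
    Y_idx = arr([0, 2, 1, 0])
    labels = arr([-1, 1, 3])
    return fromIndex(Y_idx, labels)   # -> array([-1,  3,  1, -1])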
def one_basis_function_plot(): fig = plt.figure() ax = fig.add_subplot(111) # ax = fig.add_axes([-.1,1.1, -.1,1.1]) # Basis function phi_3 x, y = arr([0, .4, .6, .8, 1.]), arr([0., 0., 1., 0., 0.]) ax.plot(x, y, '-m', linewidth=2.0) ax.text(0.6, 1.02, r"$\phi_3$", fontsize=18) ax.set_ylim(-.1, 1.1) ax.set_xlim(-.1, 1.1) # plt.xticks(np.linspace(0.,1.,6),['$x_0$','$x_1$','$x_2$','$x_3$','$x_4$','$x_5$'],fontsize=18) plt.xticks(arr([.4, .6, .8]), ['$x_2$', '$x_3$', '$x_4$'], fontsize=18) # print Axis.get_majorticklabels(plt.axis) plt.savefig("one_basis_function.pdf") # plt.show() plt.clf()
def updateCenters(k, xt, C):
    # Recompute each non-empty cluster's center as the mean of its points,
    # then rebuild and return the dictionary of centers (point lists reset by createDict)
    means = []
    for kx in range(k):
        if len(C[kx]['points']) == 0:
            pass
        else:
            means.append(np.mean(C[kx]['points'], axis=0))
    C = createDict(k, arr(means))
    return C
def exp2steps(results, colors): plt.clf() x = [17, 35, 43, 48, 55, 70, 124, 170, 323, 403] for alg, result in results.items(): c = next(colors) if alg in 'GS': plt.plot(x, result.avg_steps, label=alg, c=c) plt.fill_between(x, arr(result.avg_steps) - arr(result.std_steps), arr(result.avg_steps) + arr(result.std_steps), alpha=0.5, color=c) plt.scatter(x, result.avg_steps, c=c) plt.xlabel("Wielkość instancji") plt.ylabel("Liczba kroków") plt.xlim(1, 410) plt.legend() plt.grid(True) plt.savefig("plots/exp2steps.pdf", format="pdf", bbox_inches='tight')
def plot_lum(): clf() j_3min = [ 8052.06, 3050.04, 324.251, 20082.0, 1443.05, 1070.26, 1879.54, 3210.33, 312.932, 233.877, 714.423, 112.846, 126.616 ] j_3min2 = [ 8052.06, 3050.04, 324.251, 1443.05, 1070.26, 1879.54, 3210.33, 312.932, 233.877, 714.423, 112.846, 126.616 ] j_3min3 = [ 3050.04, 324.251, 1443.05, 1070.26, 1879.54, 3210.33, 312.932, 233.877, 714.423, 112.846, 126.616 ] j_3min = [ 8052.06, 3050.04, 324.251, 20082.0, 1443.05, 1070.26, 1879.54, 3210.33, 312.932, 233.877, 714.423, 188.211, 1594, 57.29, 833466.82317 ] #convert to cgs from microjansky: j_3min = arr(j_3min) * 10**(-29) #convert to AB magnitude: j_3min = -2.5 * numpy.log10(j_3min) - 48.60 hist(j_3min, 13) xlabel('$m_j$', fontsize=28) ylabel('Number', fontsize=28) yticks(arr([0, 1., 2., 3., 4.])) ax = matplotlib.pyplot.gca() ax.set_xlim(ax.get_xlim()[::-1]) # reversing the xlimits savefig('Lum_dist.eps') clf() # hist(j_3min,20,cumulative=True, histtype='step') # hist(j_3min2,20,cumulative=True, histtype='step') # hist(j_3min3,20,cumulative=True, histtype='step') #ylim(0,14) #xlim(-1000,22000) #xlabel('J Flux at 3 Minutes (Micro Jansky)') # savefig('lum_dist.eps') return j_3min
def update(y, n_states):
    from numpy.random import randn
    from numpy import arange
    from numpy import eye
    from numpy import array as arr
    from numpy import zeros, ones
    from numpy import cov
    from numpy.random import multivariate_normal as mvn_rand
    from sklearn.cluster import KMeans
    data_dim, data_len = y.shape
    # --- setting
    gmm_setting = {
        # 'update_order': ['M', 'E'],
        'update_order': ['E', 'M'],
        # 'expt_init_mode': 'random',
        'expt_init_mode': 'kmeans',
    }
    gmm = Gmm(data_dim, n_states, **gmm_setting)
    # --- Mu
    mu_mode = 'mvn_rand'
    if mu_mode == 'zeros':
        mu = zeros((data_dim, n_states))
    elif mu_mode == 'randn':
        c = 2
        mu = randn(data_dim, n_states) * c
    elif mu_mode == 'arange':
        mu = arange(data_dim * n_states).reshape(data_dim, n_states)
    elif mu_mode == 'kmeans':
        km = KMeans(n_states)
        km.fit(y.T)
        mu = km.cluster_centers_.T
    elif mu_mode == 'mvn_rand':
        mu = mvn_rand(y.mean(1), cov(y), size=n_states).T
    else:
        raise Exception('Not supported: %s' % mu_mode)
    alpha = ones(n_states) * (data_len / n_states)
    W = arr([eye(data_dim) * 1e+3 for k in range(n_states)])
    W = W.transpose(1, 2, 0)
    # --- set params
    prms = {'MuR': {'mu': mu, 'W': W}, 'Pi': {'alpha': alpha}}
    gmm.set_params(prms)
    gmm.init_expt_s(data_len, y)
    # --- plotter
    s = gmm.expt_s.argmax(0)
    plotter(y, s, gmm, 'GMM prior', 2)
    # --- update
    gmm.update(y, 200)
    # --- plotter
    s = gmm.expt_s.argmax(0)
    plotter(y, s, gmm, 'posterior', 3)
    predict_y, predict_s, vb = gmm.predict(y)
def plot_settings(self): """ Plotting settings (colors, linewidths etc.), possibly depending on bus variable. """ var = 'Vbase' # base colors etc on Vbase var_lim = [380, 300, 0] # different categories of Vbase, should be a list # bus settings self.sets_variable_lim = var_lim var = self.bus.loc[:, var] self.bus_set = arr([ find(v >= arr(var_lim))[0] if v >= var_lim[-1] else -1 for v in var ]) self.bus_color = ['r', (230. / 255, 152. / 255, 0), 'g'] self.bus_name_color = ['k'] * 3 self.bus_lw = [1.5, 1, 1] self.bus_name_fs = [0, 0, 0] # line settings var_line = var.loc[self.line.bus0] self.line_set = arr([ find(v >= arr(var_lim))[0] if v >= var_lim[-1] else -1 for v in var_line ]) self.line_lw = [1, 1, 1] self.line_color = ['r', (230. / 255, 152. / 255, 0), 'g'] # Link self.link_lw = 1 self.link_color = 'b' # Interactive plot self.interactive = True # interactive map mode self.picker_node = 7 # tolerance for interactive picking self.picker_arc = 3 self.significant_figures = 3 # when info is displayed self.info_fc = [213. / 255, 230. / 255, 1] # color for info box self.info_ec = 'k' # color info-box edge self.info_lw = 1 # info-box edge width self.equal_aspect = False
def get_tracklets_info(tracklets, all_tracklet_prop): tracklets_info = get_tracklets_temporal_info(tracklets) for i, ts in enumerate(tracklets): t_start = tracklets_info[i]['start'] t_end = tracklets_info[i]['end'] t_reid = arr([ all_tracklet_prop[t][ind]['reid'] for t in range(t_start, t_end + 1) for ind in range(len(all_tracklet_prop[t])) if all_tracklet_prop[t][ind]['id'] == ts[t] ]) t_avg_reid = np.mean(t_reid, axis=0) tracklets_info[i]['avg_reid_score'] = t_avg_reid tracklets_info[i]['start_prop_reid'] = t_reid[0] tracklets_info[i]['end_prop_reid'] = t_reid[-1] seq = range(t_start, t_end + 1) tracklets_info[i]['avg_area'] = np.mean([ all_tracklet_prop[t][ind]['area'] for t in seq for ind in range(len(all_tracklet_prop[t])) if all_tracklet_prop[t][ind]['id'] == ts[t] ]) t_score = [ all_tracklet_prop[t][ind]['score'] for t in range(t_start, t_end + 1) for ind in range(len(all_tracklet_prop[t])) if all_tracklet_prop[t][ind]['id'] == ts[t] ] t_avg_score = np.mean( arr(t_score), axis=0) if len(t_score) != 0 else arr([ all_tracklet_prop[t_start][ind]['score'] for ind in range(len(all_tracklet_prop[t_start])) if all_tracklet_prop[t_start][ind]['id'] == ts[t_start] ]) tracklets_info[i]['avg_score'] = t_avg_score return tracklets_info
def get_old_key(self):
    """
    Return the names and values of the non-constant variables stored in the
    file's Master-Parameters group; if every variable is constant, return a
    'No-Variation' key instead.
    """
    keyNames = []
    keyValues = []
    foundOne = False
    for var in self.f['Master-Parameters']['Variables']:
        if not self.f['Master-Parameters']['Variables'][var].attrs['Constant']:
            foundOne = True
            keyNames.append(var)
            keyValues.append(arr(self.f['Master-Parameters']['Variables'][var]))
    if foundOne:
        if len(keyNames) > 1:
            return keyNames, arr(np.transpose(arr(keyValues)))
        else:
            return keyNames[0], arr(keyValues[0])
    else:
        return 'No-Variation', arr([1])
def __init__(self, path: str, **params) -> None: self.path = path self.size = 0 self.compname, self.comptype = None, None self.content = arr([]) if access(path, F_OK): self.load() if params: self.setparams(**params) return raise FileNotFoundError('for new files you need to provide params!')
def force(self, vertex1, vertex2):
    """ Calculates the inverse-r^2 force (given by the E field) between two
    charged points (vertices).  Note the obvious difference with real physics:
    q1 + q2, not q1*q2.  It just makes things look nicer.
    """
    graph = self._graph_dict
    dx = graph[vertex2]["loc"][0] - graph[vertex1]["loc"][0]
    dy = graph[vertex2]["loc"][1] - graph[vertex1]["loc"][1]
    q1 = graph[vertex1]["weight"]
    q2 = graph[vertex2]["weight"]
    return (q1 + q2) * arr([dx, dy]) / (dx**2 + dy**2)**1.5
def __init__(self, *args, **kwargs): """Constructor for decision tree base class Args: *args, **kwargs (optional): passed to train function Properties (internal use only) L,R (arr): indices of left & right child nodes in the tree F,T (arr): feature index & threshold for decision (left/right) at this node P (arr): for leaf nodes, P[n] holds the prediction for leaf node n """ self.L = arr([]) # indices of left children self.R = arr([]) # indices of right children self.F = arr([]) # feature to split on (-1 = leaf = predict) self.T = arr([]) # threshold to split on self.P = arr([]) # prediction value for node self.sz = 0 # size; also next node during construction if len(args) or len(kwargs): # if we were given optional arguments, self.train(*args, **kwargs) # just pass them through to "train"
def predict(self, X): """ This method makes a prediction on X using learned linear coefficients. Parameters ---------- X : numpy array N x M numpy array that contains N data points with M features. """ X_te = np.concatenate((np.ones((mat(X).shape[0],1)), X), axis=1) # extend features by including a constant feature return arr(mat(X_te) * mat(self.theta).T)
def logmatprod(ln_a, ln_b):
    '''
    ln_C[i, j] = log(sum(exp(ln_a[i, ...] + ln_b[:, j])))

    parameters
    ln_a: np.array(size_A, ...)
    ln_b: np.array(size_A, size_B)

    returns
    ln_C: np.array(size_A, size_B)
    '''
    from numpy import zeros
    ln_a = arr([ln_a]) if ln_a.ndim == 1 else ln_a
    ln_b = arr([ln_b]) if ln_b.ndim == 1 else ln_b
    I = ln_a.shape[0]
    J = ln_b.shape[1]
    ln_C = zeros((I, J))
    for i in range(I):
        for j in range(J):
            ln_C[i, j] = logsumexp(ln_a[i] + ln_b[:, j], -1)
    return ln_C
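# Illustrative verification sketch (not from the original source): in log space,
# logmatprod(log A, log B) should recover log(A @ B).  Assumes numpy is imported
# as np and that `logsumexp` comes from scipy.special, which the function body
# relies on; the demo name and matrices are hypothetical.
def _logmatprod_demo():
    A = np.array([[0.2, 0.8], [0.5, 0.5]])
    B = np.array([[0.6, 0.4], [0.1, 0.9]])
    return np.allclose(np.exp(logmatprod(np.log(A), np.log(B))), A.dot(B))   # -> True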
def train(self, X, Y, init='zeros', stepsize=.01, tolerance=1e-4, max_steps=500):
    """
    This method trains the neural network. Refer to constructor
    doc string for descriptions of arguments.
    """
    n, d = mat(X).shape          # d = number of features; n = number of training data
    L = len(self.wts) + 1        # number of layers

    # define desired activation function and its derivative for training
    sig, d_sig, sig_0, d_sig_0 = self.sig, self.d_sig, self.sig_0, self.d_sig_0

    # outer loop of gradient descent
    iter = 1                     # iteration number
    done = 0                     # end of loop flag
    surr = np.zeros((1, max_steps + 1)).ravel()   # surrogate loss values
    errs = np.zeros((1, max_steps + 1)).ravel()   # error rate values

    while not done:
        step_i = stepsize / iter

        # stochastic gradient update
        for i in range(n):
            A, Z = self.__responses(self.wts, X[i, :], sig, sig_0)   # compute all layers' responses, then backprop
            delta = (Z[L - 1] - Y[i]) * d_sig_0(Z[L - 1])            # take derivative of output layer

            for l in range(L - 2, 0, -1):
                grad = arr(mat(delta).T * mat(Z[l]))                 # compute gradient on current layer weights
                delta = np.dot(delta, self.wts[l]) * d_sig(Z[l])     # propagate gradient downwards
                delta = delta[1:]                                    # discard constant feature
                self.wts[l] = self.wts[l] - step_i * grad            # take gradient step on current layer weights

        # compute current error values
        errs[iter] = self.mse(X, Y)                                  # surrogate (mse on output)

        # check stopping conditions
        done = iter > 1 and (abs(errs[iter] - errs[iter - 1]) < tolerance or iter >= max_steps)
        iter += 1

        wts_old = self.wts
def predict(self, X): """ This method makes predictions on the test data X. Parameters ---------- X : M x N numpy array of M data points (N features each) at which to predict """ Y_te = self.__dectree_test(X, self.L, self.R, self.F, self.T, 0).T.ravel() return arr([[self.classes[int(i)]] for i in np.ravel(Y_te)])
def marginals(db, synthesis_type, variable_name, pumano, tract, bg): # Returns the marginals wrt the entered dimension for calculating the adjustment in each iteration dbc = db.cursor() dbc.execute('select %s, sum(frequency) from %s_%s_joint_dist where tract = %s and bg = %s group by %s' %( variable_name, synthesis_type, pumano, tract, bg, variable_name)) result = arr(dbc.fetchall(), float) marginal = [] for i in result: marginal.append(float(i[1])) dbc.close() db.commit() return marginal