def format_target_indices_for_regression_conditioning(data,
                                                      unreduced_fitness_values,
                                                      fitness_funcs,
                                                      fitness_dim):
    """Build the (indices, targets) pair used to condition a joint GMM on fitness.

    By convention the fitness columns are appended after the data columns,
    so the fitness indices start at the flattened width of one data row.
    """
    # create fitness targets
    fitness_targets = _format_fitness_targets_regression(
        fitness_funcs, unreduced_fitness_values, fitness_dim)
    # create fitness indices: they follow the data columns
    n_fitness = len(list(ut.flatten(fitness_targets)))
    n_data_cols = len(list(ut.flatten(data[0])))
    fitness_indices = np.arange(n_data_cols, n_data_cols + n_fitness)
    return fitness_indices, fitness_targets
def format_data_for_conditional(parent_sample, parent_vars, sibling_samples,
                                sibling_vars, sibling_order):
    """Flatten parent and sibling values into a single conditioning vector.

    Only the last ``sibling_order`` siblings contribute. The ordering —
    parent values first, then sibling values — is a convention that is also
    enforced in the sampling and learning procedures.
    """
    # parent condition variable
    parent_values = list(
        ut.flatten(
            [parent_sample.values["ind"][var.name] for var in parent_vars]))
    # sibling condition variable: take only the last `sibling_order` siblings
    recent_siblings = sibling_samples[-sibling_order:]
    sibling_values = list(
        ut.flatten([
            sibling.values["ind"][var.name] for var in sibling_vars
            for sibling in recent_siblings
        ]))
    values = np.concatenate((parent_values, sibling_values))
    # the first len(values) columns are the conditioned-on attributes
    indices = np.arange(len(values))
    return indices, values
def format_target_indices_for_regression_conditioning(data,
                                                      unreduced_fitness_values,
                                                      fitness_funcs,
                                                      fitness_dim):
    """Return conditioning indices and fitness targets for a joint GMM.

    The fitness targets occupy the columns directly after the data columns,
    so their indices run from the data width to data width + fitness width.
    """
    targets = _format_fitness_targets_regression(fitness_funcs,
                                                 unreduced_fitness_values,
                                                 fitness_dim)
    data_width = len(list(ut.flatten(data[0])))
    target_width = len(list(ut.flatten(targets)))
    return np.arange(data_width, data_width + target_width), targets
def format_data_for_training(parent, parent_var_names, siblings,
                             sibling_var_names):
    """Flatten parent values followed by per-sibling values into one data row."""
    parent_part = [parent.values["ind"][name] for name in parent_var_names]
    sibling_part = [[child.values["ind"][name] for name in sibling_var_names]
                    for child in siblings]
    return list(ut.flatten(parent_part + sibling_part))
def format_data_for_conditional(parent_sample, parent_vars, sibling_samples,
                                sibling_vars, sibling_order):
    """Assemble the conditioning vector from parent and recent sibling values.

    For sibling order ``i`` only the last ``i`` siblings are used. Parent
    values precede sibling values; this ordering is a convention also
    enforced by the sampling and learning procedures.
    """
    # flatten parent condition values
    cond_parent = [parent_sample.values["ind"][v.name] for v in parent_vars]
    cond_parent = list(ut.flatten(cond_parent))
    # flatten sibling condition values of the trailing siblings only
    trailing = sibling_samples[-sibling_order:]
    cond_siblings = [
        s.values["ind"][v.name] for v in sibling_vars for s in trailing
    ]
    cond_siblings = list(ut.flatten(cond_siblings))
    values = np.concatenate((cond_parent, cond_siblings))
    # the first len(values) entries are the conditioned-on attributes
    return np.arange(0, len(values)), values
def _reduce_fitness_dimension(fitness_value_line, fitness_dim, fitness_comb):
    """Collapse one line of fitness values along the instance/function dims.

    ``fitness_dim`` is a pair (instance dim, function dim); ``fitness_comb``
    selects how combined values are aggregated.

    Raises:
        ValueError: for (seperate instances, single function), which cannot
            be grouped because some fitness functions can be filtered for
            certain siblings.
    """
    inst_dim, func_dim = fitness_dim
    if inst_dim is FitnessInstanceDim.seperate and func_dim is FitnessFuncDim.seperate:
        # nothing to reduce
        return fitness_value_line
    if inst_dim is FitnessInstanceDim.single and func_dim is FitnessFuncDim.seperate:
        return [
            _combine_fitness(per_func_values, fitness_comb)
            for per_func_values in fitness_value_line
        ]
    if inst_dim is FitnessInstanceDim.single and func_dim is FitnessFuncDim.single:
        # wrap in a list because the fitness value array should stay 2D
        return [
            _combine_fitness(list(ut.flatten(fitness_value_line)),
                             fitness_comb)
        ]
    if inst_dim is FitnessInstanceDim.seperate and func_dim is FitnessFuncDim.single:
        # grouping per sibling is not possible: some fitness functions can be
        # filtered for certain siblings
        raise ValueError(
            "Seperate siblings and single fitness function values is not supported."
        )
def visualise(root_layout_sample, color_list, child_samples=None, ax=None):
    """Draw the shape polygons of a layout sample, coloured per sample name.

    Args:
        root_layout_sample: sample whose flat list is drawn when
            ``child_samples`` is None.
        color_list: iterable of (sample name, colour) pairs selecting which
            groups to draw and how.
        child_samples: optional explicit child samples drawn with the root.
        ax: optional matplotlib axes; created by ``vis.get_ax`` if None.
    """
    import model.mapping as mp
    if child_samples is None:
        samples = root_layout_sample.get_flat_list()
    else:
        samples = [root_layout_sample] + child_samples
    # group mapped polygons by sample name
    polygon_dict = dict([(sample.name, []) for sample in samples])
    for sample in samples:
        polygon_dict[sample.name].append(
            mp.map_layoutsample_to_geometricobject(sample, "shape"))
    # fix: removed a stray no-argument debug print() from this loop
    for name, color in color_list:
        vis.draw_polygons(polygons=polygon_dict[name], ax=ax, color=color)
    # fit the axes around all mapped polygons, not only the drawn names
    xrange, yrange = ut.range_from_polygons(
        list(ut.flatten(polygon_dict.values())))
    ax = vis.get_ax(ax)
    ax.set_xlim(*xrange)
    ax.set_ylim(*yrange)
def visualise(root_layout_sample, color_list, child_samples=None, ax=None):
    """Visualise the "shape" polygons of a layout sample by name/colour.

    When ``child_samples`` is None the root's flat sample list is used;
    otherwise the root plus the given children are drawn. The axes limits
    are fitted to the range of every mapped polygon.
    """
    import model.mapping as mp
    if child_samples is None:
        samples = root_layout_sample.get_flat_list()
    else:
        samples = [root_layout_sample] + child_samples
    # map polygons, keyed by sample name
    polygon_dict = dict([(sample.name, []) for sample in samples])
    for sample in samples:
        polygon_dict[sample.name].append(
            mp.map_layoutsample_to_geometricobject(sample, "shape"))
    # fix: dropped a leftover debug print() that emitted a blank line per entry
    for name, color in color_list:
        vis.draw_polygons(polygons=polygon_dict[name], ax=ax, color=color)
    xrange, yrange = ut.range_from_polygons(
        list(ut.flatten(polygon_dict.values())))
    ax = vis.get_ax(ax)
    ax.set_xlim(*xrange)
    ax.set_ylim(*yrange)
def _reduce_fitness_dimension(fitness_value_line, fitness_dim, fitness_comb):
    """Reduce a fitness value line according to (instance dim, function dim).

    Four combinations exist; (seperate, single) is rejected because some
    fitness functions can be filtered for certain siblings, so grouping per
    sibling is impossible.
    """
    instance_dim = fitness_dim[0]
    func_dim = fitness_dim[1]
    if instance_dim is FitnessInstanceDim.seperate:
        if func_dim is FitnessFuncDim.seperate:
            # fully separate: return untouched
            return fitness_value_line
        # seperate instances + single function value is unsupported
        raise ValueError(
            "Seperate siblings and single fitness function values is not supported."
        )
    # instance_dim is FitnessInstanceDim.single from here on
    if func_dim is FitnessFuncDim.seperate:
        combined = []
        for fn_func_value in fitness_value_line:
            combined.append(_combine_fitness(fn_func_value, fitness_comb))
        return combined
    # single/single: keep the result wrapped so the fitness array stays 2D
    flat_line = list(ut.flatten(fitness_value_line))
    return [_combine_fitness(flat_line, fitness_comb)]
def format_data_for_training(parent, parent_var_names, siblings,
                             sibling_var_names):
    """Build one flat training row: parent values, then each sibling's values."""
    nested = [parent.values["ind"][name] for name in parent_var_names]
    for child in siblings:
        nested.append(
            [child.values["ind"][name] for name in sibling_var_names])
    data = list(ut.flatten(nested))
    return data
def fitness_value_bounds(fitness_values):
    """Return per-fitness-function (min, max) bounds over all samples.

    Fix: the original rebuilt ``np.array(fitness_values)`` and re-flattened
    the same column twice per index (four conversions per column); the array
    is now built once and each column flattened once.
    """
    value_array = np.array(fitness_values)
    bounds = []
    for i in range(len(fitness_values[0])):
        column = list(ut.flatten(value_array[:, i]))
        bounds.append((np.min(column), np.max(column)))
    return bounds
def training(model, fitness_funcs, sibling_var_names, parent_var_names):
    """Iteratively retrain GMM child variables of *model* against fitness.

    Runs up to ``n_iter`` iterations of ``n_trial`` trials each; every trial
    fits one GMM per trained sibling order (regression-conditioned or
    fitness-weighted), marginalises the untrained orders, injects the
    variables into the children, and keeps the best-scoring set.

    Args:
        model: (parent_node, parent_def) pair to train on.
        fitness_funcs: fitness functions (only func order and cap are used).
        sibling_var_names / parent_var_names: names of the stochastic
            variables that define the data format.

    Returns:
        The score gain (final best score minus starting score).

    Raises:
        ValueError: for non-stochastic variables, an inconsistent sibling
            order sequence, or a mismatched ``gmm_full`` length.
    """
    sibling_order_sequence = training_params.sibling_order_sequence
    gmm_full = training_params.gmm_full
    sibling_data = training_params.sibling_data
    # fix: was `training_params.sibling_data` (copy-paste bug)
    fitness_dim = training_params.fitness_dim
    n_data = training_params.n_data
    poisson = training_params.poisson
    n_iter = training_params.n_iter
    n_trial = training_params.n_trial
    n_model_eval_data = training_params.n_model_eval_data
    n_components = training_params.n_components
    min_covar = training_params.min_covar
    regression = training_params.regression
    # experiment hyperparameters:
    fitness_average_threshhold = 0.95
    fitness_func_threshhold = 0.98
    # this sequence indicates the order of the markov chain between siblings:
    # [1,2] -> second child depends on the first, the third on the first and
    # second; the first child is always independent
    # NOTE(review): these hard-coded values override the training_params
    # settings read above — confirm that this is intentional
    sibling_data = dg.SiblingData.first
    fitness_dim = (dtfr.FitnessInstanceDim.seperate,
                   dtfr.FitnessFuncDim.seperate)
    # the sibling order defines the size of the joint distribution to train
    sibling_order = np.max(sibling_order_sequence)
    n_children = sibling_order + 1
    # gmm marginalisation [order_1,order_2,..,order_sibling_order]
    # True -> train full joint; False -> marginalise from closest higher order
    child_name = "child"
    # model to train on
    parent_node, parent_def = model
    child_nodes = parent_node.children[child_name]
    # training variables; this explicitly also defines the format of the data
    sibling_vars = [
        parent_def.children[child_name].variables[name]
        for name in sibling_var_names
    ]
    parent_vars = [parent_def.variables[name] for name in parent_var_names]
    if not all(var.stochastic() for var in sibling_vars + parent_vars):
        non_stoch_var = [
            var.name for var in sibling_vars + parent_vars
            if not var.stochastic()
        ]
        raise ValueError(
            "Only the distribution of stochastic variables can be trained on."
            " The variables " + str(non_stoch_var) + " are not stochastic")
    model_evaluation = mev.ModelEvaluation(
        n_model_eval_data, parent_def, parent_node, parent_var_names,
        child_name, sibling_var_names, fitness_funcs,
        fitness_average_threshhold, fitness_func_threshhold)
    score = model_evaluation.evaluate()
    startscore = score
    delta_score = 0
    print("score before training: ", score)
    # check sibling sequence: order i may depend on at most i predecessors
    wrong_sequence = any(sibling_order_sequence[i] > i
                         for i in range(len(sibling_order_sequence)))
    if wrong_sequence:
        print(sibling_order_sequence)
        raise ValueError(
            "Some orders of the sibling order sequence exceed the number of previous siblings."
        )
    max_children = parent_def.variable_range(child_name)[1]
    if len(sibling_order_sequence) != max_children:
        raise ValueError(
            "The number of siblings implied by the sibling order sequence can not be different than the maximum number of children in the model."
        )
    # check marginalisation (fix: build one message instead of passing a tuple)
    if len(gmm_full) != n_children:
        raise ValueError(
            "the array defining which sibling order to train seperately"
            " should have the same length as the maximum amount of children"
            " for a given sibling order.\n length array: " +
            str(len(gmm_full)) + ", expected: " + str(n_children))
    # do n_iter retrainings using the previously best model
    iteration_gmm_score = score
    # fix: pre-bind so the visualisation guard below is safe when n_iter == 0
    max_gmm_vars = None
    for iteration in range(n_iter):
        # find out the performance of the current model
        data, fitness_values = dg.training_data_generation(
            n_data, parent_def, parent_node, parent_var_names, child_name,
            sibling_var_names, n_children, fitness_funcs,
            sibling_data=sibling_data, poisson=poisson)
        if print_params.verbose_iter:
            model_evaluation.print_evaluation(
                fitness_values, iteration,
                summary=not print_params.print_fitness_bins)
        if model_evaluation.converged(fitness_values):
            # fix: return the gain so far instead of a bare None
            return delta_score
        gmm_vars_retry_eval = []
        # do n trials to find a better gmm for the model
        for trial in range(n_trial):
            # calculate all full joints
            gmms = [None] * n_children
            for child_index in np.where(gmm_full)[0]:
                # generate data for each number of children
                data, fitness_values = dg.training_data_generation(
                    n_data, parent_def, parent_node, parent_var_names,
                    child_name, sibling_var_names, child_index + 1,
                    fitness_funcs, sibling_data, poisson)
                gmm = GMM(n_components=n_components,
                          random_state=setting_values.random_state)
                data, fitness_values = dtfr.filter_fitness_and_data_training(
                    data, fitness_values, fitness_funcs)
                if regression:
                    fitness_values = dtfr.apply_fitness_order(
                        fitness_values, fitness_funcs)
                    fitness_regression = dtfr.reduce_fitness_dimension(
                        fitness_values, fitness_dim,
                        dtfr.FitnessCombination.product)
                    # renormalise
                    fitness_regression = dtfr.normalise_fitness(
                        fitness_regression)
                    fitness_regression = [
                        list(ut.flatten(fn_value_line))
                        for fn_value_line in fitness_regression
                    ]
                    # add fitness data and fit the full joint for regression
                    train_data = np.column_stack((data, fitness_regression))
                    gmm.fit(train_data, infinite=False, min_covar=min_covar)
                    indices, targets = \
                        dtfr.format_target_indices_for_regression_conditioning(
                            data, fitness_values, fitness_funcs, fitness_dim)
                    # condition on fitness
                    gmm = gmm.condition(indices, targets)
                else:
                    fitness_values = dtfr.apply_fitness_order(
                        fitness_values, fitness_funcs)
                    # reduce fitness to a single dimension
                    fitness_single = dtfr.reduce_fitness_dimension(
                        fitness_values, (dtfr.FitnessInstanceDim.single,
                                         dtfr.FitnessFuncDim.single),
                        dtfr.FitnessCombination.product)
                    # renormalise
                    fitness_single = dtfr.normalise_fitness(fitness_single)
                    gmm.fit(data, np.array(fitness_single)[:, 0],
                            infinite=False, min_covar=min_covar)
                gmms[child_index] = gmm
            # marginalise gmms, starting from the largest
            for child_index in reversed(range(n_children)):
                if not gmms[child_index]:
                    gmms[child_index] = dtfr.marginalise_gmm(
                        gmms, child_index, parent_vars, sibling_vars)
            gmm_var_name = "test" + str(iteration)
            gmm_vars = _construct_gmm_vars(gmms, gmm_var_name, parent_def,
                                           parent_node, child_name,
                                           parent_var_names,
                                           sibling_var_names)
            # assign child i the gmm for order sibling_order_sequence[i]
            for k in range(len(child_nodes)):
                child_nodes[k].set_learned_variable(
                    gmm_vars[sibling_order_sequence[k]])
            # evaluate new model
            score = model_evaluation.evaluate()
            gmm_vars_retry_eval.append((gmm_vars, score))
            if print_params.verbose_trial:
                print()
                print("trial: ", trial, " score: ", score)
            # put original vars back
            for i in range(len(child_nodes)):
                child_nodes[i].delete_learned_variable(gmm_var_name)
        # check which gmm performed best this iteration
        max_gmm_vars = None
        for gmm_vars, gmm_score in gmm_vars_retry_eval:
            if gmm_score > iteration_gmm_score:
                max_gmm_vars = gmm_vars
                iteration_gmm_score = gmm_score
                delta_score = iteration_gmm_score - startscore
        gmm_scores = [
            gmm_score for gmm_vars, gmm_score in gmm_vars_retry_eval
        ]
        print("iteration ", iteration, " trial score mean: ",
              np.mean(gmm_scores), " variance: ", np.var(gmm_scores))
        if max_gmm_vars:
            print("improved selected with score: ", iteration_gmm_score)
            for i in range(len(child_nodes)):
                child_nodes[i].set_learned_variable(
                    max_gmm_vars[sibling_order_sequence[i]])
        else:
            # fix: original message read "The did not improve..."
            print("The model did not improve over consecutive training iterations.")
            break
    if print_params.verbose_final_extra:
        print()
        print("final evaluation of fitness")
        data, fitness_values = dg.training_data_generation(
            n_model_eval_data, parent_def, parent_node, parent_var_names,
            child_name, sibling_var_names, n_children, fitness_funcs,
            sibling_data=sibling_data, poisson=False)
        model_evaluation.print_evaluation(
            fitness_values, -1, summary=not print_params.print_fitness_bins)
    print("score gain: ", str(delta_score))
    if print_params.visual and max_gmm_vars:
        for gmm_var in max_gmm_vars:
            vis.draw_1D_2D_GMM_variable_sampling(gmm_var,
                                                 training_params.title,
                                                 training_params.extra_info)
    if print_params.print_parameters_set:
        print("fitness parameters,")
        for fitn in fitness_funcs:
            print(str(fitn))
            print(",")
        print()
        print("model parameters")
        print("parent variables,", str(parent_var_names))
        print("sibling variables,", str(sibling_var_names))
        print()
    return delta_score
# Stand-alone experiment snippet: optionally fit a full joint GMM over
# (data, fitness) and condition it on the fitness targets.
gmm = GMM(n_components=15)
regression = False
if regression:
    # filter
    data, fitness_values = dtfr.filter_fitness_and_data_training(
        data, fitness_values, fitness_funcs)
    # apply order
    fitness_values = dtfr.apply_fitness_order(fitness_values, fitness_funcs)
    # reduce
    fitness_regression = dtfr.reduce_fitness_dimension(
        fitness_values, fitness_dim, dtfr.FitnessCombination.product)
    # renormalise
    fitness_regression = dtfr.normalise_fitness(fitness_regression)
    fitness_regression = [
        list(ut.flatten(fn_value_line))
        for fn_value_line in fitness_regression
    ]
    # add fitness data
    train_data = np.column_stack((data, fitness_regression))
    gmm = GMM(n_components=5)
    # for regression calculate full joint
    gmm.fit(train_data, infinite=False, min_covar=0.01)
    indices, targets = dtfr.format_target_indices_for_regression_conditioning(
        data, fitness_values, fitness_funcs, fitness_dim)
    # condition on fitness
    gmm = gmm.condition(indices, targets)
def training(model, fitness_funcs, sibling_var_names, parent_var_names):
    """Train GMM variables for the children of *model* against fitness.

    Each of up to ``n_iter`` iterations runs ``n_trial`` trials; a trial
    fits a GMM per trained sibling order (via regression conditioning or
    fitness weighting), marginalises untrained orders from the closest
    higher-order joint, evaluates the resulting model, and the best trial
    is kept when it beats the current score.

    Args:
        model: (parent_node, parent_def) pair.
        fitness_funcs: fitness functions (only func order and cap are used).
        sibling_var_names / parent_var_names: stochastic variable names
            that fix the data format.

    Returns:
        The score gain over the starting evaluation.

    Raises:
        ValueError: non-stochastic variables, an invalid sibling order
            sequence, or a ``gmm_full`` length mismatch.
    """
    sibling_order_sequence = training_params.sibling_order_sequence
    gmm_full = training_params.gmm_full
    sibling_data = training_params.sibling_data
    # fix: original read training_params.sibling_data here (copy-paste bug)
    fitness_dim = training_params.fitness_dim
    n_data = training_params.n_data
    poisson = training_params.poisson
    n_iter = training_params.n_iter
    n_trial = training_params.n_trial
    n_model_eval_data = training_params.n_model_eval_data
    n_components = training_params.n_components
    min_covar = training_params.min_covar
    regression = training_params.regression
    # experiment hyperparameters:
    fitness_average_threshhold = 0.95
    fitness_func_threshhold = 0.98
    # markov chain order between siblings: [1,2] -> second child depends on
    # the first, the third on the first and second; the first child is
    # always independent
    # NOTE(review): these hard-coded assignments override the values read
    # from training_params above — confirm intentional
    sibling_data = dg.SiblingData.first
    fitness_dim = (dtfr.FitnessInstanceDim.seperate,
                   dtfr.FitnessFuncDim.seperate)
    # the sibling order defines the size of the joint distribution trained
    sibling_order = np.max(sibling_order_sequence)
    n_children = sibling_order + 1
    # gmm marginalisation flags [order_1,..,order_sibling_order]:
    # True -> train full joint; False -> marginalise from closest higher order
    child_name = "child"
    # model to train on
    parent_node, parent_def = model
    child_nodes = parent_node.children[child_name]
    # training variables; this explicitly also defines the data format
    sibling_vars = [
        parent_def.children[child_name].variables[name]
        for name in sibling_var_names
    ]
    parent_vars = [parent_def.variables[name] for name in parent_var_names]
    if not all(var.stochastic() for var in sibling_vars + parent_vars):
        non_stoch_var = [
            var.name for var in sibling_vars + parent_vars
            if not var.stochastic()
        ]
        raise ValueError(
            "Only the distribution of stochastic variables can be trained on."
            " The variables " + str(non_stoch_var) + " are not stochastic")
    model_evaluation = mev.ModelEvaluation(
        n_model_eval_data, parent_def, parent_node, parent_var_names,
        child_name, sibling_var_names, fitness_funcs,
        fitness_average_threshhold, fitness_func_threshhold)
    score = model_evaluation.evaluate()
    startscore = score
    delta_score = 0
    print("score before training: ", score)
    # check sibling sequence: order i may depend on at most i predecessors
    wrong_sequence = any(sibling_order_sequence[i] > i
                         for i in range(len(sibling_order_sequence)))
    if wrong_sequence:
        print(sibling_order_sequence)
        raise ValueError(
            "Some orders of the sibling order sequence exceed the number of previous siblings."
        )
    max_children = parent_def.variable_range(child_name)[1]
    if len(sibling_order_sequence) != max_children:
        raise ValueError(
            "The number of siblings implied by the sibling order sequence can not be different than the maximum number of children in the model."
        )
    # check marginalisation (fix: single message string, not a tuple of args)
    if len(gmm_full) != n_children:
        raise ValueError(
            "the array defining which sibling order to train seperately"
            " should have the same length as the maximum amount of children"
            " for a given sibling order.\n length array: " +
            str(len(gmm_full)) + ", expected: " + str(n_children))
    # do n_iter retrainings using the previously best model
    iteration_gmm_score = score
    # fix: pre-bind so later references are safe when n_iter == 0
    max_gmm_vars = None
    for iteration in range(n_iter):
        # performance of the current model
        data, fitness_values = dg.training_data_generation(
            n_data, parent_def, parent_node, parent_var_names, child_name,
            sibling_var_names, n_children, fitness_funcs,
            sibling_data=sibling_data, poisson=poisson)
        if print_params.verbose_iter:
            model_evaluation.print_evaluation(
                fitness_values, iteration,
                summary=not print_params.print_fitness_bins)
        if model_evaluation.converged(fitness_values):
            # fix: return the gain so far instead of a bare None
            return delta_score
        gmm_vars_retry_eval = []
        # n trials to find a better gmm for the model
        for trial in range(n_trial):
            # calculate all full joints
            gmms = [None] * n_children
            for child_index in np.where(gmm_full)[0]:
                # generate data for each number of children
                data, fitness_values = dg.training_data_generation(
                    n_data, parent_def, parent_node, parent_var_names,
                    child_name, sibling_var_names, child_index + 1,
                    fitness_funcs, sibling_data, poisson)
                gmm = GMM(n_components=n_components,
                          random_state=setting_values.random_state)
                data, fitness_values = dtfr.filter_fitness_and_data_training(
                    data, fitness_values, fitness_funcs)
                if regression:
                    fitness_values = dtfr.apply_fitness_order(
                        fitness_values, fitness_funcs)
                    fitness_regression = dtfr.reduce_fitness_dimension(
                        fitness_values, fitness_dim,
                        dtfr.FitnessCombination.product)
                    # renormalise
                    fitness_regression = dtfr.normalise_fitness(
                        fitness_regression)
                    fitness_regression = [
                        list(ut.flatten(fn_value_line))
                        for fn_value_line in fitness_regression
                    ]
                    # append fitness columns and fit the full joint
                    train_data = np.column_stack((data, fitness_regression))
                    gmm.fit(train_data, infinite=False, min_covar=min_covar)
                    indices, targets = \
                        dtfr.format_target_indices_for_regression_conditioning(
                            data, fitness_values, fitness_funcs, fitness_dim)
                    # condition on fitness
                    gmm = gmm.condition(indices, targets)
                else:
                    fitness_values = dtfr.apply_fitness_order(
                        fitness_values, fitness_funcs)
                    # reduce fitness to a single dimension
                    fitness_single = dtfr.reduce_fitness_dimension(
                        fitness_values, (dtfr.FitnessInstanceDim.single,
                                         dtfr.FitnessFuncDim.single),
                        dtfr.FitnessCombination.product)
                    # renormalise
                    fitness_single = dtfr.normalise_fitness(fitness_single)
                    gmm.fit(data, np.array(fitness_single)[:, 0],
                            infinite=False, min_covar=min_covar)
                gmms[child_index] = gmm
            # marginalise gmms, starting from the largest
            for child_index in reversed(range(n_children)):
                if not gmms[child_index]:
                    gmms[child_index] = dtfr.marginalise_gmm(
                        gmms, child_index, parent_vars, sibling_vars)
            gmm_var_name = "test" + str(iteration)
            gmm_vars = _construct_gmm_vars(gmms, gmm_var_name, parent_def,
                                           parent_node, child_name,
                                           parent_var_names,
                                           sibling_var_names)
            # assign child k the gmm for order sibling_order_sequence[k]
            for k in range(len(child_nodes)):
                child_nodes[k].set_learned_variable(
                    gmm_vars[sibling_order_sequence[k]])
            # evaluate new model
            score = model_evaluation.evaluate()
            gmm_vars_retry_eval.append((gmm_vars, score))
            if print_params.verbose_trial:
                print()
                print("trial: ", trial, " score: ", score)
            # put original vars back
            for i in range(len(child_nodes)):
                child_nodes[i].delete_learned_variable(gmm_var_name)
        # pick the best-performing gmm of this iteration
        max_gmm_vars = None
        for gmm_vars, gmm_score in gmm_vars_retry_eval:
            if gmm_score > iteration_gmm_score:
                max_gmm_vars = gmm_vars
                iteration_gmm_score = gmm_score
                delta_score = iteration_gmm_score - startscore
        gmm_scores = [
            gmm_score for gmm_vars, gmm_score in gmm_vars_retry_eval
        ]
        print("iteration ", iteration, " trial score mean: ",
              np.mean(gmm_scores), " variance: ", np.var(gmm_scores))
        if max_gmm_vars:
            print("improved selected with score: ", iteration_gmm_score)
            for i in range(len(child_nodes)):
                child_nodes[i].set_learned_variable(
                    max_gmm_vars[sibling_order_sequence[i]])
        else:
            # fix: original message read "The did not improve..."
            print("The model did not improve over consecutive training iterations.")
            break
    if print_params.verbose_final_extra:
        print()
        print("final evaluation of fitness")
        data, fitness_values = dg.training_data_generation(
            n_model_eval_data, parent_def, parent_node, parent_var_names,
            child_name, sibling_var_names, n_children, fitness_funcs,
            sibling_data=sibling_data, poisson=False)
        model_evaluation.print_evaluation(
            fitness_values, -1, summary=not print_params.print_fitness_bins)
    print("score gain: ", str(delta_score))
    if print_params.visual and max_gmm_vars:
        for gmm_var in max_gmm_vars:
            vis.draw_1D_2D_GMM_variable_sampling(gmm_var,
                                                 training_params.title,
                                                 training_params.extra_info)
    if print_params.print_parameters_set:
        print("fitness parameters,")
        for fitn in fitness_funcs:
            print(str(fitn))
            print(",")
        print()
        print("model parameters")
        print("parent variables,", str(parent_var_names))
        print("sibling variables,", str(sibling_var_names))
        print()
    return delta_score
def fitness_value_bounds(fitness_values):
    """Compute the (min, max) bound of each fitness function's values.

    Fix: converts ``fitness_values`` to an ndarray once and flattens each
    column once, instead of repeating both conversions for every min/max.
    """
    arr = np.array(fitness_values)
    return [
        (np.min(flat), np.max(flat))
        for flat in (list(ut.flatten(arr[:, i]))
                     for i in range(len(fitness_values[0])))
    ]