def random_forest_measure_attributes(data, classify, nrOfTrees=100,
                                     attributes=None):
    '''
    Rank the attributes of the experiments by their random forest
    importance, using orange.

    For more details see `orange ensemble
    <http://orange.biolab.si/doc/modules/orngEnsemble.htm>`_

    :param data: data from :meth:`perform_experiments`.
    :param classify: function for classifying runs.
    :param nrOfTrees: number of trees in the forest (default: 100).
    :param attributes: Number of attributes used in a randomly drawn subset
                       when searching for best attribute to split the node
                       in tree growing. (default: None, and if kept this
                       way, this is turned into square root of attributes
                       in example set)
    :rtype: list of (name, importance) tuples, sorted from the highest to
            the lowest importance.

    '''
    data = build_orange_data(data, classify)

    # see http://orange.biolab.si/doc/modules/orngEnsemble.htm for details
    info("executing random forest for attribute selection")
    measure = orngEnsemble.MeasureAttribute_randomForests(
        trees=nrOfTrees, attributes=attributes)

    # one importance value per attribute, in domain order
    imps = measure.importances(data)

    # decorate-sort-undecorate: sort on (importance, name) so that ties are
    # broken on the name, exactly as a plain tuple sort does
    decorated = [(imp, data.domain.attributes[i].name)
                 for i, imp in enumerate(imps)]
    decorated.sort(reverse=True)
    return [(name, imp) for imp, name in decorated]
def distance_sse(data):
    '''
    Pairwise SSE (sum of squared-errors) distances between data series.

    The SSE distance between two data series of length N is the sum over
    i = 1..N of (ds1(i) - ds2(i))^2. Because it compares the series point
    by point, the series have to be of equal length. The SSE distance is
    the square of the Euclidian distance.

    :param data: array with one data series per row.
    :returns: tuple (dRow, runLogs). dRow holds one distance for every
              pair (i, j) with i < j, ordered by i and then by j. runLogs
              holds for every series a tuple of a description dict (with
              key 'Index') and the series itself.

    '''
    info("calculating distances")

    nr_series = data.shape[0]
    # one slot for each unordered pair of series
    distances = np.zeros(shape=(nr_series * (nr_series - 1) // 2, ))
    logs = []

    slot = 0
    for i in range(nr_series):
        # the log entry keeps the index of the run for post-clustering
        # analysis, next to the data series itself
        logs.append(({'Index': str(i)}, data[i]))

        for j in range(i + 1, nr_series):
            distances[slot] = ssedist(data[i], data[j])
            slot += 1

    return distances, logs
def determine_intersecting_uncertainties(self):
    '''
    Determine which uncertainties are shared by all model structures.

    Two uncertainties count as the same only if ``identity()`` returns the
    same value for both; the name alone is not enough, the ranges should
    match as well.

    :returns: tuple (shared_uncertainties, unshared).
              shared_uncertainties is always a list, ordered like the
              uncertainties of the first model structure. unshared is a
              dict that maps the name of each model structure to the list
              of its uncertainties that are not shared, in the order of
              that model structure; it is None if there is only a single
              model structure.

    '''
    model_structures = self._modelStructures

    if len(model_structures) > 1:
        # identities per model structure, in the order of its uncertainties
        identities = [[uncertainty.identity()
                       for uncertainty in msi.uncertainties]
                      for msi in model_structures]
        shared_ids = set(identities[0]).intersection(*identities[1:])

        # walk over the uncertainties themselves instead of over the sets
        # of identities, so the resulting order is reproducible
        unshared = {}
        for msi, ids in zip(model_structures, identities):
            unshared[msi.name] = [uncertainty for uncertainty, identity
                                  in zip(msi.uncertainties, ids)
                                  if identity not in shared_ids]

        shared_uncertainties = [uncertainty for uncertainty, identity
                                in zip(model_structures[0].uncertainties,
                                       identities[0])
                                if identity in shared_ids]
        info("intersection contains %s uncertainties"
             % len(shared_uncertainties))
    else:
        # a single model structure: everything is shared. Duplicates are
        # dropped (as a set would do) but the original order is kept.
        seen = set()
        shared_uncertainties = []
        for uncertainty in model_structures[0].uncertainties:
            if uncertainty not in seen:
                seen.add(uncertainty)
                shared_uncertainties.append(uncertainty)
        unshared = None

    return shared_uncertainties, unshared
def _get_population(self):
    '''
    Return the population for the next generation.

    If no restart is required, this defers to the parent class. Otherwise
    the population is rebuilt, the tournament size is updated, the new
    individuals are evaluated and the next generation is selected from the
    current and the new population.

    '''
    if not self._restart_required():
        return super(epsNSGA2, self)._get_population()

    self.called += 1
    self.last_eps_progress = 0
    rebuilt = self._rebuild_population()

    # update selection pressure, the tournament size is never below 2
    pressure = self.selection_presure * self.pop_size
    self.tournament_size = int(max(2, pressure))

    archive_size = len(self.archive.items)
    ema_logging.info(self.message.format(self.pop_size,
                                         archive_size,
                                         self.tournament_size))

    # evaluate the new individuals
    self.evaluate_population(rebuilt,
                             self.reporting_interval,
                             self.toolbox,
                             self.ensemble)

    # select the next generation from the old and the new individuals
    candidates = self.pop + rebuilt
    self.pop = self.toolbox.select(candidates, self.pop_size)

    self.stats_callback(self.pop)
    self.stats_callback.log_stats(self.called)
    return self.pop
def random_forest(data, classify, nrOfTrees=100, attributes=None):
    '''
    Set up a random forest object using orange.

    For more details see `orange ensemble
    <http://orange.biolab.si/doc/modules/orngEnsemble.htm>`_

    :param data: data from :meth:`perform_experiments`.
    :param classify: function for classifying runs.
    :param nrOfTrees: number of trees in the forest (default: 100).
    :param attributes: Number of attributes used in a randomly drawn subset
                       when searching for best attribute to split the node
                       in tree growing (default: None, and if kept this
                       way, this is turned into square root of attributes
                       in example set).
    :rtype: the ``orngEnsemble.MeasureAttribute_randomForests`` instance
            that was created.

    '''
    # NOTE(review): the orange data is built (which logs and validates the
    # input) but it is not handed to the object created below, and that
    # object is an attribute measure rather than a trained forest --
    # confirm whether this is intended.
    build_orange_data(data, classify)

    # see http://orange.biolab.si/doc/modules/orngEnsemble.htm for details
    info("executing random forest")
    return orngEnsemble.MeasureAttribute_randomForests(trees=nrOfTrees,
                                                       attributes=attributes)
def _generate_cases(self, nrOfCases):
    '''
    Generate the cases over the uncertainties returned by
    :meth:`determine_intersecting_uncertainties`.

    The designs come from ``self.sampler.generate_design``; the meaning of
    nrOfCases therefore depends on the sampler that is used.

    :param nrOfCases: In case of Latin Hypercube sampling and Monte Carlo
                      sampling, nrOfCases specifies the number of cases to
                      generate. In case of Full Factorial sampling,
                      nrOfCases specifies the resolution to use for
                      sampling continuous uncertainties.
    :returns: tuple (cases, shared_uncertainties). cases is a list with
              one dict per design, mapping a name to the sampled value.

    '''
    # the unshared uncertainties are not used for generating cases
    shared_uncertainties, _ = self.determine_intersecting_uncertainties()

    info("generating cases")
    sampled = self.sampler.generate_design(shared_uncertainties, nrOfCases)
    designs = sampled[0]
    names = sampled[1]

    # the i-th entry of a design belongs to the i-th name
    cases = [dict((name, design[i]) for i, name in enumerate(names))
             for design in designs]

    info(str(len(cases)) + " cases generated")
    return cases, shared_uncertainties
def perform_prim(x, y, box_init = None, peel_alpha = 0.05, paste_alpha = 0.05,
                 mass_min = 0.05, threshold = None, pasting = False,
                 threshold_type = 1, cases_of_interest = None,
                 obj_func = None):
    '''
    Run PRIM on x and y and return the boxes after the highest density
    region step (:func:`prim_hdr`).

    :param x: the experiments; indexed with the boolean result of
              ``in_box`` when box_init is given.
    :param y: the outcome per experiment, multiplied by threshold_type
              before the boxes are searched.
    :param box_init: initial box. If None, a box is made from x with
                     ``make_box``; otherwise only the points inside
                     box_init are kept.
    :param peel_alpha: passed on to ``find_boxes``.
    :param paste_alpha: passed on to ``find_boxes``.
    :param mass_min: passed on to ``find_boxes``; it also bounds the
                     number of boxes to ceil(1/mass_min).
    :param threshold: threshold on y. If None, the mean of y is used.
    :param pasting: passed on to ``find_boxes``.
    :param threshold_type: 1 or -1; the sign with which y is multiplied.
    :param cases_of_interest: passed on to ``find_boxes``.
    :param obj_func: passed on to ``find_boxes``.
    :returns: the list of boxes as returned by ``prim_hdr``.

    This function stores ``t_coi``, ``threshold``, ``threshold_type`` and
    (if box_init is None) ``box_init`` as attributes on the ``Prim`` class.

    '''
    # identity checks: x, y and box_init may be numpy arrays, for which
    # ``== None`` is an element wise comparison
    if threshold is None:
        threshold = np.mean(y)

    n = y.shape[0]
    y = y * threshold_type

    # 1.0 keeps this a true division, also for an integer mass_min
    k_max = int(np.ceil(1.0/mass_min))
    info("max number of boxes: %s" %(k_max))

    if threshold_type==1:
        Prim.t_coi = np.sum(np.abs(y[(y) >= (threshold)]))
    else:
        Prim.t_coi = np.sum(np.abs(y[(y) <= (threshold)]))
    Prim.threshold = threshold
    Prim.threshold_type = threshold_type

    if box_init is None:
        box_init = make_box(x)
        Prim.box_init = box_init
        box_init = Prim(x, y, box_init, 1)
    else:
        # else, identify all points in initial box, rest is discarded
        # NOTE(review): in this branch box_init is not wrapped in a Prim
        # and Prim.box_init is not updated, unlike the branch above --
        # confirm that find_boxes and prim_hdr can deal with that.
        logical = in_box(x, box_init)
        x = x[logical]
        y = y[logical]

    boxes = find_boxes(x, y, box_init,
                       peel_alpha, paste_alpha, mass_min,
                       threshold, pasting, 0, k_max, n, cases_of_interest,
                       obj_func)

    # adjust for negative hdr
    for box in boxes:
        box.y = threshold_type*box.y
        box.y_mean = threshold_type*box.y_mean

    # the list of found boxes has the dump box as first element
    # we need to reverse the ordering to get the correct order in which
    # the boxes have been found
    boxes.reverse()
    boxes = prim_hdr(boxes, threshold, threshold_type, Prim.box_init)

    return boxes
def construct_features(data, trendThold, crisisThold):
    '''
    Build the feature matrix for the data series in data.

    Every row of data is passed, together with the two thresholds, to
    ``construct_feature_vector``; the result is stored as a row of three
    values.

    :param data: array with one data series per row.
    :param trendThold: passed on to ``construct_feature_vector``.
    :param crisisThold: passed on to ``construct_feature_vector``.
    :returns: array of shape (number of series, 3).

    '''
    info("calculating features")

    nr_series = data.shape[0]
    feature_matrix = np.zeros(shape=(nr_series, 3))

    for row in range(nr_series):
        series = data[row, :]
        feature_matrix[row, :] = construct_feature_vector(series,
                                                          trendThold,
                                                          crisisThold)
    return feature_matrix
def do_text_ticks_labels(ax, i, j, field1, field2, ylabels,
                         outcomes_to_show):
    '''
    Helper function for setting the text, the tick labels and the axis
    labels of a single axes in a grid of axes.

    :param ax: the axes to modify.
    :param i: row index of the axes in the grid.
    :param j: column index of the axes in the grid.
    :param field1: name of the outcome on the y axis.
    :param field2: name of the outcome on the x axis.
    :param ylabels: dict mapping outcome names to labels, or something
                    falsy to use the outcome names themselves.
    :param outcomes_to_show: the outcomes in the grid; only its length is
                             used, to recognize the last row.

    If ylabels is given but has no entry for an axis label, this is logged
    and that axis label is left untouched.

    '''
    # on the diagonal only the name is shown, in the middle of the axes
    if i == j:
        if ylabels:
            text = ylabels[field1]
        else:
            text = field1
        ax.text(0.5, 0.5, text,
                horizontalalignment='center',
                verticalalignment='center',
                transform = ax.transAxes)

    # are we at the end of the row?
    if i != len(outcomes_to_show)-1:
        # xaxis off
        ax.set_xticklabels([])
    elif ylabels:
        # index instead of .get(): .get() never raises, so a missing
        # label would silently be set to None
        try:
            label = ylabels[field2]
        except KeyError:
            info("no label specified for "+field2)
        else:
            ax.set_xlabel(label)
    else:
        ax.set_xlabel(field2)

    # are we at the end of the column?
    if j != 0:
        # yaxis off
        ax.set_yticklabels([])
    elif ylabels:
        try:
            label = ylabels[field1]
        except KeyError:
            info("no label specified for "+field1)
        else:
            ax.set_ylabel(label)
    else:
        ax.set_ylabel(field1)
def build_orange_data(data,classify):
    '''
    Helper function for turning the data from :meth:`perform_experiments`
    into a data object that can be used by the various orange functions.

    For more details see `orange domain
    <http://orange.biolab.si/doc/reference/Domain.htm>`_

    :param data: return from :meth:`perform_experiments`, a tuple of the
                 experiments and the results.
    :param classify: function to be used for determining the class for
                     each run; called with the results.
    :returns: an ``orange.ExampleTable`` with one attribute per field of
              the experiments and the class as last attribute.

    '''
    info("building orange data")

    experiments, results = data

    # build the domain: one attribute per field of the experiments
    attributes = []
    for description in experiments.dtype.descr:
        name = description[0]
        type_name = experiments.dtype.fields.get(name)[0].name

        # NOTE(review): numpy names integer dtypes 'int32', 'int64', ...,
        # so the 'int' test presumably never matches and integer fields
        # end up as FLOAT -- confirm before relying on it.
        if type_name in ('int', 'object'):
            attribute = ENUM(name)
            for value in set(experiments[name].tolist()):
                attribute.addValue(str(value))
        else:
            column = experiments[name]
            attribute = FLOAT(name,
                              startValue = np.min(column),
                              endValue = np.max(column))
        attributes.append(attribute)

    table = np.array(experiments.tolist())

    # determine classes
    classes = classify(results)
    classVar = ENUM('class')
    # these numbers are merely referring to the possible classes
    for label in set(classes.tolist()):
        classVar.addValue(str(label))

    # by default the last entry in the list should be the class variable
    attributes.append(classVar)
    domain = orange.Domain(attributes)

    # the class becomes the last column of the table
    table = np.hstack((table, classes[:, np.newaxis]))
    return orange.ExampleTable(domain, table.tolist())
def test_perform_experiments():
    '''
    Run an ensemble of three dummy models with partly overlapping
    uncertainties, once over the union and once over the intersection of
    the uncertainties.
    '''
    # what are all the test cases?
    # test for error in case uncertainty by same name but different
    # in other respects

    model_a = DummyInterface(None, "A")
    model_b = DummyInterface(None, "B")
    model_c = DummyInterface(None, "C")
    models = [model_a, model_b, model_c]

    # let's add some uncertainties to this; all have the range (0, 1) and
    # the name tells which models share them
    names = ["shared abc 1", "shared abc 2", "shared ab 1", "shared bc 1",
             "a 1", "b 1"]
    uncertainty = {}
    for name in names:
        uncertainty[name] = ParameterUncertainty((0,1), name)

    model_a.uncertainties = [uncertainty["shared abc 1"],
                             uncertainty["shared abc 2"],
                             uncertainty["shared ab 1"],
                             uncertainty["a 1"]]
    model_b.uncertainties = [uncertainty["shared abc 1"],
                             uncertainty["shared abc 2"],
                             uncertainty["shared ab 1"],
                             uncertainty["shared bc 1"],
                             uncertainty["b 1"]]
    model_c.uncertainties = [uncertainty["shared abc 1"],
                             uncertainty["shared abc 2"],
                             uncertainty["shared bc 1"]]

    # let's add an outcome to this; every model gets its own list
    outcome_shared = Outcome("test", time=True)
    for model in models:
        model.outcomes = [outcome_shared]

    ensemble = ModelEnsemble()
    ensemble.parallel=True
    ensemble.add_model_structures(models)

    ema_logging.info('------------- union of uncertainties -------------')
    ensemble.perform_experiments(10, which_uncertainties=UNION,
                                 reporting_interval=1 )

    ema_logging.info('------------- intersection of uncertainties -------------')
    ensemble.perform_experiments(10, which_uncertainties=INTERSECTION,
                                 reporting_interval=1)
def make_data_structure(clusters, distRow, runLogs):
    '''
    Build a ``Cluster`` object for every cluster.

    For each cluster the distances between its members are taken from
    distRow, and the member with the lowest sum of distances to the other
    members is passed to ``Cluster`` as the original index.

    :param clusters: array with for every run the number of the cluster it
                     belongs to; clusters are numbered from 1.
    :param distRow: the distances between the runs in condensed form,
                    indexed through ``get_drow_index``.
    :param runLogs: sequence with one log per run, indexed by run.
    :returns: list of ``Cluster`` objects, ordered by cluster number.

    '''
    nr_clusters = np.max(clusters)
    cluster_list = []
    for i in range(1, nr_clusters+1):
        info("starting with cluster %s" %i)

        # the indices of the runs in cluster i
        indices = np.where(clusters==i)[0]
        nr_members = indices.shape[0]

        # number of pairs within the cluster; floor division keeps this an
        # integer, which is needed for the size of an array
        drow_indices = np.zeros((nr_members**2-nr_members)//2, dtype=int)

        # get the indices into distRow for the pairs of runs in the cluster
        s = 0
        for q in range(nr_members):
            for r in range(q+1, nr_members):
                drow_indices[s] = get_drow_index(indices[r],
                                                 indices[q],
                                                 clusters.shape[0])
                s += 1

        # get the distances for the runs in the cluster
        dist_clust = distRow[drow_indices]

        # make a distance matrix and sum across the rows
        dist_matrix = squareform(dist_clust)
        row_sum = dist_matrix.sum(axis=0)

        # the member with the lowest sum of distances, converted from the
        # cluster specific index back to the index of the run
        min_cIndex = row_sum.argmin()
        originalIndex = indices[min_cIndex]

        # parenthesized so that it prints the same on python 2 and 3
        print(originalIndex)

        members = [runLogs[int(entry)] for entry in indices]
        cluster = Cluster(i,
                          indices,
                          originalIndex,
                          members,
                          dist_clust)
        cluster_list.append(cluster)
    return cluster_list
def log_stats(self, gen):
    '''
    Log the minima, maxima, standard deviation and mean of the hall of
    fame on a single line.

    :param gen: the generation, used as prefix of the logged line.

    '''
    stat_functions = {"minima":self.minima,
                      "maxima":self.maxima,
                      "std":self.std,
                      "mean":self.mean,}
    hof = self.__get_hof_in_array()

    # the statistics are reported in alphabetical order of their name
    ordered_names = sorted(stat_functions.keys())

    formatted = {}
    for name in ordered_names:
        values = stat_functions[name](hof)
        formatted[name] = "[%s]" % ", ".join(map(self.precision.format,
                                                 values))

    # one left aligned field of at least 8 characters per statistic
    template = " ".join("{%s:<8}" % name for name in ordered_names)
    line = "generation %s: " %gen + template.format(**formatted)
    ema_logging.info(line)
def perform_prim_specific(x, y, box_init = None, peel_alpha = 0.05,
                          paste_alpha = 0.05, mass_min = 0.05,
                          threshold = None, pasting = False,
                          threshold_type = 1, cases_of_interest = None,
                          obj_func = None):
    '''
    Run PRIM on x and y and return both the boxes after the highest
    density region step (:func:`prim_hdr`) and the experiments per box
    that was found.

    In contrast to the threshold handed to ``prim_hdr``, the threshold
    handed to ``find_boxes`` is a value just below the lowest value of y.

    :param x: the experiments; indexed with the boolean result of
              ``in_box`` when box_init is given.
    :param y: the outcome per experiment, multiplied by threshold_type
              before the boxes are searched.
    :param box_init: initial box. If None, a box is made from x with
                     ``make_box``; otherwise only the points inside
                     box_init are kept.
    :param peel_alpha: passed on to ``find_boxes``.
    :param paste_alpha: passed on to ``find_boxes``.
    :param mass_min: passed on to ``find_boxes``; it also bounds the
                     number of boxes to ceil(1/mass_min).
    :param threshold: threshold for ``prim_hdr``. If None, the mean of y
                      is used.
    :param pasting: passed on to ``find_boxes``.
    :param threshold_type: 1 or -1; the sign with which y is multiplied.
    :param cases_of_interest: passed on to ``find_boxes``.
    :param obj_func: passed on to ``find_boxes``.
    :returns: tuple (boxes, exps). boxes is the list returned by
              ``prim_hdr``; exps holds the x of every box found by
              ``find_boxes``, in the order in which they were found.

    '''
    # identity checks: x, y and box_init may be numpy arrays, for which
    # ``== None`` is an element wise comparison
    if threshold is None:
        threshold = np.mean(y)

    # 1.0 keeps this a true division, also for an integer mass_min
    k_max = int(np.ceil(1.0/mass_min))
    info("max number of boxes: %s" %(k_max))

    if box_init is None:
        box_init = make_box(x)
    else:
        # else, identify all points in initial box, rest is discarded
        logical = in_box(x, box_init)
        x = x[logical]
        y = y[logical]

    n = y.shape[0]
    y = y * threshold_type

    # a threshold below every value of y
    lowest = np.min(y)
    boxes = find_boxes(x, y, box_init,
                       peel_alpha, paste_alpha, mass_min,
                       lowest-0.1*np.abs(lowest),
                       pasting, 0, k_max, n, cases_of_interest, obj_func)

    # adjust for negative hdr
    exps = []
    for box in boxes:
        box.y = threshold_type*box.y
        box.y_mean = threshold_type*box.y_mean
        exps.append(box.x)

    # the list of found boxes has the dump box as first element
    # we need to reverse the ordering to get the correct order in which
    # the boxes have been found
    boxes.reverse()
    exps.reverse()
    boxes = prim_hdr(boxes, threshold, threshold_type)

    return boxes, exps
def filter_scalar_outcomes(outcomes):
    '''
    Helper function that removes non time series outcomes from all the
    outcomes.

    An outcome is removed if its shape has fewer than two dimensions. The
    dict that is passed in is modified in place.

    :param outcomes: dict mapping outcome names to arrays.
    :return: the filtered outcomes; this is the same dict as the one that
             was passed in.

    '''
    # collect first, remove afterwards: a dict should not change size
    # while it is being iterated over
    not_time_series = []
    for key, value in outcomes.items():
        if len(value.shape) >= 2:
            continue
        not_time_series.append(key)
        info("%s not shown because it is not time series data" %key)

    for key in not_time_series:
        del outcomes[key]
    return outcomes
def feature_selection(data, classify, k=5, m=100):
    '''
    Perform feature selection using orange.

    For more details see `orange feature selection
    <http://orange.biolab.si/doc/modules/orngFSS.htm>`_ and
    `orange measure attribute
    <http://orange.biolab.si/doc/reference/MeasureAttribute.htm>`_

    The measure that is used is ReliefF (MeasureAttribute_relief in
    Orange).

    :param data: data from :meth:`perform_experiments`.
    :param classify: function for classifying runs.
    :param k: the number of neighbors for each example (default 5).
    :param m: number of examples to use, Set to -1 to use all (default
              100).
    :rtype: list of tuples with uncertainty names and reliefF attribute
            scores, sorted from the highest to the lowest score.

    Orange provides other metrics for feature selection

    * Information Gain
    * Gain ratio
    * Gini index
    * Relevance of attributes
    * Costs

    If you want to use any of these instead of ReliefF, use the code
    supplied here as a template, but modify the measure. That is
    replace::

        measure = orange.MeasureAttribute_relief(k=k, m=m)

    with the measure of choice. See the above provided links for more
    details.

    '''
    data = build_orange_data(data, classify)

    info("executing feature selection")
    measure = orange.MeasureAttribute_relief(k=k, m=m)
    scored = orngFSS.attMeasure(data, measure)

    # sort on (score, name), highest first, and then put the name in front
    # again
    ranking = sorted(((entry[1], entry[0]) for entry in scored),
                     reverse=True)
    return [(name, score) for score, name in ranking]
def __call__(self, case, policy, name, result):
    '''
    Method responsible for storing results. The implementation here only
    counts the completed cases and logs the count: always at debug level,
    and at info level for every multiple of the reporting interval.

    :param case: the case to be stored
    :param policy: the name of the policy being used
    :param name: the name of the model being used
    :param result: the result dict

    '''
    self.i += 1
    message = str(self.i)+" cases completed"

    debug(message)
    if not self.i % self.reporting_interval:
        info(message)
def _run_optimization(self, generate_individual,
                      evaluate_population,algorithm=None,
                      obj_function=None,
                      weights=None, levers=None,
                      pop_size=None, reporting_interval=None,
                      nr_of_generations=None, crossover_rate=None,
                      mutation_rate=None,
                      caching=False,
                      **kwargs):
    '''
    Helper function that runs the actual optimization.

    The algorithm is instantiated, stored as ``self.algorithm``, and asked
    for a population once per generation.

    :param generate_individual: helper function for generating an
                                individual
    :param evaluate_population: helper function for evaluating the
                                population
    :param algorithm: the class (or factory) of the algorithm to run; it
                      is called with the arguments of this method
    :param obj_function: the objective function
    :param weights: the weights on the outcomes
    :param levers: a dictionary with param keys as keys, and as values
                   info used in mutation.
    :param pop_size: the size of the population
    :param reporting_interval: the interval for reporting progress, passed
                               on to perform_experiments
    :param nr_of_generations: number of generations for which the GA will
                              be run
    :param crossover_rate: the crossover rate of the GA
    :param mutation_rate: the mutation rate of the GA
    :param caching: passed on to the algorithm
    :param kwargs: any additional keyword arguments are passed on to the
                   algorithm
    :returns: tuple of the stats callback of the algorithm and the
              population of the last generation. The population is None
              if nr_of_generations is 0.

    '''
    self.algorithm = algorithm(weights, levers, generate_individual,
                               obj_function, pop_size, evaluate_population,
                               nr_of_generations, crossover_rate,
                               mutation_rate, reporting_interval, self,
                               caching, **kwargs)

    # defined up front, so that zero generations does not end in an
    # UnboundLocalError at the return
    pop = None

    # Begin the generational process
    for _ in range(nr_of_generations):
        pop = self.algorithm.get_population()

    info("-- End of (successful) evolution --")
    return self.algorithm.stats_callback, pop
def prim_hdr(prims, threshold, threshold_type):
    '''
    Highest density region for PRIM boxes.

    Boxes whose mean meets the threshold are kept. All other boxes are
    merged into a single dump box, which is appended as the last element.

    :param prims: list of prim objects, each with x, y and y_mean.
    :param threshold: the threshold on the mean of a box.
    :param threshold_type: 1 or -1; both the mean and the threshold are
                           multiplied by it before they are compared.
    :returns: list with the boxes that meet the threshold, followed by the
              dump box.
    :raises ValueError: if every box meets the threshold, so that there is
                        nothing to build the dump box from.

    '''
    n = sum(entry.y.shape[0] for entry in prims)
    info("number of items in boxes: %s" %n)

    final_list = []
    dump_entries = []
    for entry in prims:
        if entry.y_mean*threshold_type >= threshold*threshold_type:
            final_list.append(entry)
        else:
            dump_entries.append(entry)

    if not dump_entries:
        raise ValueError("no box below the threshold, "
                         "cannot build the dump box")

    # merge the data of all dumped boxes. The first one is taken as is;
    # comparing an array with ``== None`` would be an element wise
    # comparison, so no None placeholder is used.
    x_temp = dump_entries[0].x
    y_temp = dump_entries[0].y
    for entry in dump_entries[1:]:
        x_temp = np.append(x_temp, entry.x, axis=0)
        y_temp = np.append(y_temp, entry.y, axis=0)

    # presumably the last argument of Prim is the mass of the box, i.e.
    # the fraction of all items that is in it (TODO confirm); float()
    # avoids the integer division of python 2
    mass = float(y_temp.shape[0])/n
    dump_box = Prim(x_temp, y_temp, make_box(x_temp), mass)

    final_list.append(dump_box)

    return final_list
def _is_restricted(box, dump_box, name):
    '''Return whether box restricts uncertainty name compared to dump_box.'''
    minimum = box[name][0]
    maximum = box[name][1]

    if box.dtype.fields.get(name)[0] == 'object':
        # fields of type object are compared on their first entry only
        reference = dump_box[name][0]
        if len(reference) != len(minimum):
            return True
        return not np.all(np.equal(reference, minimum))

    return (minimum > dump_box[name][0]) or (maximum < dump_box[name][1])


def __filter(boxes, uncertainties=None):
    '''
    Determine which uncertainties are restricted by at least one box.

    The last entry of boxes is the dump box, which serves as reference:
    an uncertainty is restricted by a box if the limits of the box differ
    from the limits of the dump box.

    :param boxes: sequence of structured arrays with, per uncertainty, the
                  lower limit in the first and the upper limit in the
                  second row; the last one is the dump box.
    :param uncertainties: the names of the uncertainties to consider. If
                          None or empty, all the fields of the dump box
                          are considered.
    :returns: list with the names of the restricted uncertainties, in the
              order in which they were considered.

    '''
    dump_box = boxes[-1]
    boxes = boxes[0:-1]

    if uncertainties:
        uv = uncertainties
    else:
        uv = [entry[0] for entry in dump_box.dtype.descr]

    names = [name for name in uv
             if any(_is_restricted(box, dump_box, name) for box in boxes)]

    # report what is left out, but only if something is left out
    unrestricted = sorted(set(uv) - set(names))
    if unrestricted:
        info(", ".join(unrestricted) +
             " are not visualized because they are not restricted")

    return names
def construct_features(data, filterSlope, tHoldSlope, filterCurvature,
                       tHoldCurvature, addMidExtension, addEndExtension):
    '''
    Construct a feature vector for each of the data series contained in
    data.

    Every row of data is passed, together with the other arguments, to
    ``construct_feature_vector``.

    :returns: list with one feature vector per data series, in the order
              of the rows of data.

    '''
    info("calculating features")

    # TODO, keeping the features in a list might be removed at some stage,
    # it will lead to a speed up, for you can vectorize the calculations
    # that use the feature vector
    return [construct_feature_vector(data[row, :], filterSlope, tHoldSlope,
                                     filterCurvature, tHoldCurvature,
                                     addMidExtension, addEndExtension)
            for row in range(data.shape[0])]
def test_save_results():
    '''
    Save results with a 1d, a 2d, a 3d and a very large outcome, and
    remove the file again after each save.
    '''
    # shape of the outcome and the message to log after a successful save;
    # the first entry of the shape is the number of experiments
    scenarios = [((10000, 1), '1d saved successfully'),
                 ((10000, 100), '2d saved successfully'),
                 ((10000, 100, 10), '3d saved successfully'),
                 ((500000, 100), 'extremely long saved successfully')]

    for outcome_shape, message in scenarios:
        nr_experiments = outcome_shape[0]
        experiments = np.recarray((nr_experiments,),
                                  dtype=[('x', float), ('y', float)])
        outcome_a = np.random.rand(*outcome_shape)

        results = (experiments, {'a': outcome_a})
        save_results(results, r'../data/test.tar.gz')
        os.remove('../data/test.tar.gz')
        ema_logging.info(message)
def distance_triangle(data):
    '''
    Pairwise triangle distances between data series.

    The triangle distance is described as follows. Let ds1(.) and ds2(.)
    be two data series of length N. Then;

    A equals to the summation of ds1(i).ds2(i) from i=1 to N
    B equals to the square-root of the (summation ds1(i)^2 from i=1 to N)
    C equals to the square-root of the (summation ds2(i)^2 from i=1 to N)

    distance_triangle = A/(B.C)

    NOTE(review): the previous version of this text used ds1 for C as
    well, presumably a typo -- confirm against ``trdist``, which does the
    actual calculation.

    The triangle distance works only with data series of the same length.

    In the literature, it is claimed that the triangle distance can deal
    with noise and amplitude scaling very well, and may yield poor results
    in cases of offset translation and linear drift.

    :param data: array with one data series per row.
    :returns: tuple (dRow, runLogs). dRow holds one distance for every
              pair (i, j) with i < j, ordered by i and then by j. runLogs
              holds for every series a tuple of a description dict (with
              key 'Index') and the series itself.

    '''
    info("calculating distances")

    nr_series = data.shape[0]
    # one slot for each unordered pair of series
    distances = np.zeros(shape=(nr_series * (nr_series - 1) // 2, ))
    logs = []

    slot = 0
    for i in range(nr_series):
        # the log entry keeps the index of the run for post-clustering
        # analysis, next to the data series itself
        logs.append(({'Index': str(i)}, data[i]))

        for j in range(i + 1, nr_series):
            distances[slot] = trdist(data[i], data[j])
            slot += 1

    return distances, logs
def test_load_results():
    '''
    Save results with a 1d and with a 3d outcome, load them again and
    check that the loaded outcome is close to what was saved.

    The file is removed again, also if loading fails.
    '''
    # TODO: test for 2d and for nd

    file_name = r'../data/test.tar.gz'

    # shape of the outcome and the message to log on success; the first
    # entry of the shape is the number of experiments
    scenarios = [((10000, 1), '1d loaded successfully'),
                 ((1000, 100, 10), '3d loaded successfully')]

    for outcome_shape, message in scenarios:
        nr_experiments = outcome_shape[0]
        experiments = np.recarray((nr_experiments,),
                                  dtype=[('x', float), ('y', float)])
        outcome_a = np.random.rand(*outcome_shape)

        save_results((experiments, {'a': outcome_a}), file_name)
        try:
            _, outcomes = load_results(file_name)
        finally:
            os.remove(file_name)

        # fail instead of silently skipping the log message, otherwise
        # this test can never fail on a bad round trip
        assert np.allclose(outcomes['a'], outcome_a), \
            'loaded outcome differs from the saved outcome'
        ema_logging.info(message)
def determine_time_dimension(outcomes):
    '''
    Helper function for determining the time dimension of the outcomes.

    If there is an outcome called TIME, its first row is used as time and
    the outcome is removed from outcomes. Otherwise, the time runs from 0
    up to the length of the first outcome with two dimensions.

    :param outcomes: dict mapping outcome names to arrays; TIME is
                     removed from it in place.
    :returns: tuple (time, outcomes). time is None if there is neither a
              TIME outcome nor an outcome with two dimensions.

    '''
    try:
        time = outcomes['TIME']
    except KeyError:
        time = None
        for value in outcomes.values():
            if len(value.shape)==2:
                time = np.arange(0, value.shape[1])
                break
        # identity check: time can be an array by now, and comparing an
        # array with ``== None`` is an element wise comparison
        if time is None:
            info("no time dimension found in results")
    else:
        time = time[0, :]
        outcomes.pop('TIME')

    return time, outcomes
def log_stats(self, gen):
    '''
    Log statistics on the progress of the evolution.

    The minimum, maximum, mean and standard deviation of the hall of fame
    are logged as a table with one column per objective.

    :param gen: the generation, shown in the header of the message.

    '''
    functions = {"min":self.minima,
                 "max":self.maxima,
                 "std":self.std,
                 "mean":self.mean,}

    hof = self.__get_hof_in_array()
    info_message = pd.DataFrame(index=['min', 'max', 'mean', 'std'],
                                columns=['obj_{}'.format(i) for i in
                                         range(hof.shape[1])])

    # items() instead of iteritems(), so this runs on python 2 and 3
    for key, value in functions.items():
        info_message.loc[key] = value(hof)

    # let pandas do the formatting for us. to_string does not append the
    # info on the size of the DataFrame, so there is nothing to strip;
    # blindly stripping the last two lines would remove rows of the table
    # whenever pandas leaves that info out.
    message = info_message.to_string()

    line = "\ngeneration {}\n{}".format(gen, message)
    ema_logging.info(line)
def distance_euclidian(data):
    '''
    Pairwise Euclidian distances between data series.

    The Euclidian distance between two data series of length N is the
    square root of the sum over i = 1..N of (ds1(i) - ds2(i))^2, i.e. the
    distance between two N-dimensional vectors.

    :param data: array with one data series per row.
    :returns: tuple (dRow, runLogs). dRow holds one distance for every
              pair (i, j) with i < j, ordered by i and then by j. runLogs
              holds for every series a tuple of a description dict (with
              key 'Index') and the series itself.

    '''
    info("calculating distances")

    nr_series = data.shape[0]
    # one slot for each unordered pair of series
    distances = np.zeros(shape=(nr_series * (nr_series - 1) // 2, ))
    logs = []

    slot = 0
    for i in range(nr_series):
        # the log entry keeps the index of the run for post-clustering
        # analysis, next to the data series itself
        logs.append(({'Index': str(i)}, data[i]))

        for j in range(i + 1, nr_series):
            distances[slot] = eucldist(data[i], data[j])
            slot += 1

    return distances, logs
def distance_mse(data):
    '''
    Pairwise MSE (mean squared-error) distances between data series.

    The MSE distance is the SSE distance divided by the number of data
    points in the data series. The SSE distance between two data series
    of length N is the sum over i = 1..N of (ds1(i) - ds2(i))^2; the MSE
    distance is that sum divided by N.

    As the SSE distance, the MSE distance only works with data series of
    equal length.

    :param data: array with one data series per row.
    :returns: tuple (dRow, runLogs). dRow holds one distance for every
              pair (i, j) with i < j, ordered by i and then by j. runLogs
              holds for every series a tuple of a description dict (with
              key 'Index') and the series itself.

    '''
    info("calculating distances")

    nr_series = data.shape[0]
    # one slot for each unordered pair of series
    distances = np.zeros(shape=(nr_series * (nr_series - 1) // 2, ))
    logs = []

    slot = 0
    for i in range(nr_series):
        # the log entry keeps the index of the run for post-clustering
        # analysis, next to the data series itself
        logs.append(({'Index': str(i)}, data[i]))

        for j in range(i + 1, nr_series):
            distances[slot] = msedist(data[i], data[j])
            slot += 1

    return distances, logs
def distance_gonenc(data,
                    sisterCount=50,
                    wSlopeError=1,
                    wCurvatureError=1,
                    filterSlope=True,
                    tHoldSlope = 0.1,
                    filterCurvature=True,
                    tHoldCurvature=0.1,
                    addMidExtension=True,
                    addEndExtension=True
                    ):
    '''
    The distance measures the proximity of data series in terms of their
    qualitative pattern features. In other words, it quantifies the
    proximity between two different dynamic behaviour modes.

    It is designed to work mainly on non-stationary data. Its current
    version does not perform well in catching the proximity of two
    cyclic/repetitive patterns with different number of cycles (e.g.
    oscillation with 4 cycle versus oscillation with 6 cycles).

    :param data: array with one data series per row.
    :param sisterCount: Number of long-versions that will be created for
                        the short vector while comparing two data series
                        with unequal feature vector lengths.
    :param wSlopeError: Weight of the error between the 1st dimensions of
                        the two feature vectors (i.e. Slope).
    :param wCurvatureError: Weight of the error between the 2nd
                            dimensions of the two feature vectors (i.e.
                            Curvature).
    :param filterSlope: Whether the slope vectors should be filtered for
                        minor fluctuations, or not.
    :param tHoldSlope: The threshold value to be used in filtering out
                       fluctuations in the slope.
    :param filterCurvature: Whether the curvature vectors should be
                            filtered for minor fluctuations, or not.
    :param tHoldCurvature: The threshold value to be used in filtering out
                           fluctuations in the curvature.
    :param addMidExtension: Whether the feature vectors should be extended
                            by introducing transition sections along the
                            vector.
    :param addEndExtension: Whether the feature vectors should be extended
                            by introducing startup/closing sections at the
                            beginning/end of the vector.
    :returns: tuple (dRow, runLogs). dRow holds one distance for every
              pair (i, j) with i < j, ordered by i and then by j. runLogs
              holds for every series a tuple of a description dict (with
              keys 'Index' and 'Feature vector') and the series itself.

    '''
    # the feature vectors for all the time series contained in data
    features = construct_features(data, filterSlope, tHoldSlope,
                                  filterCurvature, tHoldCurvature,
                                  addMidExtension, addEndExtension)

    info("calculating distances")

    nr_series = data.shape[0]
    # one slot for each unordered pair of series
    distances = np.zeros(shape=(nr_series * (nr_series - 1) // 2, ))
    logs = []

    slot = 0
    for i in range(nr_series):
        feature_i = features[i]

        # the log entry keeps key information for post-clustering
        # analysis, next to the data series itself
        description = {'Index': str(i),
                       'Feature vector': str(feature_i)}
        logs.append((description, data[i]))

        for j in range(i + 1, nr_series):
            feature_j = features[j]

            # feature vectors of unequal length need the sister based
            # comparison
            if feature_i.shape[1] != feature_j.shape[1]:
                distance = distance_different_lenght(feature_i,
                                                     feature_j,
                                                     wSlopeError,
                                                     wCurvatureError,
                                                     sisterCount)
            else:
                distance = distance_same_length(feature_i,
                                                feature_j,
                                                wSlopeError,
                                                wCurvatureError)

            distances[slot] = distance
            slot += 1

    return distances, logs
def perform_loop_knockout():
    '''
    Run the loop knockout analysis for a fixed set of runs and save one
    figure per behaviour.

    The results are loaded from ``base.cPickle``. For a hard-coded list of
    runs, the behaviour of 'relative market price' and the longest atomic
    behaviour interval are determined. For every behaviour, the model is
    re-run once per entry in ``unique_edges`` through ``run_interval``
    (presumably switching off that edge during the interval -- see
    ``run_interval`` for the exact semantics). Every returned series that
    has the same shape as the original behaviour is plotted in blue, the
    original behaviour in red, and the interval is marked in light grey.

    Each figure is written to the current working directory as
    ``'switched unique edges only<beh_no>'`` and closed afterwards.

    The function takes no arguments and returns nothing.

    '''
    unique_edges = [['In Goods', 'lost'],
                    ['loss unprofitable extraction capacity', 'decommissioning extraction capacity'],
                    ['production', 'In Goods'],
                    ['production', 'lost'],
                    ['production', 'Supply'],
                    ['Real Annual Demand', 'substitution losses'],
                    ['Real Annual Demand', 'price elasticity of demand losses'],
                    ['Real Annual Demand', 'desired extraction capacity'],
                    ['Real Annual Demand', 'economic demand growth'],
                    ['average recycling cost', 'relative market price'],
                    ['recycling fraction', 'lost'],
                    ['commissioning recycling capacity', 'Recycling Capacity Under Construction'],
                    ['maximum amount recyclable', 'recycling fraction'],
                    ['profitability recycling', 'planned recycling capacity'],
                    ['relative market price', 'price elasticity of demand losses'],
                    ['constrained desired recycling capacity', 'gap between desired and constrained recycling capacity'],
                    ['profitability extraction', 'planned extraction capacity'],
                    ['commissioning extraction capacity', 'Extraction Capacity Under Construction'],
                    ['desired recycling', 'gap between desired and constrained recycling capacity'],
                    ['Installed Recycling Capacity', 'decommissioning recycling capacity'],
                    ['Installed Recycling Capacity', 'loss unprofitable recycling capacity'],
                    ['average extraction costs', 'profitability extraction'],
                    ['average extraction costs', 'relative attractiveness recycling']]

    # NOTE: these consecutive edges are currently NOT appended to
    # unique_edges, so they are not part of the analysis. Appending them
    # after indCons has been determined would make the loop below use the
    # 'Consecutive' model for them.
    unique_cons_edges = [['recycling', 'recycling'],
                         ['recycling', 'supply demand ratio'],
                         ['decommissioning recycling capacity', 'recycling fraction'],
                         ['returns to scale', 'relative attractiveness recycling'],
                         ['shortage price effect', 'relative price last year'],
                         ['shortage price effect', 'profitability extraction'],
                         ['loss unprofitable extraction capacity', 'loss unprofitable extraction capacity'],
                         ['production', 'recycling fraction'],
                         ['production', 'constrained desired recycling capacity'],
                         ['production', 'new cumulatively recycled'],
                         ['production', 'effective fraction recycled of supplied'],
                         ['loss unprofitable recycling capacity', 'recycling fraction'],
                         ['average recycling cost', 'loss unprofitable recycling capacity'],
                         ['recycling fraction', 'new cumulatively recycled'],
                         ['substitution losses', 'supply demand ratio'],
                         ['Installed Extraction Capacity', 'Extraction Capacity Under Construction'],
                         ['Installed Extraction Capacity', 'commissioning extraction capacity'],
                         ['Installed Recycling Capacity', 'Recycling Capacity Under Construction'],
                         ['Installed Recycling Capacity', 'commissioning recycling capacity'],
                         ['average extraction costs', 'profitability extraction']]

    # Load the previously generated ensemble.
    ema_logging.log_to_stderr(ema_logging.INFO)
    results = load_results(r'base.cPickle')

    # Get the behaviours and experiment settings of the runs of interest.
    # These are indices of a number of examples that will be looked at.
    runs = [526, 781, 911, 988, 10, 780, 740, 943, 573, 991]
    VOI = 'relative market price'

    results_of_interest = experiment_settings(results, runs, VOI)
    cases_of_interest = experiments_to_cases(results_of_interest[0])
    behaviour_int = results_of_interest[1][VOI]

    # Construct the intervals of atomic behaviour patterns and keep only
    # the one of maximum length per behaviour.
    ints = intervals(behaviour_int, False)
    max_intervals = intervals_interest(ints)

    # This has to do with the model formulation of the switches/values.
    double_list = [6, 9, 11, 17, 19]

    # Loop indices above indCons refer to consecutive edges.
    indCons = len(unique_edges)

    current = os.getcwd()

    # One behaviour per selected run (assumes experiment_settings returns
    # exactly one behaviour per entry in runs -- TODO confirm).
    for beh_no in range(len(runs)):
        interval = max_intervals[beh_no]
        rmp = behaviour_int[beh_no]
        x = range(len(rmp))

        fig = plt.figure()
        try:
            ax = fig.add_subplot(111)

            vensim.be_quiet()

            for loop_index in range(1, len(unique_edges) + 1):
                # With the current loop range and edge list only the
                # 'Switches' model is ever selected; the other two
                # branches are kept for when index 0 (base run) or the
                # consecutive edges are included.
                if loop_index - indCons > 0:
                    model_location = current + r'\Models\Consecutive\Metals EMA.vpm'
                elif loop_index == 0:
                    model_location = current + r'\Models\Base\Metals EMA.vpm'
                else:
                    model_location = current + r'\Models\Switches\Metals EMA.vpm'

                serie = run_interval(model_location, loop_index,
                                     interval, VOI,
                                     unique_edges, indCons, double_list,
                                     cases_of_interest[beh_no])

                if serie.shape == rmp.shape:
                    ax.plot(x, serie, 'b')
                else:
                    # A series of a different shape cannot be plotted
                    # against x; log which switch caused it instead.
                    ema_logging.info('Loop %s created a floating point error' % (loop_index))
                    ema_logging.info('Caused by trying to switch %s' % (unique_edges[loop_index-1]))

            # Original behaviour on top, with the interval highlighted.
            ax.plot(x, rmp, 'r')
            ax.axvspan(interval[0]-1, interval[1], facecolor='lightgrey', alpha=0.5)
            f_name = 'switched unique edges only'+str(beh_no)
            # Save this specific figure rather than whatever pyplot
            # considers current.
            fig.savefig(f_name)
        finally:
            # pyplot keeps every figure alive until it is closed
            # explicitly; release it once it has been written (or on
            # failure).
            plt.close(fig)