Example #1
def random_forest_measure_attributes(data, classify):
    '''
    Performs feature selection using random forests in Orange. The forest is 
    grown with 100 trees; use :meth:`random_forest` if control over the number 
    of trees or attributes is needed.
    
    For more details see `orange ensemble <http://orange.biolab.si/doc/modules/orngEnsemble.htm>`_
    
    :param data: data from :meth:`perform_experiments`.
    :param classify: function for classifying runs.
    :rtype: sorted list of tuples with uncertainty names and importance values.
    
    '''
    data = build_orange_data(data, classify)
    
    #do the random forest
    #see http://orange.biolab.si/doc/modules/orngEnsemble.htm for details
    info("executing random forest for attribute selection")
    measure = orngEnsemble.MeasureAttribute_randomForests(trees=100)
    
    #calculate importance
    imps = measure.importances(data)
    
    #sort importance, using schwartzian transform
    results = [] 
    for i,imp in enumerate(imps): 
        results.append((imp, data.domain.attributes[i].name))
    results.sort(reverse=True)
    
    results = [(entry[1], entry[0]) for entry in results]
    return results
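
The classify argument expected by these Orange helpers is a callable that maps the outcomes dict returned by :meth:`perform_experiments` to one class label per run (build_orange_data below calls classify(results) and appends the result as the class variable). A minimal sketch, assuming a hypothetical outcome named 'max_P' and an arbitrary threshold of 0.8:

import numpy as np

def classify(outcomes):
    '''Hypothetical classifier: label a run 1 if the end state of the 
    outcome 'max_P' exceeds 0.8, and 0 otherwise.'''
    outcome = outcomes['max_P']                # assumed outcome name
    classes = np.zeros((outcome.shape[0],), dtype=int)
    classes[outcome[:, -1] > 0.8] = 1          # assumed threshold
    return classes

# usage sketch: rank the uncertainties by random forest importance
# results = perform_experiments(...)
# for name, importance in random_forest_measure_attributes(results, classify):
#     print name, importance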
Example #2
def distance_sse(data):
    
    '''
    The SSE (sum of squared errors) distance between two data series is the 
    sum of the squared errors between corresponding data points of the two 
    series. Let the data series be of length N; the SSE distance between ds1 
    and ds2 then equals the sum of the squared error terms from 1 to N, where 
    error_term(i) equals ds1(i)-ds2(i).
    
    Since the SSE calculation is based on a pairwise comparison of individual 
    data points, the data series should be of equal length.
    
    The SSE distance equals the square of the Euclidean distance, which is a 
    commonly used distance metric in time series comparisons.
    '''
    
    runLogs = []
    #compute the pairwise distances between all data series contained in 
    #the numpy array data
    info("calculating distances")
    dRow = np.zeros(shape=(np.sum(np.arange(data.shape[0])), ))
    index = -1
    for i in range(data.shape[0]):
            
        # For each run, a log is created
        # Log includes a description dictionary that has key information 
        # for post-clustering analysis, and the data series itself. These 
        # logs are stored in a global array named runLogs
        behaviorDesc = {}
        behaviorDesc['Index'] = str(i)
        
        behavior = data[i]
        localLog = (behaviorDesc, behavior)
        runLogs.append(localLog)
    
        for j in range(i+1, data.shape[0]):
            index += 1
            distance = ssedist(data[i],data[j]) 
            dRow[index] = distance
    return dRow, runLogs
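
The helper ssedist used above is not shown in this example; a minimal numpy sketch of the SSE formula described in the docstring, assuming two equal-length one-dimensional arrays, would be:

import numpy as np

def ssedist(ds1, ds2):
    '''Sketch: sum of squared errors between two equal-length data series.'''
    errors = ds1 - ds2
    return np.sum(errors ** 2)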
Example #3
 def determine_intersecting_uncertainties(self):
     #get the intersection of the uncertainties of the different models
     if len(self._modelStructures) > 1:
         # this seems opaque... but the reason for doing it this way is
         # that the name alone is not enough for identity. The 
         # ranges of the uncertainties should also be the same, hence
         # the identity function on the uncertainty. 
         
         uncertainties = []
         for msi in self._modelStructures:
             u = [uncertainty.identity() for uncertainty in msi.uncertainties]
             uncertainties.append(u)
         shared_uncertainties = set(uncertainties[0]).intersection(*uncertainties[1:])
         
         # determine unshared
         unshared = {}
         for i, msi in enumerate(self._modelStructures):
             un = set(uncertainties[i]) - set(shared_uncertainties)
             a = {}
             for u in msi.uncertainties:
                 a[u.name] = u
             u = [a.get(u[0]) for u in un]
             unshared[msi.name] = u 
         
         a = {}
         for u in self._modelStructures[0].uncertainties:
             a[u.name] = u
         shared_uncertainties = [a.get(u[0]) for u in shared_uncertainties]
         info("intersection contains %s uncertainties" %len(shared_uncertainties))
     else:
         shared_uncertainties = set(self._modelStructures[0].uncertainties)
         unshared = None
     
     return shared_uncertainties, unshared   
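
The set intersection above relies on identity() returning a hashable value whose first element is the uncertainty's name (see the u[0] lookups). A minimal sketch of what such a method presumably looks like, assuming the range is stored on the uncertainty as a values tuple:

def identity(self):
    '''Hypothetical identity: a tuple of name and range, so two uncertainties 
    only count as the same if both their name and their range match.'''
    return (self.name,) + tuple(self.values)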
Example #4
    def _get_population(self):
        
        if self._restart_required():
            self.called +=1
            self.last_eps_progress = 0
            new_pop = self._rebuild_population()
        
            # update selection pressure...
            self.tournament_size = int(max(2,
                                        self.selection_presure*self.pop_size))
            ema_logging.info(self.message.format(self.pop_size,
                                                 len(self.archive.items),
                                                 self.tournament_size))

            # Evaluate the individuals with an invalid fitness
            self.evaluate_population(new_pop, self.reporting_interval, 
                                     self.toolbox, self.ensemble)
    
            # Select the next generation population
            self.pop = self.toolbox.select(self.pop + new_pop, self.pop_size)
            self.stats_callback(self.pop)
            self.stats_callback.log_stats(self.called)
            
            return self.pop
        else:
            return super(epsNSGA2, self)._get_population()
Example #5
def random_forest(data, classify, nrOfTrees=100, attributes=None):
    '''
    Make a random forest using Orange.
    
    For more details see `orange ensemble <http://orange.biolab.si/doc/modules/orngEnsemble.htm>`_
    
    :param data: data from :meth:`perform_experiments`.
    :param classify: function for classifying runs.
    :param nrOfTrees: number of trees in the forest (default: 100).
    :param attributes: Number of attributes used in a randomly drawn subset 
                       when searching for best attribute to split the node in 
                       tree growing (default: None, and if kept this way, this 
                       is turned into square root of attributes in 
                       example set).
    :rtype: an orange random forest.
    
    '''
    data = build_orange_data(data, classify)
    
    #do the random forest
    #see http://orange.biolab.si/doc/modules/orngEnsemble.htm for details
    info("executing random forest")
    measure = orngEnsemble.MeasureAttribute_randomForests(trees=nrOfTrees, 
                                                        attributes=attributes)
    
    return measure
Example #6
 def _generate_cases(self, nrOfCases):
     '''
     In case of Monte Carlo and Latin Hypercube sampling, nrOfCases specifies
     the number of cases to generate.
     
     In case of full factorial sampling, it specifies the resolution on
     non-categorical uncertainties.
     
     In case of multiple model structures, the uncertainties over
     which to explore are the intersection of the sets of uncertainties of
     the model interface instances.
     
     :param nrOfCases: In case of Latin Hypercube sampling and Monte Carlo 
                       sampling, nrOfCases specifies the number of cases to
                       generate. In case of Full Factorial sampling,
                       nrOfCases specifies the resolution to use for sampling
                       continuous uncertainties.
     
     '''
     shared_uncertainties, unshared = self.determine_intersecting_uncertainties()
      
     info("generating cases")
     shared_designs = self.sampler.generate_design(shared_uncertainties, nrOfCases)
     information = shared_designs[1]
     shared_designs = shared_designs[0]
     cases = []
     for design in shared_designs:
         case = {}
         for i, name in enumerate(information):
             case[name] = design[i]
         cases.append(case)
     
     info(str(len(cases)) + " cases generated")
     
     return cases, shared_uncertainties
Example #7
def perform_prim(x,
                 y,
                 box_init = None,
                 peel_alpha = 0.05,
                 paste_alpha = 0.05,
                 mass_min = 0.05,
                 threshold = None,
                 pasting = False,
                 threshold_type = 1,
                 cases_of_interest = None,
                 obj_func = None):
    if threshold is None:
        threshold = np.mean(y)

    n = y.shape[0]
    y = y * threshold_type
   
    k_max = np.ceil(1/mass_min)
    k_max = int(k_max)
    info("max number of boxes: %s" %(k_max))
    
    if threshold_type == 1:
        Prim.t_coi = np.sum(np.abs(y[y >= threshold]))
    else:
        Prim.t_coi = np.sum(np.abs(y[y <= threshold]))
    Prim.threshold = threshold
    Prim.threshold_type = threshold_type
    
    if box_init is None:
        box_init = make_box(x)
        Prim.box_init = box_init
        box_init = Prim(x, y, box_init, 1)
    else:
        #else, identify all points in initial box, rest is discarded
        logical =  in_box(x, box_init)
        x = x[logical]
        y = y[logical]

    boxes = find_boxes(x, y, box_init, 
                       peel_alpha, paste_alpha, mass_min, 
                       threshold, 
                       pasting, 0, k_max, n, cases_of_interest, obj_func)
    
    # adjust for negative hdr  
    for box in boxes:
        box.y = threshold_type*box.y
        box.y_mean = threshold_type*box.y_mean

    # the list of found boxes has the dump box as first element
    # we need to reverse the ordering to get the correct order in which
    # the boxes have been found
    boxes.reverse()
    boxes = prim_hdr(boxes, threshold, threshold_type, Prim.box_init)
    
    return boxes
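
A hedged usage sketch: assuming experiments is the recarray and outcomes the dict returned by perform_experiments, and 'max_P' is a hypothetical outcome whose end state is of interest, perform_prim could be called as follows:

# hypothetical results file and outcome name
experiments, outcomes = load_results(r'./data/results.tar.gz')
x = experiments
y = outcomes['max_P'][:, -1]   # end state per run

# find boxes containing the cases scoring above the mean (threshold_type=1)
boxes = perform_prim(x, y, peel_alpha=0.05, mass_min=0.05, threshold_type=1)
for box in boxes:
    print box.y_mean, box.y.shape[0]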
Example #8
def construct_features(data, 
                       trendThold, 
                       crisisThold):
    info("calculating features")
    
    # construct a 3-element feature vector for each of the data series 
    # contained in the numpy array data
    
    features = np.zeros(shape=(data.shape[0], 3))
    for i in range(data.shape[0]):
        features[i,:] = construct_feature_vector(data[i, :], trendThold, crisisThold)
    return features
Example #9
def do_text_ticks_labels(ax, i, j, field1, field2, ylabels, outcomes_to_show):
    '''
    
    Helper function for correctly turning the tick labels on the axes on and off.
    
    :param ax:
    :param i:
    :param j:
    :param field1:
    :param field2:
    :param ylabels:
    :param outcomes_to_show:
    
    
    '''
    
    #text and labels
    if i == j:
        #only plot the name in the middle
        if ylabels:
            text = ylabels[field1]
        else:
            text = field1
        ax.text(0.5, 0.5, text,
                horizontalalignment='center',
                verticalalignment='center',
                transform = ax.transAxes)  
    
    # are we at the end of the row?
    if i != len(outcomes_to_show)-1:
        #xaxis off
        ax.set_xticklabels([])
    else:
        if ylabels:
            try:
                ax.set_xlabel(ylabels[field2])
            except KeyError:
                info("no label specified for "+field2)
        else:
            ax.set_xlabel(field2) 
    
    # are we at the end of the column?
    if j != 0:
        #yaxis off
        ax.set_yticklabels([])
    else:
        if ylabels:
            try:
                ax.set_ylabel(ylabels[field1])
            except KeyError:
                info("no label specified for "+field1) 
        else:
            ax.set_ylabel(field1)   
Example #10
def build_orange_data(data,classify):
    '''
    
    helper function for turning the data from :meth:`perform_experiments` into 
    a data object that can be used by the various orange functions. 
    
    For more details see `orange domain <http://orange.biolab.si/doc/reference/Domain.htm>`_  
    
    :param data: return from :meth:`perform_experiments`.
    :param classify: function to be used for determining the class for each 
                     run.
    
    '''
    info("building orange data")
    
    experiments, results = data

    #build domain
    dtypes =  []
    for entry in experiments.dtype.descr:
        dtypes.append((entry[0], experiments.dtype.fields.get(entry[0])))
    
    attributes = []
    for entry in dtypes:
        name, dtype = entry
        dtype = dtype[0].name
        if dtype == 'int' or dtype =='object':
            attribute = ENUM(name)
            [attribute.addValue(str(value)) for value in\
                                            set(experiments[name].tolist())]
        else:
            attribute = FLOAT(name, startValue = np.min(experiments[name]), 
                              endValue = np.max(experiments[name]))
        attributes.append(attribute)

    data = np.array(experiments.tolist())
        
    #determine classes
    classes = classify(results)
    classVar = ENUM('class')
    #these numbers are merely referring to the possible classes
    [classVar.addValue(str(i)) for i in set(classes.tolist())] 
    #by default the last entry in the list should be the class variable
    attributes.append(classVar) 
    domain = orange.Domain(attributes)
    
    data = np.hstack((data, classes[:, np.newaxis]))
    data = data.tolist()
    data = orange.ExampleTable(domain, data)

    return data
Example #11
def test_perform_experiments():
#    # let's make some interfaces
#    model_a = DummyInterface(None, "A")
#    model_b = DummyInterface(None, "B")
#    
#    # let's add some uncertainties to this
#    shared_ab_1 = ParameterUncertainty((0,1), "shared ab 1")
#    shared_ab_2 = ParameterUncertainty((0,10), "shared ab 1")
#    model_a.uncertainties = [shared_ab_1, shared_ab_2]
#    model_b.uncertainties = [shared_ab_1, shared_ab_2]
#    
#    ensemble = ModelEnsemble()
#    ensemble.add_model_structures([model_a, model_b])
    
    # what are all the test cases?
    # test for error in case uncertainty by same name but different 
    # in other respects

    
    # everything shared
    model_a = DummyInterface(None, "A")
    model_b = DummyInterface(None, "B")
    model_c = DummyInterface(None, "C")
    
    # let's add some uncertainties to this
    shared_abc_1 = ParameterUncertainty((0,1), "shared abc 1")
    shared_abc_2 = ParameterUncertainty((0,1), "shared abc 2")
    shared_ab_1 = ParameterUncertainty((0,1), "shared ab 1")
    shared_bc_1 = ParameterUncertainty((0,1), "shared bc 1")
    a_1 = ParameterUncertainty((0,1), "a 1")
    b_1 = ParameterUncertainty((0,1), "b 1")
    model_a.uncertainties = [shared_abc_1, shared_abc_2, shared_ab_1, a_1]
    model_b.uncertainties = [shared_abc_1, shared_abc_2, shared_ab_1, shared_bc_1, b_1]
    model_c.uncertainties = [shared_abc_1, shared_abc_2, shared_bc_1]
    
    #let's add an outcome to this
    outcome_shared = Outcome("test", time=True)
    model_a.outcomes = [outcome_shared]
    model_b.outcomes = [outcome_shared]
    model_c.outcomes = [outcome_shared]
    
    ensemble = ModelEnsemble()
    ensemble.parallel=True
    ensemble.add_model_structures([model_a, model_b, model_c])
    
    ema_logging.info('------------- union of uncertainties -------------')
    
    results = ensemble.perform_experiments(10, which_uncertainties=UNION, reporting_interval=1 )
    
    ema_logging.info('------------- intersection of uncertainties -------------')
    ensemble.perform_experiments(10, which_uncertainties=INTERSECTION, reporting_interval=1)
Example #12
def make_data_structure(clusters, distRow, runLogs):
    nr_clusters = np.max(clusters)
    cluster_list = []
    for i in range(1, nr_clusters+1):
        info("starting with cluster %s" %i)
        #determine the indices for cluster i
        indices = np.where(clusters==i)[0]
        
        drow_indices = np.zeros((indices.shape[0]**2-indices.shape[0])//2, dtype=int)
        s = 0
        #get the indices for the distance for the runs in the cluster
        for q in range(indices.shape[0]):
            for r in range(q+1, indices.shape[0]):
                b = indices[q]
                a = indices[r]
                
                drow_indices[s] = get_drow_index(indices[r],
                                                 indices[q], 
                                                 clusters.shape[0])
                s+=1
        
        #get the distance for the runs in the cluster
        dist_clust = distRow[drow_indices]
        
        #make a distance matrix
        dist_matrix = squareform(dist_clust)

        #sum across the rows
        row_sum = dist_matrix.sum(axis=0)
        
        #get the index of the result with the lowest sum of distances
        min_cIndex = row_sum.argmin()
    
        # convert this cluster specific index back to the overall cluster list 
        # of indices
        originalIndices = np.where(clusters==i)
        originalIndex = originalIndices[0][min_cIndex]

        print originalIndex

        a = list(np.where(clusters==i)[0])
        a = [int(entry) for entry in a]
        
        cluster = Cluster(i, 
                          np.where(clusters==i)[0], 
                          originalIndex,
                          [runLogs[entry] for entry in a],
                          dist_clust)
        cluster_list.append(cluster)
    return cluster_list
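
get_drow_index maps a pair of run indices to a position in the condensed distance row that the distance functions in this module fill (i outer, j in range(i+1, n) inner). The helper itself is not shown; a minimal sketch consistent with that filling order would be:

def get_drow_index(j, i, n):
    '''Sketch: index into the condensed distance row for runs i and j, 
    assuming i < j and n runs in total.'''
    assert i < j
    return i * n - (i * (i + 1)) // 2 + (j - i - 1)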
Example #13
 def log_stats(self, gen):
     functions = {"minima":self.minima,
                  "maxima":self.maxima,
                  "std":self.std,
                  "mean":self.mean,}
     kargs = {}
     hof = self.__get_hof_in_array()
     line = " ".join("{%s:<8}" % name for name in sorted(functions.keys()))
     
     for name  in sorted(functions.keys()):
         function = functions[name]
         kargs[name] = "[%s]" % ", ".join(map(self.precision.format, function(hof)))
     line = line.format(**kargs)
     line = "generation %s: " %gen + line
     ema_logging.info(line)
Example #14
def perform_prim_specific(x,
                 y,
                 box_init = None,
                 peel_alpha = 0.05,
                 paste_alpha = 0.05,
                 mass_min = 0.05,
                 threshold = None,
                 pasting = False,
                 threshold_type = 1,
                 cases_of_interest = None,
                 obj_func = None):
    if threshold is None:
        threshold = np.mean(y)
   
    k_max = np.ceil(1/mass_min)
    k_max = int(k_max)
    info("max number of boxes: %s" %(k_max))
    
    if box_init is None:
        box_init = make_box(x)
    else:
        #else, identify all points in initial box, rest is discarded
        logical =  in_box(x, box_init)
        x = x[logical]
        y = y[logical]

    n = y.shape[0]
    y = y * threshold_type
    
    boxes = find_boxes(x, y, box_init, 
                       peel_alpha, paste_alpha, mass_min, 
                       np.min(y)-0.1*np.abs(np.min(y)), 
                       pasting, 0, k_max, n, cases_of_interest, obj_func)
    
    # adjust for negative hdr  
    exps = []
    for box in boxes:
        box.y = threshold_type*box.y
        box.y_mean = threshold_type*box.y_mean
        exps.append(box.x)
    # the list of found boxes has the dump box as first element
    # we need to reverse the ordering to get the correct order in which
    # the boxes have been found
    boxes.reverse()
    exps.reverse()
    boxes = prim_hdr(boxes, threshold, threshold_type)
    
    return boxes, exps
Example #15
def filter_scalar_outcomes(outcomes):
    '''
    Helper function that removes non time series outcomes from all the 
    outcomes.
    
    :param outcomes:
    :return: the filtered outcomes
    
    
    '''
    outcomes_to_remove = []
    for key, value in outcomes.items():
        if len(value.shape) <2:
            outcomes_to_remove.append(key)
            info("%s not shown because it is not time series data" %key)
    [outcomes.pop(entry) for entry in outcomes_to_remove]
    return outcomes
Example #16
def feature_selection(data, classify, k=5, m=100):
    '''
    
    perform feature selection using orange
    
    For more details see `orange feature selection <http://orange.biolab.si/doc/modules/orngFSS.htm>`_ and
    `orange measure attribute <http://orange.biolab.si/doc/reference/MeasureAttribute.htm>`_
    
    The default measure is ReliefF (MeasureAttribute_relief in Orange).
    
    :param data: data from :meth:`perform_experiments`.
    :param classify: function for classifying runs.
    :param k: the number of neighbors for each example (default 5).
    :param m: number of examples to use; set to -1 to use all (default 100).
    :rtype: sorted list of tuples with uncertainty names and reliefF attribute 
            scores.
    
    Orange provides other metrics for feature selection
    
    * Information Gain
    * Gain ratio 
    * Gini index 
    * Relevance of attributes 
    * Costs
    
    If you want to use any of these instead of ReliefF, use the code
    supplied here as a template, but modify the measure. That is, replace::
    
        measure = orange.MeasureAttribute_relief(k=k, m=m)
        
    with the measure of choice. See the above provided links for more details.
    
    '''
    data = build_orange_data(data, classify)

    info("executing feature selection")
    measure = orange.MeasureAttribute_relief(k=k, m=m)
    ma = orngFSS.attMeasure(data, measure)
    
    results = [] 
    for m in ma:
        results.append((m[1], m[0]))
    results.sort(reverse=True)
    
    results = [(entry[1], entry[0]) for entry in results]
    return results
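
Following the template above, a variant that ranks attributes by information gain instead of ReliefF might look as follows. This is a sketch only; it assumes that Orange 2.x exposes the information gain measure as orange.MeasureAttribute_info, as described in the linked documentation:

def feature_selection_info_gain(data, classify):
    '''Sketch: identical to feature_selection, but with information gain as 
    the attribute measure (assumed to be orange.MeasureAttribute_info).'''
    data = build_orange_data(data, classify)

    info("executing feature selection")
    measure = orange.MeasureAttribute_info()
    ma = orngFSS.attMeasure(data, measure)

    results = [(score, name) for name, score in ma]
    results.sort(reverse=True)
    return [(name, score) for score, name in results]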
Example #17
 def __call__(self, case, policy, name, result):
     '''
     Method responsible for storing results. The implementation in this
     class only keeps track of how many runs have been completed and logs 
     this. 
     
     :param case: the case to be stored
     :param policy: the name of the policy being used
     :param name: the name of the model being used
     :param result: the result dict
     
     '''
     
     self.i+=1
     debug(str(self.i)+" cases completed")
     
     if self.i % self.reporting_interval == 0:
         info(str(self.i)+" cases completed")
Example #18
    def _run_optimization(self, generate_individual, 
                           evaluate_population,algorithm=None, 
                           obj_function=None,
                           weights=None, levers=None, 
                           pop_size=None, reporting_interval=None, 
                           nr_of_generations=None, crossover_rate=None, 
                           mutation_rate=None,
                           caching=False,
                           **kwargs):
        '''
        Helper function that runs the actual optimization
                
        :param generate_individual: helper function for generating an 
                                    individual
        :param evaluate_population: helper function for evaluating the 
                                    population
        :param obj_function: the objective function
        :param pop_size: the size of the population
        :param reporting_interval: the interval for reporting progress, passed
                                   on to perform_experiments
        :param weights: the weights on the outcomes
        :param nr_of_generations: number of generations for which the GA will 
                                  be run
        :param crossover_rate: the crossover rate of the GA
        :param mutation_rate: the mutation rate of the GA
        :param levers: a dictionary with param keys as keys, and as values
                       info used in mutation.
        
        '''
        self.algorithm = algorithm(weights, levers, generate_individual, obj_function, 
                          pop_size, evaluate_population, nr_of_generations, 
                          crossover_rate, mutation_rate, reporting_interval,
                          self, caching, **kwargs)

        # Begin the generational process
        for _ in range(nr_of_generations):
            pop = self.algorithm.get_population()
        info("-- End of (successful) evolution --")

        return self.algorithm.stats_callback, pop        
Example #19
def prim_hdr(prims,
             threshold,
             threshold_type):
    '''
    Highest density region for PRIM boxes
    
    :param prims: list of prim objects
    :param threshold:
    :param threshold_type:
    
    '''
    
    n = 0
    for entry in prims:
        n += entry.y.shape[0]
    info("number of items in boxes: %s" %n)
  
    boxes = [(entry.y_mean, entry) for entry in prims]
    
    final_list = []
    dump_entries = []
    for entry in boxes:
        if entry[0]*threshold_type >= threshold*threshold_type:
            final_list.append(entry[1])
        else:
            dump_entries.append(entry[1])

    x_temp = None
    for entry in dump_entries: 
        if x_temp is None:
            x_temp = entry.x
            y_temp = entry.y
        else:
            x_temp = np.append(x_temp, entry.x, axis=0) 
            y_temp = np.append(y_temp, entry.y, axis=0)


    dump_box = Prim(x_temp, y_temp, make_box(x_temp), 
                        y_temp.shape[0]/float(n))
        
    final_list.append(dump_box)

    return final_list
Example #20
def __filter(boxes, uncertainties=[]):
    dump_box=boxes[-1]
    boxes=boxes[0:-1]
    
    #iterate over uncertainties
    names = []

    if uncertainties:
        uv=uncertainties
    else:
        uv = [entry[0] for entry in dump_box.dtype.descr]

    for name in uv:
        
        #determine whether to show
        for box in boxes:
            minimum = box[name][0]
            maximum = box[name][1]
            value = box.dtype.fields.get(name)[0]
            if value == 'object':
                a = dump_box[name][0]
                
                if len(a) != len(minimum):
                    ans = False
                else:
                    ans = np.all(np.equal(a, minimum))
                if not ans:
                    names.append(name)
                    break
            elif (minimum > dump_box[name][0]) or\
                 (maximum < dump_box[name][1]):
                names.append(name)
                break
    a = set(uv) -set(names)
    a = list(a)
    a.sort()
    string_list = ", ".join(a)
    
    info(string_list + " are not not visualized because they are not restricted")
    
    uv = names
    return uv
Example #21
def construct_features(data, filterSlope, tHoldSlope, filterCurvature, 
                       tHoldCurvature, addMidExtension, addEndExtension):
    '''
    Constructs a feature vector for each of the data-series contained in the 
    data. 
    
    '''
    info("calculating features")
    
    # TODO, the casting of each feature to a list of tuples might be 
    # removed at some stage, it will lead to a speed up, for you 
    # can vectorize the calculations that use the feature vector
    features = []
    for i in range(data.shape[0]):
        feature = construct_feature_vector(data[i, :], filterSlope, tHoldSlope, 
                                     filterCurvature, tHoldCurvature, 
                                     addMidExtension, addEndExtension)
#        feature =  [tuple(feature[0,:]),tuple(feature[1,:])]
        features.append(feature)
    return features
Example #22
def test_save_results():
    # test for 1d
    # test for 2d
    # test for 3d
    # test for very large
    
    nr_experiments = 10000
    experiments = np.recarray((nr_experiments,),
                           dtype=[('x', float), ('y', float)])
    outcome_a = np.random.rand(nr_experiments,1)
    
    results = (experiments, {'a': outcome_a})
    
    save_results(results, r'../data/test.tar.gz')
    os.remove('../data/test.tar.gz')
    ema_logging.info('1d saved successfully')
    
    nr_experiments = 10000
    nr_timesteps = 100
    experiments = np.recarray((nr_experiments,),
                           dtype=[('x', float), ('y', float)])
    outcome_a = np.random.rand(nr_experiments,nr_timesteps)
    
    results = (experiments, {'a': outcome_a})
    save_results(results, r'../data/test.tar.gz')
    os.remove('../data/test.tar.gz')
    ema_logging.info('2d saved successfully')
 
 
    nr_experiments = 10000
    nr_timesteps = 100
    nr_replications = 10
    experiments = np.recarray((nr_experiments,),
                           dtype=[('x', float), ('y', float)])
    outcome_a = np.random.rand(nr_experiments,nr_timesteps,nr_replications)
     
    results = (experiments, {'a': outcome_a})
    save_results(results, r'../data/test.tar.gz')
    os.remove('../data/test.tar.gz')
    ema_logging.info('3d saved successfully')
    
    nr_experiments = 500000
    nr_timesteps = 100
    experiments = np.recarray((nr_experiments,),
                           dtype=[('x', float), ('y', float)])
    outcome_a = np.random.rand(nr_experiments,nr_timesteps)
    
    results = (experiments, {'a': outcome_a})
    save_results(results, r'../data/test.tar.gz')
    os.remove('../data/test.tar.gz')
    ema_logging.info('extremely long saved successfully')
Example #23
def distance_triangle(data):
    '''
    The triangle distance is calculated as follows:
        Let ds1(.) and ds2(.) be two data series of length N. Then:
        A equals the summation of ds1(i).ds2(i) from i=1 to N
        B equals the square root of the summation of ds1(i)^2 from i=1 to N
        C equals the square root of the summation of ds2(i)^2 from i=1 to N
        
        distance_triangle = A/(B.C)
    
    The triangle distance works only with data series of the same length.
    
    In the literature, it is claimed that the triangle distance deals with 
    noise and amplitude scaling very well, but may yield poor results in 
    cases of offset translation and linear drift.
    '''
    
    
    
    runLogs = []
    #compute the pairwise distances between all data series contained in 
    #the numpy array data
    info("calculating distances")
    dRow = np.zeros(shape=(np.sum(np.arange(data.shape[0])), ))
    index = -1
    for i in range(data.shape[0]):
            
        # For each run, a log is created
        # Log includes a description dictionary that has key information 
        # for post-clustering analysis, and the data series itself. These 
        # logs are stored in a global array named runLogs
        behaviorDesc = {}
        behaviorDesc['Index'] = str(i)
        
        behavior = data[i]
        localLog = (behaviorDesc, behavior)
        runLogs.append(localLog)
    
        for j in range(i+1, data.shape[0]):
            index += 1
            distance = trdist(data[i],data[j]) 
            dRow[index] = distance
    return dRow, runLogs
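
The helper trdist used above is not shown here; a minimal numpy sketch of the A/(B.C) formula described in the docstring, assuming two equal-length one-dimensional arrays (the actual helper may differ in detail), would be:

import numpy as np

def trdist(ds1, ds2):
    '''Sketch of the triangle distance A/(B.C) described above.'''
    a = np.sum(ds1 * ds2)
    b = np.sqrt(np.sum(ds1 ** 2))
    c = np.sqrt(np.sum(ds2 ** 2))
    return a / (b * c)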
Example #24
def test_load_results():
    # test for 1d
    # test for 2d
    # test for 3d
    # test for nd

    nr_experiments = 10000
    experiments = np.recarray((nr_experiments,),
                           dtype=[('x', float), ('y', float)])
    outcome_a = np.random.rand(nr_experiments,1)
    
    results = (experiments, {'a': outcome_a})
    
    save_results(results, r'../data/test.tar.gz')
    experiments, outcomes  = load_results(r'../data/test.tar.gz')
    
    logical = np.allclose(outcomes['a'],outcome_a)
    
    os.remove('../data/test.tar.gz')
    
    if logical:
        ema_logging.info('1d loaded successfully')
    
    nr_experiments = 1000
    nr_timesteps = 100
    nr_replications = 10
    experiments = np.recarray((nr_experiments,),
                           dtype=[('x', float), ('y', float)])
    outcome_a = np.random.rand(nr_experiments,nr_timesteps,nr_replications)
     
    results = (experiments, {'a': outcome_a})
    save_results(results, r'../data/test.tar.gz')
    experiments, outcomes = load_results(r'../data/test.tar.gz')
    
    logical = np.allclose(outcomes['a'],outcome_a)
    
    os.remove('../data/test.tar.gz')
    
    if logical:
        ema_logging.info('3d loaded successfully')
Example #25
def determine_time_dimension(outcomes):
    '''
    
    :param outcomes:
    
    
    '''

    time = None
    try:
        time = outcomes['TIME']
        time = time[0, :]
        outcomes.pop('TIME')
    except KeyError:
        values = iter(outcomes.values())
        for value in values:
            if len(value.shape)==2:
                time =  np.arange(0, value.shape[1])
                break
    if time is None:
        info("no time dimension found in results")
    return time, outcomes    
Example #26
    def log_stats(self, gen):
        '''Log statistics on the progress of the evolution'''
        
        functions = {"min":self.minima,
                     "max":self.maxima,
                     "std":self.std,
                     "mean":self.mean,}

        hof = self.__get_hof_in_array()
        info_message = pd.DataFrame(index=['min', 'max', 'mean', 'std'],
                                    columns=['obj_{}'.format(i) for i in
                                             range(hof.shape[1])])
        for key, value in functions.iteritems():
            data = value(hof)
            info_message.loc[key] = data
            
        # let pandas do the formatting for us, but remove the trailing info
        # on the size of the DataFrame
        message = info_message.__str__()
        message = message.split('\n')[0:-2]
        message = "\n".join(message)
        line = "\ngeneration {}\n{}".format(gen,message)
        ema_logging.info(line)
Example #27
def distance_euclidian(data):
    
    '''
    The Euclidean distance equals the square root of the sum of squared 
    differences between corresponding dimensions of two N-dimensional vectors 
    (i.e. two data series of length N).
    Let the data series be of length N; the Euclidean distance between ds1 
    and ds2 then equals sqrt(the sum of the squared error terms from 1 to N), 
    where error_term(i) equals ds1(i)-ds2(i).
    
    '''
    
    
    
    
    runLogs = []
    #compute the pairwise distances between all data series contained in 
    #the numpy array data
    info("calculating distances")
    dRow = np.zeros(shape=(np.sum(np.arange(data.shape[0])), ))
    index = -1
    for i in range(data.shape[0]):
            
        # For each run, a log is created
        # Log includes a description dictionary that has key information 
        # for post-clustering analysis, and the data series itself. These 
        # logs are stored in a global array named runLogs
        behaviorDesc = {}
        behaviorDesc['Index'] = str(i)
        
        behavior = data[i]
        localLog = (behaviorDesc, behavior)
        runLogs.append(localLog)
    
        for j in range(i+1, data.shape[0]):
            index += 1
            distance = eucldist(data[i],data[j]) 
            dRow[index] = distance
    return dRow, runLogs
Example #28
def distance_mse(data):
    '''
    The MSE (mean squared error) distance equals the SSE distance divided by 
    the number of data points in the data series.
    
    The SSE distance between two data series is the sum of the squared errors 
    between corresponding data points of the two series. Let the data series 
    be of length N; the SSE distance between ds1 and ds2 then equals the sum 
    of the squared error terms from 1 to N, where error_term(i) equals 
    ds1(i)-ds2(i).
    
    Given that the SSE is calculated as described above, the MSE equals the 
    SSE divided by N.
    
    Like the SSE distance, the MSE distance only works with data series of 
    equal length.
    '''
    
    runLogs = []
    #compute the pairwise distances between all data series contained in 
    #the numpy array data
    info("calculating distances")
    dRow = np.zeros(shape=(np.sum(np.arange(data.shape[0])), ))
    index = -1
    for i in range(data.shape[0]):
            
        # For each run, a log is created
        # Log includes a description dictionary that has key information 
        # for post-clustering analysis, and the data series itself. These 
        # logs are stored in a global array named runLogs
        behaviorDesc = {}
        behaviorDesc['Index'] = str(i)
        
        behavior = data[i]
        localLog = (behaviorDesc, behavior)
        runLogs.append(localLog)
    
        for j in range(i+1, data.shape[0]):
            index += 1
            distance = msedist(data[i],data[j]) 
            dRow[index] = distance
    return dRow, runLogs
Example #29
def distance_gonenc(data,
                    sisterCount=50, 
                    wSlopeError=1, 
                    wCurvatureError=1,
                    filterSlope=True,
                    tHoldSlope = 0.1,
                    filterCurvature=True,
                    tHoldCurvature=0.1,
                    addMidExtension=True,
                    addEndExtension=True
                    ):
    
    '''
    The distance measures the proximity of data series in terms of their 
    qualitative pattern features. In other words, it quantifies the proximity 
    between two different dynamic behaviour modes.
    
    It is designed to work mainly on non-stationary data. Its current version 
    does not perform well in catching the proximity of two cyclic/repetitive 
    patterns with different numbers of cycles (e.g. an oscillation with 4 
    cycles versus an oscillation with 6 cycles).
    
    :param sisterCount: Number of long-versions that will be created for the 
                        short vector while comparing two data series with 
                        unequal feature vector lengths. 
    :param wSlopeError: Weight of the error between the 1st dimensions of the 
                        two feature vectors (i.e. Slope).
    :param wCurvatureError: Weight of the error between the 2nd dimensions of 
                            the two feature vectors (i.e. Curvature).
    :param filterSlope: Whether the slope vectors should be filtered for minor 
                         fluctuations, or not.
    :param tHoldSlope: The threshold value to be used in filtering out 
                       fluctuations in the slope.
    :param filterCurvature: Whether the curvature vectors should be filtered 
                             for minor fluctuations, or not.
    :param tHoldCurvature: The threshold value to be used in filtering out 
                           fluctuations in the curvature.
    :param addMidExtension: Whether the feature vectors should be extended by 
                            introducing transition sections along the vector
    :param addEndExtension: Whether the feature vectors should be extended by 
                            introducing startup/closing sections at the 
                            beginning/end of the vector.
    '''
    
    
    runLogs = []
    #Generates the feature vectors for all the time series that are contained 
    # in numpy array data
    features = construct_features(data, filterSlope, tHoldSlope, 
                                  filterCurvature, tHoldCurvature, 
                                  addMidExtension, addEndExtension)
    info("calculating distances")
    dRow = np.zeros(shape=(np.sum(np.arange(data.shape[0])), ))
    index = -1
    for i in range(data.shape[0]):
        feature_i = features[i]
            
        # For each run, a log is created
        # Log includes a description dictionary that has key information 
        # for post-clustering analysis, and the data series itself. These 
        # logs are stored in a global array named runLogs
        behaviorDesc = {}
        behaviorDesc['Index'] = str(i)
        
        #this may not work due to data type mismatch
        featVector = feature_i
        
        behaviorDesc['Feature vector'] = str(featVector)
        behavior = data[i]
        localLog = (behaviorDesc, behavior)
        runLogs.append(localLog)
    
        for j in range(i+1, data.shape[0]):
            index += 1
            feature_j = features[j]
            if feature_i.shape[1] == feature_j.shape[1]:
                distance = distance_same_length(feature_i, feature_j, 
                                                wSlopeError, wCurvatureError)
    
            else:
                distance = distance_different_lenght(feature_i, 
                                                     feature_j, 
                                                     wSlopeError, 
                                                     wCurvatureError, 
                                                     sisterCount)
            dRow[index] = distance
    return dRow, runLogs
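
A hedged usage sketch, assuming data is a two-dimensional numpy array with one time series per row, as in the other distance functions in this module:

import numpy as np

# hypothetical input: 5 runs of 100 time steps each
data = np.random.rand(5, 100)

dRow, runLogs = distance_gonenc(data,
                                sisterCount=50,
                                wSlopeError=1,
                                wCurvatureError=1)

# dRow is a condensed distance row with one entry per pair of runs
assert dRow.shape[0] == np.sum(np.arange(data.shape[0]))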
Example #30
def perform_loop_knockout():    
    unique_edges = [['In Goods', 'lost'],
                    ['loss unprofitable extraction capacity', 'decommissioning extraction capacity'],
                    ['production', 'In Goods'],
                    ['production', 'lost'],
                    ['production', 'Supply'],
                    ['Real Annual Demand', 'substitution losses'],
                    ['Real Annual Demand', 'price elasticity of demand losses'],
                    ['Real Annual Demand', 'desired extraction capacity'],
                    ['Real Annual Demand', 'economic demand growth'],
                    ['average recycling cost', 'relative market price'],
                    ['recycling fraction', 'lost'],
                    ['commissioning recycling capacity', 'Recycling Capacity Under Construction'],
                    ['maximum amount recyclable', 'recycling fraction'],
                    ['profitability recycling', 'planned recycling capacity'],
                    ['relative market price', 'price elasticity of demand losses'],
                    ['constrained desired recycling capacity', 'gap between desired and constrained recycling capacity'],
                    ['profitability extraction', 'planned extraction capacity'],
                    ['commissioning extraction capacity', 'Extraction Capacity Under Construction'],
                    ['desired recycling', 'gap between desired and constrained recycling capacity'],
                    ['Installed Recycling Capacity', 'decommissioning recycling capacity'],
                    ['Installed Recycling Capacity', 'loss unprofitable recycling capacity'],
                    ['average extraction costs', 'profitability extraction'],
                    ['average extraction costs', 'relative attractiveness recycling']]
    
    unique_cons_edges = [['recycling', 'recycling'],
                           ['recycling', 'supply demand ratio'],
                           ['decommissioning recycling capacity', 'recycling fraction'],
                           ['returns to scale', 'relative attractiveness recycling'],
                           ['shortage price effect', 'relative price last year'],
                           ['shortage price effect', 'profitability extraction'],
                           ['loss unprofitable extraction capacity', 'loss unprofitable extraction capacity'],
                           ['production', 'recycling fraction'],
                           ['production', 'constrained desired recycling capacity'],
                           ['production', 'new cumulatively recycled'],
                           ['production', 'effective fraction recycled of supplied'],
                           ['loss unprofitable recycling capacity', 'recycling fraction'],
                           ['average recycling cost', 'loss unprofitable recycling capacity'],
                           ['recycling fraction', 'new cumulatively recycled'],
                           ['substitution losses', 'supply demand ratio'],
                           ['Installed Extraction Capacity', 'Extraction Capacity Under Construction'],
                           ['Installed Extraction Capacity', 'commissioning extraction capacity'],
                           ['Installed Recycling Capacity', 'Recycling Capacity Under Construction'],
                           ['Installed Recycling Capacity', 'commissioning recycling capacity'],
                           ['average extraction costs', 'profitability extraction']]
    
#    CONSTRUCTING THE ENSEMBLE AND SAVING THE RESULTS
    ema_logging.log_to_stderr(ema_logging.INFO)
    results = load_results(r'base.cPickle')

#    GETTING OUT THOSE BEHAVIOURS AND EXPERIMENT SETTINGS
#    Indices of a number of examples, these will be looked at.
    runs = [526,781,911,988,10,780,740,943,573,991]
    VOI = 'relative market price'
    
    results_of_interest = experiment_settings(results,runs,VOI)
    cases_of_interest = experiments_to_cases(results_of_interest[0])
    behaviour_int = results_of_interest[1][VOI]
    
#    CONSTRUCTING INTERVALS OF ATOMIC BEHAVIOUR PATTERNS
    ints = intervals(behaviour_int,False)

#    GETTING OUT ONLY THOSE OF MAXIMUM LENGTH PER BEHAVIOUR
    max_intervals = intervals_interest(ints)
    
#    THIS HAS TO DO WITH THE MODEL FORMULATION OF THE SWITCHES/VALUES
    double_list = [6,9,11,17,19]
    
    indCons = len(unique_edges)
#    for elem in unique_cons_edges:
#        unique_edges.append(elem)
    
    current = os.getcwd()

    for beh_no in range(0,10):
#        beh_no = 0 # Varies between 0 and 9, index style.
        interval = max_intervals[beh_no]
    
        rmp = behaviour_int[beh_no]
    #    rmp = rmp[interval[0]:interval[1]]
        x = range(0,len(rmp))
        fig = plt.figure()
        ax = fig.add_subplot(111)
    
        vensim.be_quiet()
    #    for loop_index in range(7,8):
        for loop_index in range(1,len(unique_edges)+1):
    
            if loop_index-indCons > 0:
                model_location = current + r'\Models\Consecutive\Metals EMA.vpm'
            elif loop_index == 0:
                model_location = current + r'\Models\Base\Metals EMA.vpm'
            else:
                model_location = current + r'\Models\Switches\Metals EMA.vpm'
        
            serie = run_interval(model_location,loop_index,
                                  interval,'relative market price',
                                  unique_edges,indCons,double_list,
                                  cases_of_interest[beh_no])
            
            if serie.shape != rmp.shape:
                ema_logging.info('Loop %s created a floating point error' % (loop_index))
                ema_logging.info('Caused by trying to switch %s' % (unique_edges[loop_index-1]))
                
            if serie.shape == rmp.shape:
                ax.plot(x,serie,'b')
                
    #        data = np.zeros(rmp.shape[0])
    #        data[0:serie.shape[0]] = serie
    #        ax.plot(x,data,'b')
      
        ax.plot(x,rmp,'r')
        ax.axvspan(interval[0]-1,interval[1], facecolor='lightgrey', alpha=0.5)
        f_name = 'switched unique edges only'+str(beh_no)
        plt.savefig(f_name)