Example #1
def iteration_step(training_data, input_coordinates, structure, C_old, U_old,
                   k, shape_of_grid, time_frame_sums, T, W, ES,
                   valid_timesteps, evaluation_dataset, edges_of_cell):
    """
    input: training_data numpy array, training dataset of measurements
           input_coordinates numpy array, coordinates for model creation
           structure list(int, list(floats), list(floats)),
                      number of non-hypertime dimensions, list of hypertime
                      radii and list of wavelengths
           C_old numpy array kxd, centres from last iteration
           U_old numpy array kxn, matrix of weights from the last iteration
           k positive integer, number of clusters
           shape_of_grid numpy array dx1 int64, number of cells in every
                                                dimension
           time_frame_sums numpy array shape_of_grid[0]x1, sum of measures
                                                            over every
                                                            timeframe
           T numpy array shape_of_grid[0]x1, time positions of timeframes
           W numpy array Lx1, sequence of reasonable frequencies
           ES float64, squared sum of squares of residues from this iteration
    output: dES float64, difference between last and new error
            structure list(int, list(floats), list(floats)),
                      number of non-hypertime dimensions, list of hypertime
                      radii and list of wavelengths
            C numpy array kxd, matrix of k d-dimensional cluster centres
            U numpy array kxn, matrix of weights
            COV numpy array kxdxd, matrix of covariance matrices
            density_integrals numpy array kx1, matrix of ratios between
                                               measurements and grid cells
                                               belonging to the clusters
            W numpy array Lx1, sequence of reasonable frequencies
            ES float64, squared sum of squares of residues from this iteration
            P float64, length of the period corresponding to the most
                       influential frequency, in default units
    uses: mdl.model_creation(), fm.chosen_period()
          np.sum()
    objective: to perform one learning iteration: rebuild the model, choose
               the most influential period, and evaluate the model against
               the evaluation dataset
    """
    #### testing the change in the "strength" of periods when adding clusters
    hist_freqs, C, U, COV, density_integrals =\
        mdl.model_creation(input_coordinates,
                           structure, training_data, C_old, U_old, k,
                           shape_of_grid)
    osy = tuple(np.arange(len(np.shape(hist_freqs)) - 1) + 1)
    time_frame_freqs = np.sum(hist_freqs, axis=osy)
    P, W, ES, sum_of_amplitudes = fm.chosen_period(T, time_frame_sums,
                                                   time_frame_freqs, W, ES,
                                                   valid_timesteps)
    diff = ev.evaluation_step(evaluation_dataset, C, COV, density_integrals,
                              structure, k, edges_of_cell)
    #### end of testing
    #print('chosen k: ' + str(k))
    #print('and the diff: ' + str(diff))
    return sum_of_amplitudes, C, U, COV, density_integrals, W,\
        ES, P, diff
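The reduction of the model histogram to per-timeframe totals (the osy / np.sum lines above) can be reproduced with a minimal, self-contained numpy sketch; the array shape used here is made up purely for illustration:

import numpy as np

# hypothetical model histogram: 4 time frames x 5 x 6 spatial cells
hist_freqs = np.random.rand(4, 5, 6)

# axis 0 is time, the remaining axes are the spatial dimensions
osy = tuple(np.arange(len(np.shape(hist_freqs)) - 1) + 1)  # -> (1, 2)

# summing over the spatial axes leaves one modelled value per time frame
time_frame_freqs = np.sum(hist_freqs, axis=osy)
print(time_frame_freqs.shape)  # (4,)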
Example #2
def best_diff(training_data, domain_coordinates, domain_values, frequencies, k,
              new_structure, params, transformation, eval_dataset):
    """
    """
    X = dio.create_X(training_data, new_structure, transformation)
    DOMAIN = dio.create_X(domain_coordinates, new_structure, transformation)

    eval_domain = (dio.create_X(eval_dataset[0], new_structure,
                                transformation), eval_dataset[1],
                   dio.create_X(eval_dataset[2], new_structure,
                                transformation), eval_dataset[3])

    list_of_others = []
    list_of_diffs = []
    list_of_differences = []

    #test_vals = []
    #test_times = []
    #for file in xrange(1, 10):
    #    test_vals.append(np.loadtxt('../data/test_data_' + str(file) + '.txt'))
    #    test_times.append(dio.create_X(np.loadtxt('../data/test_times_' + str(file) + '.txt').reshape(-1,1), new_structure, transformation))

    # repeated in case the clustering fails; TRY TO DO IT ONLY ONCE BECAUSE OF THE NEW INITIALIZATION !!!
    for j in xrange(21):
        diff_j, C_j, densities_j, COV_j, difference_j, heights_j =\
            iteration_step(DOMAIN, domain_values, X, k, new_structure, params, eval_domain)
        list_of_diffs.append(diff_j)
        list_of_others.append((diff_j, C_j, densities_j, COV_j, k, heights_j))
        list_of_differences.append(difference_j)

        #predictions = []
        #for q in xrange(9):
        #    out = es.training_model(test_times[q], C_j, densities_j, COV_j, k, params, new_structure, heights_j)
        #    vals = test_vals[q]
        #    predictions.append(np.mean((out - vals) ** 2))
        #
        #print(diff_j, fm.chosen_period(domain_coordinates[:, 0], difference_j, frequencies)[1], tuple(predictions))
    list_of_diffs = np.array(list_of_diffs)
    # find the index of the run whose difference to the training data is the median
    chosen_model = np.where(list_of_diffs == np.median(list_of_diffs))[0][0]
    the_period, tested_sum_of_amplitudes = fm.chosen_period(
        domain_coordinates[:, 0], list_of_differences[chosen_model],
        frequencies)  # tested_sum_of_amplitudes not used in this version
    diff, C, densities, COV, k, heights = list_of_others[chosen_model]
    return diff, C, densities, COV, the_period, k, heights
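The selection rule in best_diff (run the clustering several times and keep the run whose training-data difference equals the median of all runs) can be shown in isolation; pick_median_run below is a hypothetical helper, and the fabricated diffs stand in for the 21 clustering attempts:

import numpy as np

def pick_median_run(runs):
    # runs: list of (diff, model) pairs from repeated clustering attempts
    diffs = np.array([r[0] for r in runs])
    # with an odd number of runs, the median is always one of the elements
    chosen = np.where(diffs == np.median(diffs))[0][0]
    return runs[chosen]

runs = [(d, 'model_%i' % i) for i, d in enumerate(np.random.rand(21))]
print(pick_median_run(runs))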
def whole_initialization(training_data, k, edges_of_cell, longest, shortest,
                         training_dataset):
    """
    input: training_data numpy array, training dataset of measurements
           k positive integer, number of clusters
           edges_of_cell list of floats, time and spatial edges of a grid cell
                                         in default units (seconds, meters)
           longest float, length of the longest wanted period in default
                          units
           shortest float, length of the shortest wanted period
                           in default units
    output: input_coordinates numpy array, coordinates for model creation
            overall_sum number (np.float64 or np.int64), sum of all measures
            structure list(int, list(floats), list(floats)),
                      number of non-hypertime dimensions, list of hypertime
                      radii and list of wavelengths
            C numpy array kxd, matrix of k d-dimensional cluster centres
            U numpy array kxn, matrix of weights
            shape_of_grid numpy array dx1 int64, number of cells in every
                                                 dimension
            time_frame_sums numpy array shape_of_grid[0]x1, sum of measures
                                                            over every
                                                            timeframe
            T numpy array shape_of_grid[0]x1, time positions of timeframes
            W numpy array Lx1, sequence of reasonable frequencies
            ES float64, squared sum of squares of residues from this iteration
            COV numpy array kxdxd, matrix of covariance matrices
            density_integrals numpy array kx1, matrix of ratios between
                                               measurements and grid cells
                                               belonging to the clusters
    uses: first_structure(), mdl.model_creation(), grid.time_space_positions(),
          first_time_frame_freqs(), fm.build_frequencies(), fm.chosen_period()
    objective: to perform first iteration step and to initialize variables
    """
    print('starting learning iteration: 0 (initialization)')
    structure = first_structure(training_data)
    input_coordinates, time_frame_sums, overall_sum, shape_of_grid, T,\
        valid_timesteps = grid.time_space_positions(edges_of_cell,
                                                    training_data,
                                                    training_dataset)
    if len(shape_of_grid[0]) == 1:
        hist_freqs = -1
        C = -1
        U = -1
        COV = -1
        density_integrals = -1
    else:
        hist_freqs, C, U, COV, density_integrals =\
            mdl.model_creation(input_coordinates, structure, training_data,
                               0, 0,  # C_in and U_in
                               k, shape_of_grid)
    time_frame_freqs = first_time_frame_freqs(overall_sum, shape_of_grid[0])
    W = fm.build_frequencies(longest, shortest)
    ES = -1  # no previous error
    P, W, ES, dES = fm.chosen_period(T, time_frame_sums, time_frame_freqs[0],
                                     W, ES, valid_timesteps)
    print('used structure: ' + str(structure))
    print('leaving learning iteration: 0 (initialization)')
    return input_coordinates, overall_sum, structure, C,\
        U, shape_of_grid, time_frame_sums, T, W, ES, P, COV,\
        density_integrals, valid_timesteps
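fm.chosen_period itself is not shown on this page; a self-contained sketch of the frequency-selection idea it appears to implement (pick, from the candidate frequencies W, the one whose spectral amplitude over the residual between measured and modelled time-frame sums is largest, and report its period) could look as follows, although the actual computation inside fm may differ:

import numpy as np

def chosen_period_sketch(T, residual, W):
    # complex amplitude of each candidate frequency over the residual signal
    amplitudes = [np.abs(np.mean(residual * np.exp(-2j * np.pi * w * T)))
                  for w in W]
    best = int(np.argmax(amplitudes))
    return 1.0 / W[best]  # period of the most influential frequency

# toy residual with a daily periodicity, time in seconds
T = np.arange(0.0, 7 * 86400.0, 600.0)
residual = np.sin(2 * np.pi * T / 86400.0)
W = 1.0 / np.array([86400.0, 43200.0, 3600.0])  # 1 day, 12 h, 1 h
print(chosen_period_sketch(T, residual, W))  # ~86400.0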
Example #4
def proposed_method(domain_coordinates, domain_values, training_data,
                    eval_dataset, params, evaluation):
    """
    input: longest float, length of the longest wanted period in default
                          units
           shortest float, length of the shortest wanted period
                           in default units
           dataset numpy array, columns: time, vector of measurements, 0/1
                                (occurrence of event)
           edge_of_square float, spatial edge of cell in default units (meters)
           timestep float, time edge of cell in default units (seconds)
           k positive integer, number of clusters
           radius float, size of radius of the first found hypertime circle
           number_of_periods int, max number of added hypertime circles
           evaluation boolean, stop learning when the error starts to grow?
    output: C numpy array kxd, matrix of k d-dimensional cluster centres
            COV numpy array kxdxd, matrix of covariance matrices
            density_integrals numpy array kx1, matrix of ratios between
                                               measurements and grid cells
                                               belonging to the clusters
            structure list(int, list(floats), list(floats)),
                      number of non-hypertime dimensions, list of hypertime
                      radii and list of wavelengths
            average TODO (to be completed)
    uses: time.clock()
          init.whole_initialization(), iteration_step()
    objective: to learn model parameters
    """
    if evaluation[0] == False:
        # for the future, to know the structure of evaluation
        edges_of_cell = evaluation[1]
        edges_of_big_cell = evaluation[2]
        transformation = evaluation[3]
        max_number_of_periods = evaluation[4]  # not used here
        longest, shortest = evaluation[5]  # not used here
        structure = evaluation[6]  # not used for evaluation[0] = True
        k = evaluation[7]  # not used for evaluation[0] = True

        X = dio.create_X(training_data, structure, transformation)
        DOMAIN = dio.create_X(domain_coordinates, structure, transformation)
        eval_domain = (dio.create_X(eval_dataset[0], structure,
                                    transformation), eval_dataset[1],
                       dio.create_X(eval_dataset[2], structure,
                                    transformation), eval_dataset[3])
        diff, C, densities, COV, difference, heights = iteration_step(
            DOMAIN, domain_values, X, k, structure, params, eval_domain)
        #C, U = cl.iteration(X, k, structure, params)
        #densities, COV = ca.body(DOMAIN, X, C, U, k, params, structure)
    else:
        edges_of_cell = evaluation[1]
        edges_of_big_cell = evaluation[2]
        transformation = evaluation[3]
        max_number_of_periods = evaluation[4]
        longest, shortest = evaluation[5]
        k = evaluation[7]
        # initialization
        frequencies = it.build_frequencies(longest, shortest)
        structure = it.first_structure(training_data)
        if structure[0] == 0 and structure[1] == []:
            # there is nothing to cluster, we have to create new structure with one 'circle' before clustering
            average = np.mean(domain_values)  # mean of the measured values
            #C = np.array([average])
            #COV = C/10
            #densities = np.array([[average]])
            #k = 1
            #chosen_period(T, S, W)
            the_period = fm.chosen_period(domain_coordinates[:, 0],
                                          domain_values - average,
                                          frequencies)[0]
            #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
            structure[1].append(bs.radius(the_period, structure))
            structure[2].append(the_period)
            WW = list(frequencies)
            #print(1/the_period)
            WW.remove(1 / the_period)  # P
            frequencies = np.array(WW)
            print('nothing to cluster, periodicity ' + str(the_period) +
                  ' chosen and the corresponding frequency removed')
        # create model
        diff, C, densities, COV, the_period, k, heights = best_diff(
            training_data, domain_coordinates, domain_values, frequencies, k,
            structure, params, transformation, eval_dataset)
        jump_out = 0
        iteration = 0
        #diff = -1
        while jump_out == 0:
            #print('\nstarting learning iteration: ' + str(iteration))
            #print('with number of clusters: ' + str(k))
            #print('and the structure: ' + str(structure))
            iteration += 1
            start = clock()
            jump_out, diff, C, densities, COV, the_period, structure, \
                frequencies, k, heights = step_evaluation(
                    diff, C, densities, COV, the_period, structure,
                    frequencies, training_data, domain_coordinates,
                    domain_values, transformation, k, params, heights,
                    eval_dataset)
            finish = clock()
            print('structure: ' + str(structure) + ', number of clusters: ' +
                  str(k) + ', and difference to training data: ' + str(diff))
            #print('leaving learning iteration: ' + str(iteration))
            #print('processor time: ' + str(finish - start))
            if len(structure[1]) >= max_number_of_periods:
                jump_out = 1
        #print('learning iterations finished')
    #return C, densities, COV, k, params, structure  # I will have to reconsider this ordering later ... why am I returning params????
    return C, densities, COV, k, structure, heights
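The bookkeeping performed above when a new periodicity is accepted (extend structure with a new hypertime circle and drop the chosen frequency from the candidate list) is easy to isolate; the radius value below is a placeholder, since bs.radius() is not shown on this page:

import numpy as np

structure = [2, [], []]  # [non-hypertime dims, radii, wavelengths]
frequencies = 1.0 / np.array([86400.0, 43200.0, 3600.0])
the_period = 86400.0  # periodicity chosen by the frequency selection

structure[1].append(1.0)         # radius of the new hypertime circle (placeholder)
structure[2].append(the_period)  # its wavelength
WW = list(frequencies)
WW.remove(1.0 / the_period)      # the chosen frequency is no longer a candidate
frequencies = np.array(WW)
print(structure, frequencies)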