예제 #1
0
    def setup_a_column(self, i, column_name):
        """Make column ``i`` the active column of the experiment config.

        Stores the column index, its name with spaces removed, and the
        per-column experiment folder on ``self.model.experiment_config``,
        then calls ``create_folders``.

        :param i: index of the column (0 for the first column processed).
        :param column_name: name of the column as it appears in the data.
        """
        if self.print:
            print_to_file('column # ' + str(i) + ' ' + column_name)

        config = self.model.experiment_config

        # Record which column is active and where its outputs belong:
        # <main_experiments_folder>/<dataset_name>/<column name without spaces>
        config.current_column = i
        config.current_column_name = column_name.replace(" ", "")
        config.current_experiment_folder = '/'.join([
            config.main_experiments_folder,
            config.dataset_name,
            config.current_column_name,
        ])

        # The second argument (the "start over report" flag) is true only
        # for the first column.
        create_folders(self.model, i == 0)
예제 #2
0
파일: Model.py 프로젝트: wrattler/wrattler
    def g_cols(self, w_j_z):
        """Sum the per-column gradient terms over every column of every frame.

        The parameters ``w_j_z`` are loaded into the current runner, the
        machine probabilities are regenerated from ``self.unique_vals``, and
        the values returned by ``g_col_marginals`` are accumulated.

        Per the original author's note, the result is the gradient vector
        df/dw (= df/dz * dz/dw) of the objective function f, returned negated
        because the objective is minimised rather than maximised.

        :param w_j_z: parameter values handed to ``set_all_probabilities_z``.
        :return: the accumulated gradient, or ``None`` when there are no
            columns to process.
        """
        print_to_file('g_cols is called')
        time_init = time.time()

        # Load the new parameters into the runner; the second value returned
        # by set_all_probabilities_z is not needed here.
        runner, _ = self.set_all_probabilities_z(self.current_runner, w_j_z)

        self.all_probs = runner.generate_machine_probabilities(
            self.unique_vals)

        q_total = None
        for i, (data_frame, labels) in enumerate(
                zip(self.data_frames, self.labels)):
            for j, column_name in enumerate(data_frame.columns):
                # NOTE(review): labels[j] - 1 presumably turns a 1-based
                # label into a 0-based type index -- confirm with the
                # labelling code.
                q_col = self.g_col_marginals(runner, str(i),
                                             str(column_name),
                                             labels[j] - 1)
                if q_total is None:
                    q_total = q_col
                else:
                    # In-place accumulation, as in the original code.
                    q_total += q_col

        # Log the elapsed wall-clock time of this call.
        print_to_file(str(time.time() - time_init))

        return q_total
예제 #3
0
    def run_inference(self, _data_frame, _print=False, _prediction_path=None, _save=False):
        """Run ptype on every column of a dataframe.

        For each configured column the probabilities are generated, inference
        is run, and the results are kept via ``store_outputs``. When ``_save``
        is true the predicted column types are additionally handed to
        ``write_type_predictions_2_csv``.

        :param _data_frame: the data to analyse; forwarded to ``set_data``.
        :param _print: when true, progress messages go to ``print_to_file``.
        :param _prediction_path: stored on ``self.prediction_path``.
        :param _save: when true, export the predicted column types at the end.
        """
        self.set_data(_data_frame=_data_frame)

        self.print = _print
        self.prediction_path = _prediction_path

        if self.print:
            print_to_file('processing ' + self.model.experiment_config.dataset_name)

        # Make sure the parameters are proper probabilities.
        self.normalize_params()

        # Let the runner know every distinct value in the data up front
        # (per the original note: used to build a binary mask of which words
        # each PFSM supports, purely as an optimisation).
        distinct_values = np.unique(self.model.data.values)
        self.PFSMRunner.update_values(distinct_values)

        # Iterate over a snapshot of the configured column names.
        for name in list(self.model.experiment_config.column_names):
            column_probs, column_counts = self.generate_probs_a_column(name)
            self.run_inference_on_model(column_probs, column_counts)

            # Keep both the column-level and row-level types.
            self.store_outputs(name)

        if not _save:
            return

        # Export the column types (and missing data).
        self.write_type_predictions_2_csv(list(self.predicted_types.values()))
예제 #4
0
 def run_inference_on_model(self, probs, counts):
     """Delegate inference for one column to ``self.model``.

     :param probs: probabilities forwarded to ``self.model.run_inference``.
     :param counts: counts forwarded alongside ``probs``.
     """
     verbose = self.print
     if verbose:
         print_to_file('\tinference is running...')

     model = self.model
     model.run_inference(probs, counts)
예제 #5
0
    def train_machines_multiple_dfs_new(self, _labels, _experiment_output_name='demo', _max_iter=20, _prediction_path=None, _print=False, _test_data=None, _test_labels=None, _uniformly=False):
        """Train the PFSMs given a set of dataframes and their labels.

        Training operates on ``self.data_frames``. The runner is pickled
        before training and after every iteration, and the list of training
        errors is pickled at the end; every output file name is prefixed with
        ``_experiment_output_name``.

        :param _labels: column types labeled by hand, where _labels[i][j] denotes the type of j^th column in i^th dataframe.
        :param _experiment_output_name: prefix of the log and pickle files written during training.
        :param _max_iter: the maximum number of iterations the optimization algorithm runs as long as it's not converged.
        :param _prediction_path: stored on ``self.prediction_path``.
        :param _print: stored on ``self.print``.
        :param _test_data: not used by this method.
        :param _test_labels: not used by this method.
        :param _uniformly: a binary variable used to initialize the PFSMs - True allows initializing uniformly rather than using hand-crafted values.
        :raises ValueError: if ``self.data_frames`` is empty.
        :return: None
        """
        self.print = _print
        self.prediction_path = _prediction_path
        self.experiment_output_name = _experiment_output_name
        output_prefix = self.experiment_output_name

        if _uniformly:
            self.initialize_params_uniformly()

        # Make sure the parameters are normalized before training starts.
        self.normalize_params()

        # Strip spaces from the column names of every frame.
        self.data_frames = [data_frame.rename(columns=lambda n: str(n).replace(' ', '')) for data_frame in self.data_frames]
        if not self.data_frames:
            # Without this guard the code below fails later with an opaque
            # UnboundLocalError on ``unique_vals``.
            raise ValueError('at least one data frame is required for training')
        self.model.data_frames = self.data_frames

        # Unique values of each frame, concatenated frame by frame. A value
        # occurring in several frames therefore appears once per frame.
        unique_vals = None
        for df in self.model.data_frames:
            frame_vals = np.unique(df.values)
            if unique_vals is None:
                unique_vals = frame_vals
            else:
                unique_vals = np.concatenate((unique_vals, frame_vals))
        self.model.unique_vals = unique_vals

        self.PFSMRunner.set_unique_values(unique_vals)

        # Per frame and per column: the unique stringified values and how
        # often each occurs. np.unique already returns the counts as a 1-D
        # array aligned with the values, so no reshaping is needed.
        self.model.dfs_unique_vals_counts = {}
        for i, df in enumerate(self.data_frames):
            df_unique_vals_counts = {}
            for column_name in df.columns:
                values, counts = np.unique(
                    [str(element) for element in df[column_name].tolist()],
                    return_counts=True)
                df_unique_vals_counts[column_name] = [values, counts]
            self.model.dfs_unique_vals_counts[str(i)] = df_unique_vals_counts

        # Setting
        self.model.labels = _labels
        self.model.types = self.types
        self.model.J = len(self.PFSMRunner.machines)  # J: num of data types including missing and anomaly.
        self.model.K = self.model.J - 2  # K: num of possible column data types (excluding missing and anomaly)
        self.model.pi = [self.model.PI for _ in range(self.model.K)]  # mixture weights of row types
        self.model.current_runner = self.PFSMRunner

        # Error before any training, used as the baseline.
        training_error = [self.calculate_error_df(self.data_frames, _labels)]

        save_object(self.PFSMRunner, output_prefix + '_training_runner_initial.pkl')
        print(training_error)

        log_file = output_prefix + '_output.txt'
        for it in range(_max_iter):
            print_to_file('iteration = ' + str(it), filename=log_file)

            # Trains machines using all of the training data frames
            self.PFSMRunner = self.train_all_models_multiple_dfs(self.PFSMRunner)
            self.model.current_runner = self.PFSMRunner

            # Track the training error after each iteration.
            training_error.append(self.calculate_error_df(self.data_frames, _labels))
            print(training_error)

            # The check is skipped on the first iteration. Training stops as
            # soon as the error improves by less than 1e-2, which includes
            # the case where the error got worse.
            converged = it > 0 and (training_error[-2] - training_error[-1] < 1e-2)
            if converged:
                print_to_file('converged!', filename=log_file)

            # The runner of every iteration is saved, converged or not.
            save_object(self.PFSMRunner, output_prefix + '_training_runner' + str(it) + '.pkl')
            if converged:
                break
        save_object(training_error, output_prefix + '_training_error.pkl')
예제 #6
0
 def train_all_models_multiple_dfs(self, runner):
     """Delegate one round of training to ``self.model``.

     :param runner: runner forwarded to
         ``self.model.train_all_z_multiple_dfs_new``.
     :return: whatever ``train_all_z_multiple_dfs_new`` returns.
     """
     verbose = self.print
     if verbose:
         print_to_file('\ttraining is running...')

     trained_runner = self.model.train_all_z_multiple_dfs_new(runner)
     return trained_runner