def setup_a_column(self, i, column_name):
    """ Points the experiment configuration at one column and prepares its folders.

    :param i: index of the column being processed.
    :param column_name: name of the column; spaces are removed before it is stored.
    """
    if self.print:
        print_to_file('column # ' + str(i) + ' ' + column_name)

    # Sets parameters for folders of each column
    config = self.model.experiment_config
    config.current_column = i
    config.current_column_name = column_name.replace(" ", "")
    # Layout: <main_experiments_folder>/<dataset_name>/<column name without spaces>
    config.current_experiment_folder = '/'.join([
        config.main_experiments_folder,
        config.dataset_name,
        config.current_column_name,
    ])

    # The start-over flag handed to create_folders is True only for the first column
    create_folders(self.model, i == 0)
def g_cols(self, w_j_z):
    """ Accumulates the per-column gradient contributions over every column of every data frame.

    Per the original author's note, this is the gradient vector df/dw (= df/dz * dz/dw) of the
    objective function f, and -g_j is returned because f is minimized rather than maximized.

    :param w_j_z: parameter values passed on to set_all_probabilities_z before the gradient is evaluated.
    :return: the sum of g_col_marginals over all columns, or None when there is no column at all.
    """
    print_to_file('g_cols is called')
    started_at = time.time()

    # updates the parameters
    runner, _ = self.set_all_probabilities_z(self.current_runner, w_j_z)

    # generates probabilities
    self.all_probs = runner.generate_machine_probabilities(self.unique_vals)

    q_total = None
    is_first = True
    for frame_idx, (data_frame, labels) in enumerate(zip(self.data_frames, self.labels)):
        frame_key = str(frame_idx)
        for col_idx, column_name in enumerate(list(data_frame.columns)):
            if is_first:
                # The first contribution seeds the accumulator
                q_total = self.g_col_marginals(runner, frame_key, str(column_name), labels[col_idx] - 1)
                is_first = False
            else:
                # Later contributions are added in place onto the first result
                q_total += self.g_col_marginals(runner, frame_key, str(column_name), labels[col_idx] - 1)

    # Logs the wall-clock time spent in this call
    print_to_file(str(time.time() - started_at))

    return q_total
def run_inference(self, _data_frame, _print=False, _prediction_path=None, _save=False):
    """ Runs ptype for each column in a dataframe.
    The outputs are stored in dictionaries (see store_outputs).
    When _save is set, the predicted column types are handed to write_type_predictions_2_csv.

    :param _data_frame: the dataframe to analyse; forwarded to set_data.
    :param _print: stored on self.print; enables progress messages.
    :param _prediction_path: stored on self.prediction_path.
    :param _save: when truthy, the values of self.predicted_types are exported once all columns are processed.
    """
    self.set_data(_data_frame=_data_frame)
    self.print = _print
    self.prediction_path = _prediction_path

    config = self.model.experiment_config
    if self.print:
        print_to_file('processing ' + config.dataset_name)

    # Normalizing the parameters to make sure they're probabilities
    self.normalize_params()

    # Hands every distinct value of the data to the runner up front
    # (per the original note: used to build a mask of which words each PFSM supports, as an optimization).
    distinct_values = np.unique(self.model.data.values)
    self.PFSMRunner.update_values(distinct_values)

    # Iterate over a snapshot of the column names
    for column_name in list(config.column_names):
        # Calculates the probabilities
        probabilities, counts = self.generate_probs_a_column(column_name)

        # Runs inference for a column
        self.run_inference_on_model(probabilities, counts)

        # Stores types, both cols types and rows types
        self.store_outputs(column_name)

    # Export column types, and missing data
    if not _save:
        return
    self.write_type_predictions_2_csv(list(self.predicted_types.values()))
def run_inference_on_model(self, probs, counts):
    """ Delegates inference for one column to the underlying model.

    :param probs: probabilities forwarded unchanged to self.model.run_inference.
    :param counts: counts forwarded unchanged to self.model.run_inference.
    :return: None; whatever the model call returns is not propagated.
    """
    verbose = self.print
    if verbose:
        print_to_file('\tinference is running...')

    model = self.model
    model.run_inference(probs, counts)
def train_machines_multiple_dfs_new(self, _labels, _experiment_output_name='demo', _max_iter=20, _prediction_path=None,
                                    _print=False, _test_data=None, _test_labels=None, _uniformly=False):
    """ Train the PFSMs given a set of dataframes and their labels

    The dataframes are taken from self.data_frames. A snapshot of the runner is saved before training and
    after every iteration, and the list of training errors is saved at the end.

    :param _labels: column types labeled by hand, where _label[i][j] denotes the type of j^th column in i^th dataframe.
    :param _experiment_output_name: prefix of every file name written during training.
    :param _max_iter: the maximum number of iterations the optimization algorithm runs as long as it's not converged.
    :param _prediction_path: stored on self.prediction_path; not otherwise used by this method.
    :param _print: stored on self.print.
    :param _test_data: not used by this method.
    :param _test_labels: not used by this method.
    :param _uniformly: a binary variable used to initialize the PFSMs - True allows initializing uniformly rather than using hand-crafted values.
    :raises ValueError: if self.data_frames is empty.
    :return: None
    """
    self.print = _print
    self.prediction_path = _prediction_path
    self.experiment_output_name = _experiment_output_name

    if _uniformly:
        self.initialize_params_uniformly()

    # Normalizing the parameters before training starts
    self.normalize_params()

    # Changing column names (spaces are removed)
    self.data_frames = [data_frame.rename(columns=lambda n: str(n).replace(' ', ''))
                        for data_frame in self.data_frames]
    self.model.data_frames = self.data_frames

    # Without this check an empty list would surface later as an opaque UnboundLocalError
    if not self.data_frames:
        raise ValueError('at least one data frame is required for training')

    # Find the unique values of each data frame once and chain them together.
    # NOTE: values are de-duplicated per data frame only, not across data frames.
    per_frame_unique = [np.unique(df.values) for df in self.model.data_frames]
    unique_vals = per_frame_unique[0]
    for frame_unique in per_frame_unique[1:]:
        unique_vals = np.concatenate((unique_vals, frame_unique))
    self.model.unique_vals = unique_vals
    self.PFSMRunner.set_unique_values(unique_vals)

    # Finding unique values and their counts, per data frame and per column
    self.model.dfs_unique_vals_counts = {}
    for i, df in enumerate(self.data_frames):
        df_unique_vals_counts = {}
        for column_name in list(df.columns):
            # Entries are compared by their string form; counts is already a 1-D array aligned with temp_x
            temp_x, counts = np.unique([str(element) for element in df[column_name].tolist()],
                                       return_counts=True)
            df_unique_vals_counts[column_name] = [temp_x, counts]
        self.model.dfs_unique_vals_counts[str(i)] = df_unique_vals_counts

    # Setting
    self.model.labels = _labels
    self.model.types = self.types
    self.model.J = len(self.PFSMRunner.machines)  # J: num of data types including missing and anomaly.
    self.model.K = self.model.J - 2  # K: num of possible column data types (excluding missing and anomaly)
    self.model.pi = [self.model.PI for _ in range(self.model.K)]  # mixture weights of row types
    self.model.current_runner = self.PFSMRunner

    training_error = [self.calculate_error_df(self.data_frames, _labels)]

    save_object(self.PFSMRunner, self.experiment_output_name + '_training_runner_initial.pkl')
    print(training_error)

    log_filename = self.experiment_output_name + '_output.txt'

    # Iterates over whole data points
    for it in range(_max_iter):
        print_to_file('iteration = ' + str(it), filename=log_filename)

        # Trains machines using all of the training data frames
        self.PFSMRunner = self.train_all_models_multiple_dfs(self.PFSMRunner)
        self.model.current_runner = self.PFSMRunner

        # Calculate training error at each iteration
        training_error.append(self.calculate_error_df(self.data_frames, _labels))
        print(training_error)

        # From the second iteration on, stop once the error improves by less than 1e-2
        # (an increase in error also satisfies this condition and stops the loop).
        converged = it > 0 and (training_error[-2] - training_error[-1] < 1e-2)
        if converged:
            print_to_file('converged!', filename=log_filename)

        # The runner of every iteration is saved, including the one that converged
        save_object(self.PFSMRunner, self.experiment_output_name + '_training_runner' + str(it) + '.pkl')
        if converged:
            break

    save_object(training_error, self.experiment_output_name + '_training_error.pkl')
def train_all_models_multiple_dfs(self, runner):
    """ Delegates one round of training over all data frames to the underlying model.

    :param runner: the runner forwarded unchanged to self.model.train_all_z_multiple_dfs_new.
    :return: whatever self.model.train_all_z_multiple_dfs_new returns for that runner.
    """
    verbose = self.print
    if verbose:
        print_to_file('\ttraining is running...')

    updated_runner = self.model.train_all_z_multiple_dfs_new(runner)
    return updated_runner