def __init__(self, env, name="session"):
    """
    Set up a Holoclean session: logging verbosity, RNG seeds, then the
    dataset and the engines that operate on it.

    :param env: Holoclean environment; the 'verbose' and 'seed' entries
        are read here.
    :param name: Name for the Holoclean session
    """
    # Verbose mode switches both the root and the gensim logger to DEBUG.
    if env['verbose']:
        for verbose_logger in (root_logger, gensim_logger):
            verbose_logger.setLevel(logging.DEBUG)

    logging.debug('initiating session with parameters: %s', env)

    # Seed every random number generator with the same configured value.
    seed = env['seed']
    random.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed=seed)

    # Session members; every engine receives the same Dataset instance.
    self.name, self.env = name, env
    dataset = Dataset(name, env)
    self.ds = dataset
    self.dc_parser = Parser(env, dataset)
    self.domain_engine = DomainEngine(env, dataset)
    self.detect_engine = DetectEngine(env, dataset)
    self.repair_engine = RepairEngine(env, dataset)
    self.eval_engine = EvalEngine(env, dataset)
def __init__(self, env, name="session"):
    """
    Set up a Holoclean session: build the dataset and its engines, then
    set the log levels of the root and 'gensim' loggers.

    :param env: Holoclean environment; the 'verbose' entry selects the
        log levels.
    :param name: Name for the Holoclean session
    """
    # Session members; every engine receives the same Dataset instance.
    self.name, self.env = name, env
    dataset = Dataset(name, env)
    self.ds = dataset
    self.dc_parser = Parser(env, dataset)
    self.domain_engine = DomainEngine(env, dataset)
    self.detect_engine = DetectEngine(env, dataset)
    self.repair_engine = RepairEngine(env, dataset)
    self.eval_engine = EvalEngine(env, dataset)

    # Verbose -> DEBUG everywhere; otherwise INFO for root, WARNING for gensim.
    loggers = (logging.getLogger(), logging.getLogger('gensim'))
    verbose = self.env['verbose']
    levels = (
        logging.DEBUG if verbose else logging.INFO,
        logging.DEBUG if verbose else logging.WARNING,
    )
    for target_logger, level in zip(loggers, levels):
        target_logger.setLevel(level)
def __init__(self, env, name="session"):
    """
    Set up a Holoclean session by creating the dataset and the engines
    that operate on it.

    :param env: Holoclean environment
    :param name: Name for the Holoclean session
    """
    self.name, self.env = name, env

    # One Dataset instance is shared by the parser and all engines.
    dataset = Dataset(name, env)
    self.ds = dataset
    self.dc_parser = Parser(env, dataset)
    self.domain_engine = DomainEngine(env, dataset)
    self.detect_engine = DetectEngine(env, dataset)
    self.repair_engine = RepairEngine(env, dataset)
    self.eval_engine = EvalEngine(env, dataset)
class Session:
    """
    Session class controls the entire pipeline of HC
    """

    def __init__(self, env, name="session"):
        """
        Constructor for Holoclean session

        :param env: Holoclean environment; the 'verbose' and 'seed' entries
            are read here.
        :param name: Name for the Holoclean session
        """
        # use DEBUG logging level if verbose enabled
        if env['verbose']:
            root_logger.setLevel(logging.DEBUG)
            gensim_logger.setLevel(logging.DEBUG)

        logging.debug('initiating session with parameters: %s', env)

        # Initialize random seeds.
        random.seed(env['seed'])
        torch.manual_seed(env['seed'])
        np.random.seed(seed=env['seed'])

        # Initialize members
        self.name = name
        self.env = env
        self.ds = Dataset(name, env)
        self.dc_parser = Parser(env, self.ds)
        self.domain_engine = DomainEngine(env, self.ds)
        self.detect_engine = DetectEngine(env, self.ds)
        self.repair_engine = RepairEngine(env, self.ds)
        self.eval_engine = EvalEngine(env, self.ds)

    def load_data(self, name, fpath, na_values=None, entity_col=None, src_col=None):
        """
        load_data takes the filepath to a CSV file to load as the initial dataset.

        :param name: (str) name to initialize dataset with.
        :param fpath: (str) filepath to CSV file.
        :param na_values: (str) value that identifies a NULL value
        :param entity_col: (str) column containing the unique identifier/ID of an entity.
            For fusion tasks, rows with the same ID will be fused together in the output.
            If None, assumes every row is a unique entity.
        :param src_col: (str) if not None, for fusion tasks
            specifies the column containing the source for each "mention" of an
            entity.
        """
        status, load_time = self.ds.load_data(name,
                                              fpath,
                                              na_values=na_values,
                                              entity_col=entity_col,
                                              src_col=src_col)
        logging.info(status)
        logging.debug('Time to load dataset: %.2f secs', load_time)

    def load_dcs(self, fpath):
        """
        load_dcs ingests the Denial Constraints for initialized dataset.

        :param fpath: filepath to TXT file where each line contains one denial constraint.
        """
        status, load_time = self.dc_parser.load_denial_constraints(fpath)
        logging.info(status)
        logging.debug('Time to load dirty data: %.2f secs', load_time)

    def get_dcs(self):
        """Return whatever the denial-constraint parser's get_dcs() returns."""
        return self.dc_parser.get_dcs()

    def detect_errors(self, detect_list):
        """
        Run error detection through the detect engine and log its status.

        :param detect_list: passed unchanged to DetectEngine.detect_errors.
        """
        status, detect_time = self.detect_engine.detect_errors(detect_list)
        logging.info(status)
        logging.debug('Time to detect errors: %.2f secs', detect_time)

    def setup_domain(self):
        """Run the domain engine's setup step and log its status and duration."""
        status, domain_time = self.domain_engine.setup()
        logging.info(status)
        logging.debug('Time to setup the domain: %.2f secs', domain_time)

    def repair_errors(self, featurizers):
        """
        Run the repair pipeline: featurize, set up the model, fit, infer,
        then collect inferred values and store the repaired dataset.

        :param featurizers: passed unchanged to RepairEngine.setup_featurized_ds.
        :return: the status message of the last step that ran.
        """
        status, feat_time = self.repair_engine.setup_featurized_ds(featurizers)
        logging.info(status)
        logging.debug('Time to featurize data: %.2f secs', feat_time)

        status, setup_time = self.repair_engine.setup_repair_model()
        logging.info(status)
        # Report the setup step's own duration (previously logged feat_time).
        logging.debug('Time to setup repair model: %.2f secs', setup_time)

        status, fit_time = self.repair_engine.fit_repair_model()
        logging.info(status)
        logging.debug('Time to fit repair model: %.2f secs', fit_time)

        status, infer_time = self.repair_engine.infer_repairs()
        logging.info(status)
        logging.debug('Time to infer correct cell values: %.2f secs', infer_time)

        status, collect_time = self.ds.get_inferred_values()
        logging.info(status)
        logging.debug('Time to collect inferred values: %.2f secs', collect_time)

        status, store_time = self.ds.get_repaired_dataset()
        logging.info(status)
        logging.debug('Time to store repaired dataset: %.2f secs', store_time)

        # Featurizer weights are only produced when the environment asks for them.
        if self.env['print_fw']:
            status, weights_time = self.repair_engine.get_featurizer_weights()
            logging.info(status)
            logging.debug('Time to store featurizer weights: %.2f secs', weights_time)

        return status

    def evaluate(self, fpath, tid_col, attr_col, val_col, na_values=None):
        """
        evaluate generates an evaluation report with metrics (e.g. precision,
        recall) given a test set.

        :param fpath: (str) filepath to test set (ground truth) CSV file.
        :param tid_col: (str) column in CSV that corresponds to the TID.
        :param attr_col: (str) column in CSV that corresponds to the attribute.
        :param val_col: (str) column in CSV that corresponds to correct value
            for the current TID and attribute (i.e. cell).
        :param na_values: (Any) how na_values are represented in the data.

        Returns an EvalReport named tuple containing the experiment results.
        """
        name = self.ds.raw_data.name + '_clean'
        status, load_time = self.eval_engine.load_data(name, fpath, tid_col, attr_col, val_col,
                                                       na_values=na_values)
        logging.info(status)
        logging.debug('Time to evaluate repairs: %.2f secs', load_time)

        status, report_time, eval_report = self.eval_engine.eval_report()
        logging.info(status)
        logging.debug('Time to generate report: %.2f secs', report_time)
        return eval_report

    def explain_repairs(self, detectors):
        """
        Build the constraints graphs, generate explanations for the repairs,
        plot each explanation, and return them.

        :param detectors: (list) of ErrorDetector objects
        :return: the explanation collection produced by the explain engine.
        """
        # NOTE(review): self.explain_engine is not assigned in __init__ above;
        # confirm it is attached elsewhere before this method is called,
        # otherwise this raises AttributeError.
        graph_time = self.explain_engine.build_constraints_graphs()
        logging.debug('Time to build constraints graph: %.2f secs', graph_time)

        explanation, explain_time = self.explain_engine.explain_repairs(
            detectors)
        logging.info(explanation)
        logging.debug('Time to generate explanation: %.2f secs', explain_time)

        for e in explanation:
            self.explain_engine.plot_explanation(e)

        return explanation
class Session:
    """
    Session class controls the entire pipeline of HC
    """

    def __init__(self, env, name="session"):
        """
        Constructor for Holoclean session

        :param env: Holoclean environment; the 'verbose' and 'seed' entries
            are read here.
        :param name: Name for the Holoclean session
        """
        # use DEBUG logging level if verbose enabled
        if env['verbose']:
            root_logger.setLevel(logging.DEBUG)
            gensim_logger.setLevel(logging.DEBUG)

        logging.debug('initiating session with parameters: %s', env)

        # Initialize random seeds.
        random.seed(env['seed'])
        torch.manual_seed(env['seed'])
        np.random.seed(seed=env['seed'])

        # Initialize members
        self.name = name
        self.env = env
        self.ds = Dataset(name, env)
        self.dc_parser = Parser(env, self.ds)
        self.domain_engine = DomainEngine(env, self.ds)
        self.detect_engine = DetectEngine(env, self.ds)
        self.repair_engine = RepairEngine(env, self.ds)
        self.eval_engine = EvalEngine(env, self.ds)

    def load_data(self, name, fpath, na_values=None, entity_col=None, src_col=None,
                  exclude_attr_cols=None, numerical_attrs=None):
        """
        load_data takes the filepath to a CSV file to load as the initial dataset.

        :param name: (str) name to initialize dataset with.
        :param fpath: (str) filepath to CSV file.
        :param na_values: (str) value that identifies a NULL value
        :param entity_col: (str) column containing the unique identifier/ID of an entity.
            For fusion tasks, rows with the same ID will be fused together in the output.
            If None, assumes every row is a unique entity.
        :param src_col: (str) if not None, for fusion tasks
            specifies the column containing the source for each "mention" of an
            entity.
        :param exclude_attr_cols: (str list) passed through to Dataset.load_data.
        :param numerical_attrs: (str list) passed through to Dataset.load_data.
        """
        status, load_time = self.ds.load_data(
            name,
            fpath,
            na_values=na_values,
            entity_col=entity_col,
            src_col=src_col,
            exclude_attr_cols=exclude_attr_cols,
            numerical_attrs=numerical_attrs)
        logging.info(status)
        logging.debug('Time to load dataset: %.2f secs', load_time)

    def load_dcs(self, fpath):
        """
        load_dcs ingests the Denial Constraints for initialized dataset.

        :param fpath: filepath to TXT file where each line contains one denial constraint.
        """
        status, load_time = self.dc_parser.load_denial_constraints(fpath)
        logging.info(status)
        logging.debug('Time to load dirty data: %.2f secs', load_time)

    def get_dcs(self):
        """Return whatever the denial-constraint parser's get_dcs() returns."""
        return self.dc_parser.get_dcs()

    def detect_errors(self, detect_list):
        """
        Run error detection through the detect engine and log its status.

        :param detect_list: passed unchanged to DetectEngine.detect_errors.
        """
        status, detect_time = self.detect_engine.detect_errors(detect_list)
        logging.info(status)
        logging.debug('Time to detect errors: %.2f secs', detect_time)

    def disable_quantize(self):
        """Clear the do_quantization flag on the session, dataset and domain engine."""
        self.do_quantization = False
        self.ds.do_quantization = False
        self.domain_engine.do_quantization = False

    def quantize_numericals(self, num_attr_groups_bins):
        """
        Quantize the raw data with quantize_km, load the result as the
        quantized dataset, and return it.

        :param num_attr_groups_bins: list[tuple] where each tuple consists of
            (# of bins, list[str]) where the list[str] is a group of attributes
            to be treated as numerical.
        :return: the quantized data returned by quantize_km.
        """
        self.do_quantization = True
        self.ds.do_quantization = True
        self.domain_engine.do_quantization = True

        status, quantize_time, quantized_data = \
            quantize_km(self.env, self.ds.get_raw_data(), num_attr_groups_bins)

        logging.info(status)
        logging.debug('Time to quantize the dataset: %.2f secs', quantize_time)

        self.load_quantized_data(quantized_data)

        return quantized_data

    def load_quantized_data(self, df):
        """
        Register a quantized dataframe as the dataset's quantized table,
        store it in the database, and index each of its attributes.

        :param df: dataframe holding the quantized data.
        """
        tic = time.time()
        name = self.ds.raw_data.name + '_quantized'
        self.ds.quantized_data = Table(name, Source.DF, df=df)

        # Re-store to DB, ensuring numerical values are stored as floats.
        df_correct_type = df.copy()
        for attr in self.ds.numerical_attrs:
            # The NULL placeholder cannot be cast to float, so map it to NaN first.
            df_correct_type.loc[df_correct_type[attr] == NULL_REPR, attr] = np.nan
            df_correct_type[attr] = df_correct_type[attr].astype(float)
        df_correct_type.to_sql(name, self.ds.engine.engine, if_exists='replace',
                               index=False, index_label=None)

        for attr in self.ds.quantized_data.get_attributes():
            self.ds.quantized_data.create_db_index(self.ds.engine, [attr])
        logging.debug('Time to load quantized dataset: %.2f secs', time.time() - tic)

    def generate_domain(self):
        """Run the domain engine's setup step and log its status and duration."""
        status, domain_time = self.domain_engine.setup()
        logging.info(status)
        logging.debug('Time to generate the domain: %.2f secs', domain_time)

    def run_estimator(self):
        """
        Uses estimator to weak label and prune domain.
        """
        self.domain_engine.run_estimator()

    def repair_errors(self, featurizers):
        """
        Repair errors without a validation set.

        :param featurizers: passed unchanged to RepairEngine.setup_featurized_ds.
        :return: the status message of the last repair step that ran.
        """
        return self._repair_errors(featurizers)

    def repair_validate_errors(self, featurizers, fpath, tid_col, attr_col,
                               val_col, validate_period, na_values=None):
        """
        Repair errors while validating against a ground-truth set during fitting.

        See _repair_errors for the meaning of the validation parameters.

        :return: the status message of the last repair step that ran.
        """
        return self._repair_errors(featurizers, fpath, tid_col, attr_col,
                                   val_col, na_values, validate_period)

    def _repair_errors(self, featurizers, fpath=None, tid_col=None, attr_col=None,
                       val_col=None, na_values=None, validate_period=None):
        """
        Repair errors and optionally runs validation set per epoch.

        Must specify the following parameters if validation required:

        :param fpath: (str) filepath to test set (ground truth) CSV file.
        :param tid_col: (str) column in CSV that corresponds to the TID.
        :param attr_col: (str) column in CSV that corresponds to the attribute.
        :param val_col: (str) column in CSV that corresponds to correct value
            for the current TID and attribute (i.e. cell).
        :param na_values: (Any) how na_values are represented in the data.
        :param validate_period: (int) perform validation every nth epoch.
        :return: the status message of the last step that ran.
        """
        status, feat_time = self.repair_engine.setup_featurized_ds(featurizers)
        logging.info(status)
        logging.debug('Time to featurize data: %.2f secs', feat_time)

        status, setup_time = self.repair_engine.setup_repair_model()
        logging.info(status)
        # Report the setup step's own duration (previously logged feat_time).
        logging.debug('Time to setup repair model: %.2f secs', setup_time)

        # If validation fpath provided, fit and validate
        if fpath is None:
            status, fit_time = self.repair_engine.fit_repair_model()
        else:
            # Set up validation set
            name = self.ds.raw_data.name + '_clean'
            status, load_time = self.eval_engine.load_data(name, fpath,
                                                           tid_col, attr_col, val_col,
                                                           na_values=na_values)
            logging.info(status)
            logging.debug('Time to evaluate repairs: %.2f secs', load_time)

            status, fit_time = self.repair_engine.fit_validate_repair_model(
                self.eval_engine, validate_period)
        logging.info(status)
        logging.debug('Time to fit repair model: %.2f secs', fit_time)

        status, infer_time = self.repair_engine.infer_repairs()
        logging.info(status)
        logging.debug('Time to infer correct cell values: %.2f secs', infer_time)

        # Locals are deliberately not named `time`, which would shadow the
        # time module used elsewhere in this class.
        status, collect_time = self.ds.get_inferred_values()
        logging.info(status)
        logging.debug('Time to collect inferred values: %.2f secs', collect_time)

        status, store_time = self.ds.get_repaired_dataset()
        logging.info(status)
        logging.debug('Time to store repaired dataset: %.2f secs', store_time)

        # Featurizer weights are only produced when the environment asks for them.
        if self.env['print_fw']:
            status, weights_time = self.repair_engine.get_featurizer_weights()
            logging.info(status)
            logging.debug('Time to store featurizer weights: %.2f secs', weights_time)

        return status

    def evaluate(self, fpath, tid_col, attr_col, val_col, na_values=None):
        """
        evaluate generates an evaluation report with metrics (e.g. precision,
        recall) given a test set.

        :param fpath: (str) filepath to test set (ground truth) CSV file.
        :param tid_col: (str) column in CSV that corresponds to the TID.
        :param attr_col: (str) column in CSV that corresponds to the attribute.
        :param val_col: (str) column in CSV that corresponds to correct value
            for the current TID and attribute (i.e. cell).
        :param na_values: (Any) how na_values are represented in the data.

        Returns an EvalReport named tuple containing the experiment results.
        """
        name = self.ds.raw_data.name + '_clean'
        status, load_time = self.eval_engine.load_data(name, fpath, tid_col, attr_col, val_col,
                                                       na_values=na_values)
        logging.info(status)
        logging.debug('Time to evaluate repairs: %.2f secs', load_time)

        status, report_time, eval_report = self.eval_engine.eval_report()
        logging.info(status)
        logging.debug('Time to generate report: %.2f secs', report_time)
        return eval_report

    def get_predictions(self):
        """
        Returns a dataframe with 4 columns:
            - tid, attribute, inferred_val, proba

        All columns are built with dtype=str.
        """
        query = """
        SELECT
            _tid_, attribute, inferred_val, prob
        FROM {dom}
        INNER JOIN {inf_vals} USING(_vid_)
        """.format(inf_vals=AuxTables.inf_values_idx.name,
                   dom=AuxTables.cell_domain.name)
        res = self.ds.engine.execute_query(query)
        df_preds = pd.DataFrame(
            res,
            columns=['tid', 'attribute', 'inferred_val', 'proba'],
            dtype=str)
        return df_preds
class Session:
    """
    Session class controls the entire pipeline of HC
    """

    def __init__(self, env, name="session"):
        """
        Constructor for Holoclean session

        :param env: Holoclean environment
        :param name: Name for the Holoclean session
        """
        # Initialize members
        self.name = name
        self.env = env
        self.ds = Dataset(name, env)
        self.dc_parser = Parser(env, self.ds)
        self.domain_engine = DomainEngine(env, self.ds)
        self.detect_engine = DetectEngine(env, self.ds)
        self.repair_engine = RepairEngine(env, self.ds)
        self.eval_engine = EvalEngine(env, self.ds)

    def load_data(self, name, f_path, f_name, na_values=None):
        """
        Load the initial dataset through the Dataset object and print its status.

        :param name: name to initialize the dataset with.
        :param f_path: passed unchanged to Dataset.load_data.
        :param f_name: passed unchanged to Dataset.load_data.
        :param na_values: passed to Dataset.load_data as na_values.
        """
        status, load_time = self.ds.load_data(name, f_path, f_name, na_values=na_values)
        print(status)
        if self.env['verbose']:
            print('Time to load dataset: %.2f secs' % load_time)

    def load_dcs(self, f_path, f_name):
        """
        Load denial constraints through the parser and print its status.

        :param f_path: passed unchanged to Parser.load_denial_constraints.
        :param f_name: passed unchanged to Parser.load_denial_constraints.
        """
        status, load_time = self.dc_parser.load_denial_constraints(
            f_path, f_name)
        print(status)
        if self.env['verbose']:
            print('Time to load dirty data: %.2f secs' % load_time)

    def get_dcs(self):
        """Return whatever the denial-constraint parser's get_dcs() returns."""
        return self.dc_parser.get_dcs()

    def detect_errors(self, detect_list):
        """
        Run error detection through the detect engine and print its status.

        :param detect_list: passed unchanged to DetectEngine.detect_errors.
        """
        status, detect_time = self.detect_engine.detect_errors(detect_list)
        print(status)
        if self.env['verbose']:
            print('Time to detect errors: %.2f secs' % detect_time)

    def setup_domain(self):
        """Run the domain engine's setup step and print its status."""
        status, domain_time = self.domain_engine.setup()
        print(status)
        if self.env['verbose']:
            print('Time to setup the domain: %.2f secs' % domain_time)

    def repair_errors(self, featurizers):
        """
        Run the repair pipeline: featurize, set up the model, fit, infer,
        then collect inferred values and store the repaired dataset.

        Each step's status is printed; timings are printed only when
        env['verbose'] is truthy.

        :param featurizers: passed unchanged to RepairEngine.setup_featurized_ds.
        """
        status, feat_time = self.repair_engine.setup_featurized_ds(featurizers)
        print(status)
        if self.env['verbose']:
            print('Time to featurize data: %.2f secs' % feat_time)

        status, setup_time = self.repair_engine.setup_repair_model()
        print(status)
        if self.env['verbose']:
            # Report the setup step's own duration (previously printed feat_time).
            print('Time to setup repair model: %.2f secs' % setup_time)

        status, fit_time = self.repair_engine.fit_repair_model()
        print(status)
        if self.env['verbose']:
            print('Time to fit repair model: %.2f secs' % fit_time)

        status, infer_time = self.repair_engine.infer_repairs()
        print(status)
        if self.env['verbose']:
            print('Time to infer correct cell values: %.2f secs' % infer_time)

        status, collect_time = self.ds.get_inferred_values()
        print(status)
        if self.env['verbose']:
            print('Time to collect inferred values: %.2f secs' % collect_time)

        status, store_time = self.ds.get_repaired_dataset()
        print(status)
        if self.env['verbose']:
            print('Time to store repaired dataset: %.2f secs' % store_time)

    def evaluate(self, f_path, f_name, get_tid, get_attr, get_value, na_values=None):
        """
        Load the ground-truth data into the eval engine and generate the
        evaluation report, printing the status of both steps.

        :param f_path: passed unchanged to EvalEngine.load_data.
        :param f_name: passed unchanged to EvalEngine.load_data.
        :param get_tid: passed unchanged to EvalEngine.load_data.
        :param get_attr: passed unchanged to EvalEngine.load_data.
        :param get_value: passed unchanged to EvalEngine.load_data.
        :param na_values: passed to EvalEngine.load_data as na_values.
        """
        name = self.ds.raw_data.name + '_clean'
        status, load_time = self.eval_engine.load_data(name, f_path, f_name,
                                                       get_tid, get_attr, get_value,
                                                       na_values=na_values)
        print(status)
        if self.env['verbose']:
            print('Time to evaluate repairs: %.2f secs' % load_time)

        status, report_time = self.eval_engine.eval_report()
        print(status)
        if self.env['verbose']:
            print('Time to generate report: %.2f secs' % report_time)