def test_hospital_without_init():
    db_name = random_database()
    try:
        # 1. Setup a HoloClean session.
        hc = holoclean.HoloClean(db_name=db_name,
                                 domain_thresh_1=0.0,
                                 domain_thresh_2=0.0,
                                 weak_label_thresh=0.99,
                                 max_domain=10000,
                                 cor_strength=0.6,
                                 nb_cor_strength=0.8,
                                 epochs=10,
                                 weight_decay=0.01,
                                 learning_rate=0.001,
                                 threads=1,
                                 batch_size=1,
                                 verbose=True,
                                 timeout=3 * 60000,
                                 feature_norm=False,
                                 weight_norm=False,
                                 print_fw=True).session

        # 2. Load training data and denial constraints.
        hc.load_data('hospital', '../testdata/hospital.csv')
        hc.load_dcs('../testdata/hospital_constraints.txt')
        hc.ds.set_constraints(hc.get_dcs())

        # 3. Detect erroneous cells using these two detectors.
        detectors = [NullDetector(), ViolationDetector()]
        hc.detect_errors(detectors)

        # 4. Repair errors utilizing the defined features.
        hc.setup_domain()
        featurizers = [
            OccurAttrFeaturizer(),
            FreqFeaturizer(),
            ConstraintFeaturizer(),
        ]
        hc.repair_errors(featurizers)

        # 5. Evaluate the correctness of the results.
        report = hc.evaluate(fpath='../testdata/hospital_clean.csv',
                             tid_col='tid',
                             attr_col='attribute',
                             val_col='correct_val')

        # We assert that the key metrics are exactly what we expect for the
        # hospital dataset. If these assertions ever fail after a change, the
        # new results should be comparable, if not better, unless a clear and
        # correct reason can be given.
        assert report.correct_repairs == 434
        assert report.total_repairs == 456
        assert abs(report.precision - 434. / 456) < TOL
        assert abs(report.recall - 434. / 509) < TOL
        assert abs(report.repair_recall - 434. / 435) < TOL
        assert report.total_repairs_grdt_correct == 22
    finally:
        delete_database(db_name)
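# The test relies on random_database() and delete_database() helpers that are
# not defined here. A minimal sketch of what they might look like, assuming a
# local Postgres instance and HoloClean's documented default credentials (both
# assumptions; the real test utilities may differ):
import random
import string

import psycopg2


def random_database():
    # Create a uniquely named database so concurrent test runs do not collide.
    name = 'test_' + ''.join(random.choices(string.ascii_lowercase, k=10))
    conn = psycopg2.connect(dbname='postgres', user='holocleanuser',
                            password='abcd1234', host='localhost')
    conn.autocommit = True  # CREATE DATABASE cannot run inside a transaction.
    with conn.cursor() as cur:
        cur.execute('CREATE DATABASE {}'.format(name))
    conn.close()
    return name


def delete_database(name):
    # Drop the database created by random_database().
    conn = psycopg2.connect(dbname='postgres', user='holocleanuser',
                            password='abcd1234', host='localhost')
    conn.autocommit = True
    with conn.cursor() as cur:
        cur.execute('DROP DATABASE IF EXISTS {}'.format(name))
    conn.close()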
def template(featurizers, estimator_type):
    db_name = random_database()
    try:
        # 1. Setup a HoloClean session.
        hc = holoclean.HoloClean(
            db_name=db_name,
            domain_thresh_1=0.0,
            domain_thresh_2=0.0,
            weak_label_thresh=0.99,
            max_domain=10000,
            cor_strength=0.6,
            nb_cor_strength=0.8,
            epochs=10,
            weight_decay=0.01,
            learning_rate=0.001,
            threads=1,
            batch_size=1,
            verbose=True,
            timeout=3 * 60000,
            print_fw=True,
            estimator_type=estimator_type,
        ).session

        # 2. Load training data and denial constraints.
        hc.load_data('hospital', '../testdata/hospital/hospital.csv')
        hc.load_dcs('../testdata/hospital/hospital_constraints.txt')
        hc.ds.set_constraints(hc.get_dcs())

        # 3. Detect erroneous cells using these two detectors.
        detectors = [NullDetector(), ViolationDetector()]
        hc.detect_errors(detectors)

        # 4. Repair errors utilizing the defined features.
        hc.generate_domain()
        hc.run_estimator()
        hc.repair_errors(featurizers)

        # 5. Evaluate the correctness of the results.
        report = hc.evaluate(fpath='../testdata/hospital/hospital_clean.csv',
                             tid_col='tid',
                             attr_col='attribute',
                             val_col='correct_val')
        return report
    finally:
        delete_database(db_name)
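# A hedged usage sketch for template(). The featurizer mix mirrors the test
# above; 'NaiveBayes' is one estimator_type the upstream codebase appears to
# accept, but treat both choices as assumptions.
featurizers = [OccurAttrFeaturizer(), FreqFeaturizer(), ConstraintFeaturizer()]
report = template(featurizers, estimator_type='NaiveBayes')
print(report.precision, report.recall)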
import holoclean
from detect import *
from repair.featurize import *

# 1. Setup a HoloClean session.
hc = holoclean.HoloClean(
    db_name='holo',
    domain_thresh_1=0.0,
    domain_thresh_2=0.0,
    weak_label_thresh=0.99,
    max_domain=10000,
    cor_strength=0.6,
    nb_cor_strength=0.8,
    weight_decay=0.01,
    learning_rate=0.001,
    threads=1,
    batch_size=1,
    verbose=True,
    timeout=3 * 60000,
    print_fw=True,
).session

# 2. Load training data and denial constraints.
hc.load_data('hospital', '../testdata/hospital/hospital.csv')
hc.load_dcs('../testdata/hospital/hospital_constraints.txt')
hc.ds.set_constraints(hc.get_dcs())

# 3. Detect erroneous cells using these two detectors.
detectors = [NullDetector(), ViolationDetector()]
hc.detect_errors(detectors)
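# The snippet stops after error detection. A plausible completion of steps 4
# and 5, modeled on the test functions above (the exact featurizer mix is an
# assumption):

# 4. Repair errors utilizing the defined features.
hc.setup_domain()
featurizers = [
    InitAttrFeaturizer(),
    OccurAttrFeaturizer(),
    FreqFeaturizer(),
    ConstraintFeaturizer(),
]
hc.repair_errors(featurizers)

# 5. Evaluate the correctness of the results.
hc.evaluate(fpath='../testdata/hospital/hospital_clean.csv',
            tid_col='tid',
            attr_col='attribute',
            val_col='correct_val')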
def hc_repair(data_name, tmp_path, num_attrs, paras, attr=None, n_val=-1):
    """
    Fill values one by one in the synthetic dataset.

    :param data_name: name of the true dataset
    :param tmp_path: temporary path for the partial data
    :param num_attrs: a list of numerical attributes
    :param paras: parameter dictionary
    :param attr: attribute whose predictions are cached when MCMC is enabled
    :param n_val: number of values to predict; set to None to predict over
        the entire domain

    Return the predictions dumped from the autoencoder.
    """
    m = paras.get('MCMC', 0)
    path_preds = None
    if m > 0:
        assert attr is not None
        # Load the model from a previous save, without training.
        dir_preds = os.path.abspath('./_models')
        os.makedirs(dir_preds, exist_ok=True)
        path_preds = f"{dir_preds}/model_{data_name}_{attr}.pkl"
        if os.path.exists(path_preds):
            with open(path_preds, 'rb') as f:
                df_preds = pickle.load(f)
            logging.info('DONE with loading model from file')
            return df_preds

    if n_val is not None:
        # n_val is None only for weight learning; otherwise train the model in
        # the standard way (or when the saved model does not exist for MCMC).
        n_val_limit, n_try = _get_sampling_paras(paras.get('AR', False))
        n_val = n_val_limit * n_try

    hc = holoclean.HoloClean(db_name='db4kamino',
                             domain_thresh_1=0.0,
                             domain_thresh_2=0.0,
                             max_domain=10000,
                             cor_strength=0,
                             weight_decay=0.,
                             learning_rate=0.001,
                             threads=1,
                             batch_size=1,
                             verbose=False,
                             timeout=3 * 60000,
                             infer_mode='dk',
                             privacy=paras['dp'],
                             delta=paras['delta'],
                             iterations=paras['iterations'],
                             noise_multiplier=paras['noise_multiplier'],
                             l2_norm_clip=paras['l2_norm_clip'],
                             minibatch_size=paras['minibatch_size'],
                             microbatch_size=paras['microbatch_size']).session

    hc.load_data(data_name, tmp_path, numerical_attrs=num_attrs)

    detectors = [NullDetector()]
    hc.detect_errors(detectors)

    num_attr_groups = []
    quantized_num = []
    num_attrs_quant = _get_num_attrs_quant(data_name)
    for num_attr in num_attrs:
        num_attr_groups.append([num_attr])
        if num_attrs_quant is not None and num_attr in num_attrs_quant:
            quantized_num.append((num_attrs_quant[num_attr], [num_attr]))
    hc.quantize_numericals(quantized_num)

    hc.generate_domain()

    embedfest = EmbeddingFeaturizer(reuse_embedding=paras['reuse_embedding'],
                                    numerical_attr_groups=num_attr_groups)
    embedfest.setup_featurizer(hc.env, hc.ds)

    if m > 0:
        assert attr is not None and path_preds is not None
        df_preds = embedfest.embedding_model.dump_predictions_hm(
            n_val_limit=n_val, include_std=True, fpath=path_preds)
    else:
        df_preds = embedfest.embedding_model.gen_predictions(n_val_limit=n_val,
                                                             include_std=True)
    logging.info('DONE with training the autoencoder model')
    return df_preds
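# A hypothetical call to hc_repair(). The dataset name, path, and attribute
# are placeholders, and the paras dict includes only the keys that hc_repair
# itself reads; all values are illustrative, not recommended settings.
paras = {
    'dp': True,                # enable differentially private training
    'delta': 1e-5,
    'iterations': 1000,
    'noise_multiplier': 1.1,
    'l2_norm_clip': 1.0,
    'minibatch_size': 16,
    'microbatch_size': 1,
    'reuse_embedding': False,
    'MCMC': 0,                 # 0 skips the prediction-cache path
    'AR': False,
}
df_preds = hc_repair('hospital', '/tmp/partial.csv', ['Score'], paras)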
def holoclean_init(data_df):
    hc = holoclean.HoloClean(db_name='holo',
                             domain_thresh_1=0,
                             domain_thresh_2=0,
                             weak_label_thresh=0.99,
                             max_domain=10000,
                             cor_strength=0.6,
                             nb_cor_strength=0.8,
                             epochs=10,
                             weight_decay=0.01,
                             learning_rate=0.001,
                             threads=1,
                             batch_size=1,
                             verbose=False,
                             timeout=1 * 600,
                             feature_norm=False,
                             weight_norm=False,
                             print_fw=False).session
    hc.load_data('hospital', data_df)
    hc.load_dcs('./temp_constraints.txt')
    hc.ds.set_constraints(hc.get_dcs())
    hc.setup_domain(list(data_df.columns))
    return hc


def create_constraints_file(relevant_attr):
    # Keep only the denial constraints that mention a relevant attribute, and
    # collect every attribute those constraints touch.
    fr = open('./testdata/hospital_constraints.txt')
    fw = open('./temp_constraints.txt', "w+")
    attributes_to_keep = set()
    for line in fr:
        for attr in relevant_attr:
            if attr in line:
                fw.write(line)
                for item in line.split("t2.")[1:]:
                    attributes_to_keep.add(item.split(")")[0])
                break  # avoid writing the same constraint twice
    fr.close()
    fw.close()
    return attributes_to_keep


def holoclean_detect(hc):
    detectors = [NullDetector(), ViolationDetector()]
    featurizers = [
        InitAttrFeaturizer(),
        OccurAttrFeaturizer(),
        FreqFeaturizer(),
        ConstraintFeaturizer(),
    ]
    hc.detect_errors(detectors)
    hc.repair_errors(featurizers)
    return hc


def run_holoclean(df, columns):
    relevant_attributes = create_constraints_file(columns)
    df_in = df.copy()
    # Index with a list: pandas no longer accepts a set as a column selector.
    df_in = df_in[list(relevant_attributes)]
    hc = holoclean_init(df_in)
    hc = holoclean_detect(hc)
    return hc
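# Hypothetical driver for run_holoclean(); the CSV path and attribute names
# are placeholders rather than values taken from the code above.
import pandas as pd

df = pd.read_csv('./testdata/hospital.csv')
hc = run_holoclean(df, ['HospitalName', 'ZipCode'])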
import holoclean
from detect import NullDetector, ViolationDetector
from repair.featurize import InitFeaturizer
from repair.featurize import InitAttFeaturizer
from repair.featurize import InitSimFeaturizer
from repair.featurize import FreqFeaturizer
from repair.featurize import OccurFeaturizer
from repair.featurize import ConstraintFeat
from repair.featurize import LangModelFeat

# 1. Setup a HoloClean session.
hc = holoclean.HoloClean(pruning_topk=0.1,
                         epochs=30,
                         weight_decay=0.01,
                         threads=20,
                         batch_size=1,
                         verbose=True,
                         timeout=3 * 60000).session

# 2. Load training data and denial constraints.
hc.load_data('hospital', 'data', 'hospital.csv')
hc.load_dcs('data', 'hospital_constraints_att.txt')
hc.ds.set_constraints(hc.get_dcs())

# 3. Detect erroneous cells using these two detectors.
detectors = [NullDetector(), ViolationDetector()]
hc.detect_errors(detectors)

# 4. Repair errors utilizing the defined features.
hc.setup_domain()
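# The imports above pull in several featurizers that the snippet never uses,
# and it stops right after setup_domain(). A plausible continuation under this
# older API (the featurizer mix and the repair_errors() call are assumptions):
featurizers = [
    InitAttFeaturizer(),
    InitSimFeaturizer(),
    FreqFeaturizer(),
    OccurFeaturizer(),
    ConstraintFeat(),
    LangModelFeat(),
]
hc.repair_errors(featurizers)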
import holoclean
from holoclean.detect import *
from holoclean.repair.featurize import *

# 1. Setup a HoloClean session.
hc = holoclean.HoloClean(
    sqlalchemy_uri="postgresql://*****:*****@localhost:5432/superset",
    domain_thresh_1=0.0,
    domain_thresh_2=0.0,
    weak_label_thresh=0.99,
    max_domain=10000,
    cor_strength=0.6,
    nb_cor_strength=0.8,
    weight_decay=0.01,
    learning_rate=0.001,
    threads=1,
    batch_size=1,
    verbose=True,
    timeout=3 * 60000,
    print_fw=True,
).session

# 2. Load training data and denial constraints.
hc.load_data('hospital', '../testdata/hospital/hospital.csv')
hc.load_dcs('../testdata/hospital/hospital_constraints.txt')
hc.ds.set_constraints(hc.get_dcs())

# 3. Detect erroneous cells using these two detectors.
detectors = [NullDetector(), ViolationDetector()]
hc.detect_errors(detectors)
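# The credentials in sqlalchemy_uri are masked on purpose; leave them masked
# in source. One way to keep them out of the code entirely (a sketch with
# hypothetical environment-variable names) is to assemble the URI at runtime:
import os

uri = 'postgresql://{}:{}@localhost:5432/superset'.format(
    os.environ['HOLO_DB_USER'], os.environ['HOLO_DB_PASSWORD'])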