# Evaluates a stored OpenRefine result for the "tax" dataset: prints the total
# F-score, precision and recall reported by the tool, followed by the value of
# calculate_fscore_by_column() for every column index.
from ml.datasets.mohammad import MohammadDataSet
from ml.tools.openrefine.OpenRefine import OpenRefine

# one rule for all columns:
#   if(contains(value, "x"), "error", value)
# takes 3 mins to execute

# NOTE(review): the arguments 20, 30, 10 presumably correspond to the
# o20 / r30 / p10 tags in the result file name below -- confirm.
data = MohammadDataSet("tax", 20, 30, 10)

# Absolute, machine-specific path to the OpenRefine output file; it has to be
# adjusted when the script is run on another machine.
tool = OpenRefine(
    "/home/felix/SequentialPatternErrorDetection/OpenRefine/tax/result/tax_o20_r30_p10-csv-with-minus-rule.tsv",
    data=data)

# Single-argument print(...) produces the same output with the Python 2 print
# statement and with the Python 3 print function.
print("Fscore: " + str(tool.calculate_total_fscore()))
print("Precision: " + str(tool.calculate_total_precision()))
print("Recall: " + str(tool.calculate_total_recall()))

# data.shape[1] is used as the number of columns to report on.
for c in range(data.shape[1]):
    print(tool.calculate_fscore_by_column(c))
# Builds the list of FD rules for the "books" dataset and passes it, together
# with the dataset, to NadeefAll.
try:
    from sets import Set
except ImportError:
    # The `sets` module only exists on Python 2 (deprecated since 2.6); the
    # builtin set type is its replacement on Python 3.
    Set = set

from ml.datasets.mohammad import MohammadDataSet
from ml.tools.nadeef_repair.FD import FD
from ml.tools.nadeef_repair.NadeefAll import NadeefAll

data = MohammadDataSet("books", 30, 30, 10)

# Each rule is FD(<set of column names>, <column name>).
# NOTE(review): presumably a functional dependency "columns -> column";
# confirm against the FD class.
rules = []

# The two #''' markers are a manual toggle: uncommenting both turns the rule
# between them into a string literal, i.e. disables it.
#'''
# Mohammad's rule
rules.append(FD(Set(["first_author_varchar"]), "language_varchar"))
#'''

# Alternative rule, currently disabled:
#rules.append(FD(Set(["first_author_varchar", "publish_date_varchar", "rating_varchar"]), "language_varchar"))
rules.append(FD(Set(["isbn13_varchar", "publisher_varchar", "rating_varchar", "title_varchar"]), "first_author_varchar"))
rules.append(FD(Set(["description_varchar", "first_author_varchar", "format_varchar", "title_varchar"]), "isbn13_varchar"))

nadeef = NadeefAll(data, rules)
# Calls the dBoost test routine `test` on the "bikes" dataset.
from ml.datasets.mohammad import MohammadDataSet
from ml.tools.dboost.TestDBoost import test

data = MohammadDataSet("bikes", 30, 0, 20)

# Passed positionally to test() after the dataset.
sample_size, steps = 10, 100

test(data, sample_size, steps)
# Calls run_params_gaussian from the dBoost test module on the "cars" dataset
# with a fixed parameter dictionary.
from ml.datasets.mohammad import MohammadDataSet
from ml.tools.dboost.TestDBoost import run_params_gaussian

data = MohammadDataSet("cars", 30, 20, 20)

# NOTE(review): sample_size and steps are assigned but never passed to
# run_params_gaussian below -- possibly left over from a sibling script;
# confirm before removing.
sample_size, steps = 10, 100

# Parameter values handed to run_params_gaussian, keyed by name.
best_params = {
    'gaussian': 1.0,
    'statistical': 0.5,
}

run_params_gaussian(data, best_params)