예제 #1
0
from ml.datasets.mohammad import MohammadDataSet
from ml.tools.openrefine.OpenRefine import OpenRefine

# One rule is used for all columns:
#   if(contains(value, "x"), "error", value)
# Takes 3 minutes to execute.

data = MohammadDataSet("tax", 20, 30, 10)

# The tool is built from a result file path plus the data set loaded above.
tool = OpenRefine(
    "/home/felix/SequentialPatternErrorDetection/OpenRefine/tax/result/tax_o20_r30_p10-csv-with-minus-rule.tsv",
    data=data)

# Single-argument print(...) is accepted both as a Python 2 print statement
# (parenthesised expression) and as a Python 3 function call, and emits the
# same text in both, so the script no longer fails to parse on Python 3.
print("Fscore: " + str(tool.calculate_total_fscore()))
print("Precision: " + str(tool.calculate_total_precision()))
print("Recall: " + str(tool.calculate_total_recall()))

# Report the F-score once per column index of the data set.
for c in range(data.shape[1]):
    print(tool.calculate_fscore_by_column(c))
예제 #2
0
from sets import Set

from ml.datasets.mohammad import MohammadDataSet
from ml.tools.nadeef_repair.FD import FD
from ml.tools.nadeef_repair.NadeefAll import NadeefAll

data = MohammadDataSet("books", 30, 30, 10)

# Functional-dependency rules handed to NadeefAll; each FD is built from a
# Set of column names and a single target column name.
rules = [
    # Mohammad's rule
    FD(Set(["first_author_varchar"]), "language_varchar"),

    # Disabled alternative, kept for reference:
    # FD(Set(["first_author_varchar", "publish_date_varchar", "rating_varchar"]), "language_varchar"),
    FD(Set(["isbn13_varchar", "publisher_varchar", "rating_varchar", "title_varchar"]), "first_author_varchar"),
    FD(Set(["description_varchar", "first_author_varchar", "format_varchar", "title_varchar"]), "isbn13_varchar"),
]

nadeef = NadeefAll(data, rules)
from ml.datasets.mohammad import MohammadDataSet
from ml.tools.dboost.TestDBoost import test

# Data set the dBoost test helper is run against.
data = MohammadDataSet("bikes", 30, 0, 20)

# Settings passed positionally to test() after the data set.
sample_size, steps = 10, 100

test(data, sample_size, steps)
예제 #4
0
from ml.datasets.mohammad import MohammadDataSet
from ml.tools.dboost.TestDBoost import run_params_gaussian

data = MohammadDataSet("cars", 30, 20, 20)

# NOTE(review): sample_size and steps are not passed to the call below;
# they are kept so the module-level names stay unchanged.
sample_size, steps = 10, 100

# Parameter values handed to run_params_gaussian together with the data set.
best_params = {
    'gaussian': 1.0,
    'statistical': 0.5,
}
run_params_gaussian(data, best_params)