def test_simple_holdout(): # ******* setting up DINTModel dm = SchemaMatcher(host="localhost", port=8080) # dictionary with features single_feature_config = { "activeFeatures": [ "num-unique-vals", "prop-unique-vals", "prop-missing-vals", "ratio-alpha-chars", "prop-numerical-chars", "prop-whitespace-chars", "prop-entries-with-at-sign" ] } # resampling strategy resampling_strategy = "NoResampling" dint_model = DINTModel(dm, single_feature_config, resampling_strategy, "DINTModel with simple feature config", debug_csv=os.path.join( "results", "debug_dint_simple_holdout.csv")) models = [dint_model] loo_experiment = Experiment( models, experiment_type="repeated_holdout", description="repeated_holdout_0.5_2", result_csv=os.path.join('results', "performance_simple_holdout.csv"), debug_csv=os.path.join("results", "debug_simple_holdout.csv"), holdout=0.5, num=2) loo_experiment.run()
def test_fullfeature_resampletomean(): # ******* setting up DINTModel dm = SchemaMatcher(host="localhost", port=8080) # dictionary with features full_feature_config = { "activeFeatures": [ "num-unique-vals", "prop-unique-vals", "prop-missing-vals", "ratio-alpha-chars", "prop-numerical-chars", "prop-whitespace-chars", "prop-entries-with-at-sign", "prop-entries-with-hyphen", "prop-entries-with-paren", "prop-entries-with-currency-symbol", "mean-commas-per-entry", "mean-forward-slashes-per-entry", "prop-range-format", "is-discrete", "entropy-for-discrete-values" ], "activeFeatureGroups": [ "stats-of-text-length", "stats-of-numerical-type", "prop-instances-per-class-in-knearestneighbours", "mean-character-cosine-similarity-from-class-examples", "min-editdistance-from-class-examples", "min-wordnet-jcn-distance-from-class-examples", "min-wordnet-lin-distance-from-class-examples" ], "featureExtractorParams": [{ "name": "prop-instances-per-class-in-knearestneighbours", "num-neighbours": 5 }, { "name": "min-wordnet-jcn-distance-from-class-examples", "max-comparisons-per-class": 5 }, { "name": "min-wordnet-lin-distance-from-class-examples", "max-comparisons-per-class": 5 }] } # resampling strategy resampling_strategy = "ResampleToMean" dint_model = DINTModel( dm, full_feature_config, resampling_strategy, "DINTModel with full feature config and resampleToMean and filtered types and no parallel", debug_csv=os.path.join("results", "debug_dint_full_resampletomean_no.csv")) # models for experiments models = [dint_model] loo_experiment = Experiment(models, experiment_type="leave_one_out", description="plain loo", result_csv=os.path.join( 'results', "performance_resample_filter_no.csv"), debug_csv=os.path.join("results", "debug_resample.csv")) loo_experiment.run()
def test_resampletomean(): # ******* setting up DINTModel dm = SchemaMatcher(host="localhost", port=8080) logging.info("Cleaning models from DINT server") for m in dm.models: dm.remove_model(m) logging.info("Cleaning datasets from DINT server") for ds in dm.datasets: dm.remove_dataset(ds) m1 = create_dint_model(dm, "full", "ResampleToMean") m2 = create_dint_model(dm, "single", "ResampleToMean") m3 = create_dint_model(dm, "full_chardist", "ResampleToMean") m4 = create_dint_model(dm, "noheader", "ResampleToMean") m5 = create_dint_model(dm, "chardistonly", "ResampleToMean") models = [m1, m2, m3, m4, m5] loo_experiment = Experiment( models, experiment_type="leave_one_out", description="plain loo", result_csv=os.path.join('results', "performance_dint_resampletomean.csv"), debug_csv=os.path.join("results", "debug_dint_resampletomean.csv")) loo_experiment.run()
def test_models_holdout(): # ******* setting up DINTModel dm = SchemaMatcher(host="localhost", port=8080) logging.info("Cleaning models from DINT server") for m in dm.models: dm.remove_model(m) logging.info("Cleaning datasets from DINT server") for ds in dm.datasets: dm.remove_dataset(ds) m1 = create_dint_model(dm, "full", "NoResampling") m2 = create_dint_model(dm, "single", "NoResampling") m3 = create_dint_model(dm, "chardist", "NoResampling") m4 = create_dint_model(dm, "noheader", "NoResampling") m5 = create_dint_model(dm, "chardistonly", "NoResampling") rf_model = NNetModel(['rf@charfreq'], 'rf@charfreq model: no headers', add_headers=False, p_header=0, debug_csv=os.path.join("results", "debug_nnet_rf_holdout.csv")) models = [m1, m2, m3, m4, m5, rf_model] rhold_experiment = Experiment( models, experiment_type="repeated_holdout", description="repeated_holdout_0.5_10", result_csv=os.path.join('results', "performance_models_holdout.csv"), debug_csv=os.path.join("results", "debug_holdout.csv"), holdout=0.5, num=10) rhold_experiment.run()
def test_simple(ignore_uknown=True, domains=None): # ******* setting up DINTModel dm = SchemaMatcher(host="localhost", port=8080) # dictionary with features single_feature_config = { "activeFeatures": [ "num-unique-vals", "prop-unique-vals", "prop-missing-vals", "ratio-alpha-chars", "prop-numerical-chars", "shannon-entropy", "prop-whitespace-chars", "prop-entries-with-at-sign" ] } # resampling strategy resampling_strategy = "NoResampling" dint_model = DINTModel(dm, single_feature_config, resampling_strategy, "DINTModel with simple feature config", debug_csv=os.path.join("results", "debug_dint_simple.csv"), ignore_unknown=ignore_uknown) models = [dint_model] loo_experiment = Experiment(models, experiment_type="leave_one_out", description="plain loo", result_csv=os.path.join( 'results', "performance_simple.csv"), debug_csv=os.path.join("results", "debug_simple.csv")) weapons = [ "www.theoutdoorstrader.com.csv", "www.tennesseegunexchange.com.csv", "www.montanagunclassifieds.com.csv", "www.kyclassifieds.com.csv", "www.hawaiiguntrader.com.csv", "www.gunsinternational.com.csv", "www.floridaguntrader.com.csv", "www.floridagunclassifieds.com.csv", "www.elpasoguntrader.com.csv", "www.dallasguns.com.csv", "www.armslist.com.csv", "www.alaskaslist.com.csv" ] if domains: loo_experiment.change_domains(domains) loo_experiment.run()
def test_singlefeatures(): # ******* setting up DINTModel dm = SchemaMatcher(host="localhost", port=8080) # dictionary with features single_feature_config = { "activeFeatures": [ "num-unique-vals", "prop-unique-vals", "prop-missing-vals", "ratio-alpha-chars", "prop-numerical-chars", "prop-whitespace-chars", "prop-entries-with-at-sign", "prop-entries-with-hyphen", "prop-entries-with-paren", "prop-entries-with-currency-symbol", "mean-commas-per-entry", "mean-forward-slashes-per-entry", "prop-range-format", "is-discrete", "entropy-for-discrete-values" ] } # resampling strategy resampling_strategy = "BaggingToMax" dint_model = DINTModel( dm, single_feature_config, resampling_strategy, "DINTModel with single feature config and baggingtomax and filtered types and no parallel", debug_csv=os.path.join("results", "debug_dint_single_no.csv")) models = [dint_model] loo_experiment = Experiment(models, experiment_type="leave_one_out", description="plain loo", result_csv=os.path.join( 'results', "performance_bagging_filter_no.csv"), debug_csv=os.path.join("results", "debug_bagging.csv")) loo_experiment.run()
backupCount=5, encoding=None, delay=0) my_handler.setFormatter(log_formatter) my_handler.setLevel(logging.DEBUG) # logging.basicConfig(filename=log_file, # level=logging.DEBUG, filemode='w+', # format='%(asctime)s %(levelname)s %(module)s: %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p') log = logging.getLogger() log.setLevel(logging.DEBUG) log.addHandler(my_handler) ########################## establish connections to the servers dsl = KarmaSession(host="localhost", port=8000) dm = SchemaMatcher(host="localhost", port=8080) resampling_strategies = ["NoResampling", "ResampleToMean", "Bagging"] features = ["chardist-edit", "chardistonly", "fullchardist", "fullcity"] domains = None experiments = ["leave_one_out", "repeated_holdout"] ######################ignore unmapped attributes########################## print("Setting ignore_unknown: ", True) for experiment_type in experiments: models = create_models(dm, dsl, experiment_type, True, resampling_strategies, features) experiment = Experiment(models,
except ImportError as e: import sys sys.path.insert(0, '.') import serene from serene.matcher.core import SchemaMatcher import os.path # # First setup the example dataset path... # EXAMPLE_DATASET = os.path.join('../tests', 'resources', 'medium.csv') # connect to the server... # host and port are for now taken from the config.settings.py file dm = SchemaMatcher(host="localhost", port=8080) # # lists all the datasets... # print(dm.datasets) # [ # <DataSet(gr7y7ydf)>, # <DataSet(sdfg764t)>, # <DataSet(98793875)> # ] # # Show a summary of all datasets as a Pandas data frame # print(dm.datasets.summary)