def test_scenario1(self): """ Scenario: Successfully building a new dataset from an existing one Given I create a BigML dataset from "<data>" and store logs in "<output_dir>" And I check that the source has been created And I check that the dataset has been created And I create a new BigML dataset using the specs in JSON file "<new_fields>" and a model with "<model_fields>" And I check that the new dataset has been created And I check that the model has been created Then I check that the new dataset has field "<field>" Examples: |data |output_dir |new_fields | field | model_fields |../data/iris.csv | ./scenario_d_1 |../data/new_fields.json| outlier? |petal length,outlier?,species """ print self.test_scenario1.__doc__ examples = [[ 'data/iris.csv', 'scenario_d_1', 'data/new_fields.json', u'outlier?', u'petal length,outlier?,species' ]] for example in examples: print "\nTesting with:\n", example dataset_adv.i_create_dataset(self, data=example[0], output_dir=example[1]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self, suffix=None) dataset_adv.i_create_dataset_new_fields(self, json_file=example[2], model_fields=example[4]) test_pred.i_check_create_new_dataset(self) test_pred.i_check_create_model(self) dataset_adv.i_check_dataset_has_field(self, example[3])
def test_scenario6(self): """ Scenario 6: Successfully building remote test predictions from scratch to a dataset: Given I create BigML resources uploading train "<data>" file to test "<test>" remotely to a dataset with no CSV output and log resources in "<output_dir>" And I check that the source has been created And I check that the dataset has been created And I check that the model has been created And I check that the source has been created from the test file And I check that the dataset has been created from the test file And I check that the batch prediction has been created Then I check that the batch predictions dataset exists And no local CSV file is created Examples: | data | test | output_dir | | ../data/iris.csv | ../data/test_iris.csv | ./scenario_r5 | """ print self.test_scenario6.__doc__ examples = [['data/iris.csv', 'data/test_iris.csv', 'scenario_r5']] for example in examples: print "\nTesting with:\n", example test_pred.i_create_all_resources_batch_to_dataset( self, data=example[0], test=example[1], output_dir=example[2]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self, suffix=None) test_pred.i_check_create_model(self) test_batch_pred.i_check_create_test_source(self) test_batch_pred.i_check_create_test_dataset(self) test_batch_pred.i_check_create_batch_prediction(self) test_batch_pred.i_check_create_batch_predictions_dataset(self) anomaly_pred.i_check_no_local_CSV(self)
def test_scenario9(self): """ Scenario: Successfully building random fields analysis from dataset: Given I create BigML dataset uploading train "<data>" file in "<output>" And I check that the source has been created And I check that the dataset has been created And I create BigML random fields analysis with <kfold>-cross-validation improving "<metric>" And I check that the <kfold>-datasets have been created And I check that the <kfold>-random trees have been created And I check that all the <kfold>-fold cross-validations have been created Then the best random candidates number is "<random_candidates>", with "<metric>" of <metric_value> Examples: | data | output | kfold | metric | random_candidates | metric_value | | ../data/iris.csv | ./scenario_a_11/evaluation |2 | precision | 4 | 96.09% | """ print self.test_scenario9.__doc__ examples = [ ['data/iris.csv', 'scenario_a_11/evaluation', '2', 'precision', '4', '96.09%']] for example in examples: print "\nTesting with:\n", example test_pred.i_create_dataset(self, data=example[0], output=example[1]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self) test_pred.i_create_random_analysis(self, k_fold=example[2], metric=example[3]) test_pred.i_check_create_kfold_datasets(self, example[2]) test_pred.i_check_create_kfold_random_forest(self, example[2]) test_pred.i_check_create_all_kfold_cross_validations(self, example[2]) test_pred.i_check_random_candidates(self, example[4], example[3], example[5])
def test_scenario9(self): """ Scenario: Sucessfully deleting resources in a time range and with a tag: Given I create a BigML source from file "<data>" storing results in "<output_dir>" And I check that the source has been created And I store the source id as lower And I create a BigML source from file "<data>" with tag "<tag1>" storing results in "<output_dir>" And I check that the source exists And I create a BigML dataset from the source with tag "<tag1>" storing results in "<output_dir2>" And I check that the dataset exists And I delete the resources using --newer-than and --all-tag "<tag1>" storing results in "<output_dir3>" Then I check that the source doesn't exist And I check that the dataset doesn't exist Examples: | data | output_dir | tag1 | output_dir2 | output_dir3 | ../data/iris.csv | ./scenario_del_9 | my_tag1 | ./scenario_del_9_2 | ./scenario_del_9_3 """ print self.test_scenario9.__doc__ examples = [["data/iris.csv", "scenario_del_9", "my_tag1", "scenario_del_9_2", "scenario_del_9_3"]] for example in examples: print "\nTesting with:\n", example test_delete.i_create_source_from_file(self, data=example[0], output_dir=example[1]) test_pred.i_check_create_source(self) test_delete.i_store_source_id_as_bound(self, which="lower") test_delete.i_create_source_from_file_with_tag(self, data=example[0], tag=example[2], output_dir=example[3]) test_delete.i_check_source_exists(self) test_delete.i_store_source_id_as_bound(self, which="reference") test_delete.i_create_dataset_from_source_with_tag(self, tag=example[2], output_dir=example[4]) test_delete.i_check_dataset_exists(self) test_delete.i_delete_resources_newer_and_tag(self, tag=example[2], output_dir=example[1]) test_delete.i_check_source_does_not_exist(self, source_id=world.source_reference) test_delete.i_check_dataset_does_not_exist(self, dataset_id=world.dataset) test_delete.i_check_source_exists_by_id(self, source_id=world.source_lower)
def test_scenario3(self): """ Scenario: Successfully building evaluations from start: Given I create BigML resources uploading train "<data>" file to create model and log in "<output>" And I check that the source has been created And I check that the dataset has been created And I check that the model has been created And I evaluate "<test>" with proportional missing strategy And I check that the source has been created And I check that the dataset has been created And I check that the evaluation has been created Then the evaluation file is like "<json_evaluation_file>" Examples: | data | test | output | json_evaluation_file | | ../data/iris.csv | ../data/iris_nulls.csv | ./scenario_mis_3/evaluation | ./check_files/evaluation_iris_nulls.json | """ print self.test_scenario3.__doc__ examples = [ ['data/iris.csv', 'data/iris_nulls.csv', 'scenario_mis_3/evaluation', 'check_files/evaluation_iris_nulls.json']] for example in examples: print "\nTesting with:\n", example test_pred.i_create_all_resources_to_model(self, data=example[0], output=example[2]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self, suffix=None) test_pred.i_check_create_model(self) evaluation.i_create_proportional_to_evaluate(self, test=example[1]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self) test_pred.i_check_create_evaluation(self) evaluation.then_the_evaluation_file_is_like(self, example[3])
def test_scenario5(self): """ Scenario: Successfully building nodes threshold analysis from dataset file: Given I create BigML dataset uploading train "<data>" file in "<output>" And I check that the source has been created And I check that the dataset has been created And I create BigML nodes analysis from dataset file from <min_nodes> to <max_nodes> by <nodes_step> with <kfold>-cross-validation improving "<metric>" And I check that the <kfold>-datasets have been created And I check that the <kfold>-models have been created And I check that all the <kfold>-fold cross-validations have been created Then the best node threshold is "<node_threshold>", with "<metric>" of <metric_value> Examples: | data | output | min_nodes | max_nodes | nodes_step | kfold | metric | node_threshold | metric_value | ../data/iris.csv | ./scenario_a_4/evaluation | 3 | 14 | 2 |2 | precision | 9 | 94.71% """ print self.test_scenario5.__doc__ examples = [ ['data/iris.csv', 'scenario_a_4/evaluation', '3', '14', '2', '2', 'precision', '9', '94.71%']] for example in examples: print "\nTesting with:\n", example test_pred.i_create_dataset(self, data=example[0], output=example[1]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self) test_pred.i_create_nodes_analysis_from_dataset_file( self, min_nodes=example[2], max_nodes=example[3], nodes_step=example[4], k_fold=example[5], metric=example[6]) test_pred.i_check_create_kfold_datasets(self, example[5]) test_pred.i_check_create_kfold_models(self, example[5]) test_pred.i_check_create_all_kfold_cross_validations(self, example[5]) test_pred.i_check_node_threshold(self, example[7], example[6], example[8])
def test_scenario4(self): """ Scenario: Successfully building feature selection from filtered dataset setting objective: Given I create BigML dataset uploading train "<data>" file in "<output>" And I check that the source has been created And I check that the dataset has been created And I filter out field "<field>" from dataset and log to "<output_dir>" And I check that the new dataset has been created And I create BigML feature selection <kfold>-fold cross-validations for "<objective>" improving "<metric>" And I check that the <kfold>-datasets have been created And I check that the <kfold>-models have been created And I check that all the <kfold>-fold cross-validations have been created Then the best feature selection is "<selection>", with "<metric>" of <metric_value> Examples: | data | field | objective |output | output_dir | kfold | metric | selection | metric_value | | ../data/iris_2fd.csv | sepal length | species |./scenario_a_6/evaluation |./scenario_a_6 | 2 | recall | petal width | 100.00% | """ print self.test_scenario4.__doc__ examples = [ ['data/iris_2fd.csv', 'sepal length', 'species', 'scenario_a_6/evaluation', 'scenario_a_6', '2', 'recall', 'petal width', '100.00%']] for example in examples: print "\nTesting with:\n", example test_pred.i_create_dataset(self, data=example[0], output=example[3]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self) dataset.i_filter_field_from_dataset(self, field=example[1], output_dir=example[4]) test_pred.i_check_create_new_dataset(self) test_pred.i_create_kfold_cross_validation_objective(self, k_folds=example[5], objective=example[2], metric=example[6]) test_pred.i_check_create_kfold_datasets(self, example[5]) test_pred.i_check_create_kfold_models(self, example[5]) test_pred.i_check_create_all_kfold_cross_validations(self, example[5]) test_pred.i_check_feature_selection(self, example[7], example[6], example[8])
def test_scenario1(self): """ Scenario: Successfully exporting models with params in the available languages: Given I create BigML resources uploading train "<data>" file using "<source_attributes>" and log in "<output>" And I check that the source has been created And I check that the dataset has been created And I check that the model has been created And I export the model as a function in "<language>"to "<output>" Then the export file is like "<check_file>" Examples: | data | source_attributes | output | language | check_file | ../data/movies.csv | data/movies_source_attrs.json | ./scenario_exp_1/model | python | model_function.py """ print self.test_scenario1.__doc__ examples = [ ['data/movies.csv', 'data/movies_source_attrs.json', 'scenario_exp_1_a/model', 'python', 'check_files/export/model_function.py'], ['data/movies.csv', 'data/movies_source_attrs.json', 'scenario_exp_1_b/model', 'javascript', 'check_files/export/model_function.js'], ['data/movies.csv', 'data/movies_source_attrs.json', 'scenario_exp_1_c/model', 'r', 'check_files/export/model_function.R'], ['data/iris.csv', '', 'scenario_exp_1_d/model', 'tableau', 'check_files/export/model_function.tb'], ['data/iris.csv', '', 'scenario_exp_1_e/model', 'mysql', 'check_files/export/model_function.sql'], ['data/libros.csv', 'data/libros_source_attrs.json', 'scenario_exp_1_f/model', 'python', 'check_files/export/model_function_utf8.py'], ['data/libros.csv', 'data/libros_source_attrs.json', 'scenario_exp_1_g/model', 'r', 'check_files/export/model_function_utf8.R'], ['data/libros.csv', 'data/libros_source_attrs.json', 'scenario_exp_1_h/model', 'javascript', 'check_files/export/model_function_utf8.js']] for example in examples: print "\nTesting with:\n", example export.i_create_all_resources_to_model_with_source_attrs( \ self, data=example[0], source_attributes=example[1], output=example[2]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self) test_pred.i_check_create_model(self) 
export.i_export_model(self, language=example[3], output=example[2]) export.i_check_if_the_output_is_like_expected_file( \ self, language=example[3], expected_file=example[4])
def test_scenario1(self): """ Scenario: Successfully building test predictions with proportional missing strategy: Given I create BigML resources uploading train "<data>" file to test "<test>" with proportional missing strategy and log predictions in "<output>" And I check that the source has been created And I check that the dataset has been created And I check that the model has been created And I check that the predictions are ready Then the local prediction file is like "<predictions_file>" Examples: | data | test | output |predictions_file | | ../data/iris.csv | ../data/test_iris_nulls.csv | ./scenario_mis_1/predictions.csv | ./check_files/predictions_iris_nulls.csv | """ print self.test_scenario1.__doc__ examples = [[ 'data/iris.csv', 'data/test_iris_nulls.csv', 'scenario_mis_1/predictions.csv', 'check_files/predictions_iris_nulls.csv' ]] for example in examples: print "\nTesting with:\n", example test_pred.i_create_all_resources_proportional(self, data=example[0], test=example[1], output=example[2]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self, suffix=None) test_pred.i_check_create_model(self) test_pred.i_check_create_predictions(self) test_pred.i_check_predictions(self, example[3])
def test_scenario3(self): """ Scenario: Successfully building feature selection from dataset setting objective: Given I create BigML dataset uploading train "<data>" file in "<output>" And I check that the source has been created And I check that the dataset has been created And I create BigML feature selection <kfold>-fold cross-validations for "<objective>" improving "<metric>" And I check that the <kfold>-datasets have been created And I check that the <kfold>-models have been created And I check that all the <kfold>-fold cross-validations have been created Then the best feature selection is "<selection>", with "<metric>" of <metric_value> Examples: | data | objective |output | kfold | metric | selection | metric_value | | ../data/iris_2f.csv | 0 |./scenario_a_5/evaluation | 2 | r_squared| species | 0.352845 | | ../data/iris_2f.csv | 0 |./scenario_a_8/evaluation | 2 | mean_squared_error| species | 0.475200 | """ print self.test_scenario3.__doc__ examples = [ ['data/iris_2f.csv', '0', 'scenario_a_5/evaluation', '2', 'r_squared', 'species', '0.352845'], ['data/iris_2f.csv', '0', 'scenario_a_8/evaluation', '2', 'mean_squared_error', 'species', '0.475200']] for example in examples: print "\nTesting with:\n", example test_pred.i_create_dataset(self, data=example[0], output=example[2]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self) test_pred.i_create_kfold_cross_validation_objective(self, k_folds=example[3], objective=example[1], metric=example[4]) test_pred.i_check_create_kfold_datasets(self, example[3]) test_pred.i_check_create_kfold_models(self, example[3]) test_pred.i_check_create_all_kfold_cross_validations(self, example[3]) test_pred.i_check_feature_selection(self, example[5], example[4], example[6])
def setup_scenario1(self): """ Scenario: Successfully building test anomaly scores from scratch: Given I create BigML resources uploading train "<data>" file to create anomaly scores for "<test>" and log predictions in "<output>" And I check that the source has been created And I check that the dataset has been created And I check that the anomaly detector has been created And I check that the anomaly scores are ready Then the local anomaly scores file is like "<predictions_file>" Examples: | data | test | output |predictions_file | | ../data/tiny_kdd.csv | ../data/test_kdd.csv | ./scenario_an_1/anomaly_scores.csv | ./check_files/anomaly_scores_kdd.csv | """ print self.setup_scenario1.__doc__ examples = [[ 'data/tiny_kdd.csv', 'data/test_kdd.csv', 'scenario_an_1/anomaly_scores.csv', 'check_files/anomaly_scores_kdd.csv' ]] for example in examples: print "\nTesting with:\n", example test_anomaly.i_create_all_anomaly_resources(self, data=example[0], test=example[1], output=example[2]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self) test_anomaly.i_check_create_anomaly(self) test_anomaly.i_check_create_anomaly_scores(self) test_anomaly.i_check_anomaly_scores(self, example[3])
def test_scenario7(self): """ Scenario: Successfully building anomalous dataset test predictions from anomaly Given I create BigML anomaly detector from data <data> with options <options> and generate a new dataset of anomalies in "<output_dir>" And I check that the source has been created And I check that the dataset has been created And I check that the anomaly detector has been created Then I check that the new top anomalies dataset has been created And the top anomalies in the anomaly detector are <top_anomalies> And the forest size in the anomaly detector is <forest_size> And the number of records in the top anomalies dataset is <top_anomalies> Examples: | data | options | output_dir | top_anomalies | forest_size | | data/tiny_kdd.csv" | --top-anomalies 15 --forest-size 40 | scenario_an_7 | 15 | 40 | """ print self.test_scenario7.__doc__ examples = [[ 'data/tiny_kdd.csv', '--top-n 15 --forest-size 40 ', 'scenario_an_7', '15', '40' ]] for example in examples: print "\nTesting with:\n", example test_anomaly.i_create_anomaly_resources_with_options( self, example[0], example[1], output_dir=example[2]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self) test_anomaly.i_check_create_anomaly(self) test_pred.i_check_create_dataset(self, suffix='gen ') test_anomaly.i_check_top_anomalies(self, example[3]) test_anomaly.i_check_forest_size(self, example[4]) test_anomaly.i_check_dataset_lines_number(self, example[3])
def test_scenario01(self): """ Scenario: Successfully building deepnet test predictions from start with no headers: Given I create BigML deepnet resources uploading train "<data>" file with no headers to test "<test>" with no headers and log predictions in "<output>" And I check that the source has been created And I check that the dataset has been created And I check that the deepnet model has been created And I check that the predictions are ready Then the local prediction file is like "<predictions_file>" Examples: | data | test | output |predictions_file | | ../data/iris_nh.csv | ../data/test_iris_nh.csv | ./scenario1_dn_nh/predictions.csv | ./check_files/predictions_iris_dn.csv | """ print self.test_scenario01.__doc__ examples = [[ 'data/iris_nh.csv', 'data/test_iris_nh.csv', 'scenario1_dn_nh/predictions.csv', 'check_files/predictions_iris_dn.csv' ]] for example in examples: print "\nTesting with:\n", example dn_pred.i_create_all_dn_resources_with_no_headers( self, example[0], example[1], example[2]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self, suffix=None) dn_pred.i_check_create_dn_model(self) test_pred.i_check_create_predictions(self) test_pred.i_check_predictions(self, example[3])
def test_scenario1(self): """ Scenario: Successfully generating reports in Gazibit: Given I create BigML resources and share them uploading train "<data>" file to evaluate and log evaluation and reports in "<output>" And I check that the source has been created And I check that the dataset has been created and shared And I check that the model has been created and shared Then I check that the evaluation has been created and shared And I check that the Gazibit report has been created And I check that the Gazibit shared report has been created Examples: | data | output | | ../data/iris.csv | ./scenario_rpt_1/evaluation | """ print self.test_scenario1.__doc__ examples = [ ['data/iris.csv', 'scenario_rpt_1/evaluation']] for example in examples: print "\nTesting with:\n", example test_pred.i_create_all_resources_to_evaluate_and_report(self, data=example[0], output=example[1]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset_shared(self) test_pred.i_check_create_model_shared(self) test_pred.i_check_create_evaluation_shared(self) test_pred.i_check_gazibit_reports(self, shared=None) test_pred.i_check_gazibit_reports(self, shared='shared ')
def test_scenario5(self): """ Scenario: Successfully building nodes threshold analysis from dataset file: Given I create BigML dataset uploading train "<data>" file in "<output>" And I check that the source has been created And I check that the dataset has been created And I create BigML nodes analysis from dataset file from <min_nodes> to <max_nodes> by <nodes_step> with <kfold>-cross-validation improving "<metric>" And I check that the <kfold>-datasets have been created And I check that the <kfold>-models have been created And I check that all the <kfold>-fold cross-validations have been created Then the best node threshold is "<node_threshold>", with "<metric>" of <metric_value> Examples: | data | output | min_nodes | max_nodes | nodes_step | kfold | metric | node_threshold | metric_value | ../data/iris.csv | ./scenario_a_4/evaluation | 3 | 14 | 2 |2 | precision | 9 | 94.71% """ print self.test_scenario5.__doc__ examples = [ ['data/iris.csv', 'scenario_a_4/evaluation', '3', '14', '2', '2', 'precision', '9', '94.71%']] for example in examples: print "\nTesting with:\n", example test_pred.i_create_dataset(self, data=example[0], output=example[1]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self) test_pred.i_create_nodes_analysis_from_dataset_file( self, min_nodes=example[2], max_nodes=example[3], nodes_step=example[4], k_fold=example[5], metric=example[6]) test_pred.i_check_create_kfold_datasets(self, example[5]) test_pred.i_check_create_kfold_models(self, example[5]) test_pred.i_check_create_all_kfold_cross_validations(self, example[5]) test_pred.i_check_node_threshold(self, example[7], example[6], example[8])
def test_scenario7(self): """ Scenario: Successfully importing fields summary to a dataset Given I create a BigML dataset from "<data>" and store logs in "<output_dir>" And I check that the source has been created And I check that the dataset has been created And I import fields attributes in file "<summary_file>" to dataset Then the field "<field_id>" has "<attribute>" equal to "<attribute_value>" Examples: |data |output_dir | summary_file | field_id | attribute | attribute_value |../data/iris.csv | ./scenario_d_7 | fields_summary_modified.csv | 000000 | name | sepal_length """ print self.test_scenario7.__doc__ examples = [[ 'data/iris.csv', 'scenario_d_7', 'data/fields_summary_modified.csv', '000000', 'name', 'sepal_length' ]] for example in examples: print "\nTesting with:\n", example dataset_adv.i_create_dataset(self, data=example[0], output_dir=example[1]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self, suffix=None) dataset_adv.i_import_fields(self, summary=example[2]) dataset_adv.field_attribute_value(self, field=example[3], attribute=example[4], attribute_value=example[5])
def test_scenario6(self): """ Scenario: Successfully building feature selection from dataset excluding features: Given I create BigML dataset uploading train "<data>" file in "<output>" And I check that the source has been created And I check that the dataset has been created And I create BigML feature selection <kfold>-fold cross-validations excluding "<features>" with separator "<args_separator>" improving "<metric>" And I check that the <kfold>-datasets have been created And I check that the <kfold>-models have been created And I check that all the <kfold>-fold cross-validations have been created Then the best feature selection is "<selection>", with "<metric>" of <metric_value> Examples: | data | output | kfold | features | args_separator | metric | selection | metric_value | | ../data/iris.csv | ./scenario_a_7/evaluation | 2 | petal length!sepal width | ! | accuracy | petal width | 95.33% | """ print self.test_scenario6.__doc__ examples = [ ['data/iris.csv', 'scenario_a_7/evaluation', '2', 'petal length!sepal width', '!', 'accuracy', 'petal width', '95.33%']] for example in examples: print "\nTesting with:\n", example test_pred.i_create_dataset(self, data=example[0], output=example[1]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self) test_pred.i_create_kfold_cross_validation_separator_metric_no_fields(self, k_folds=example[2], features=example[3], args_separator=example[4], metric=example[5]) test_pred.i_check_create_kfold_datasets(self, example[2]) test_pred.i_check_create_kfold_models(self, example[2]) test_pred.i_check_create_all_kfold_cross_validations(self, example[2]) test_pred.i_check_feature_selection(self, example[6], example[5], example[7])
def test_scenario2(self): """ Scenario: Successfully building remote test centroid predictions from scratch to dataset: Given I create BigML resources uploading train "<data>" file to find centroids for "<test>" remotely to dataset with no CSV and log resources in "<output_dir>" And I check that the source has been created And I check that the dataset has been created And I check that the cluster has been created And I check that the source has been created from the test file And I check that the dataset has been created from the test file And I check that the batch centroid prediction has been created Then I check that the batch centroids dataset exists And no local CSV file is created Examples: | data | test | output_dir | | ../data/grades.csv | ../data/test_grades.csv | ./scenario_cb_2 | """ print self.test_scenario2.__doc__ examples = [ ['data/grades.csv', 'data/test_grades.csv', 'scenario_cb_2']] for example in examples: print "\nTesting with:\n", example test_cluster.i_create_all_cluster_resources_to_dataset(self, data=example[0], test=example[1], output_dir=example[2]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self, suffix=None) test_pred.i_check_create_cluster(self) test_pred.i_check_create_test_source(self) test_pred.i_check_create_test_dataset(self) batch_pred.i_check_create_batch_centroid(self) batch_pred.i_check_create_batch_centroids_dataset(self) test_anomaly.i_check_no_local_CSV(self)
def test_scenario7(self): """ Scenario: Successfully building feature selection for a category from dataset: Given I create BigML dataset uploading train "<data>" file with attributes "<attributes>" in "<output>" And I check that the source has been created And I check that the dataset has been created And I create BigML feature selection <kfold>-fold cross-validations improving "<metric>" for category "<category>" And I check that the <kfold>-datasets have been created And I check that the <kfold>-models have been created And I check that all the <kfold>-fold cross-validations have been created Then the best feature selection is "<selection>", with "<metric>" of <metric_value> Examples: | data | attributes | output | kfold | metric | category | selection | metric_value | ../data/spam.csv | ../data/spam_attributes.json |./scenario_a_9/evaluation | 2 | recall | spam | Message | 58.69% """ print self.test_scenario7.__doc__ examples = [ ['data/spam.csv', 'data/spam_attributes.json', 'scenario_a_9/evaluation', '2', 'recall', 'spam', 'Message', '58.69%']] for example in examples: print "\nTesting with:\n", example test_pred.i_create_dataset_with_attributes(self, data=example[0], attributes=example[1], output=example[2]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self) test_pred.i_create_kfold_cross_validation_metric_category(self, k_folds=example[3], metric=example[4], category=example[5]) test_pred.i_check_create_kfold_datasets(self, example[3]) test_pred.i_check_create_kfold_models(self, example[3]) test_pred.i_check_create_all_kfold_cross_validations(self, example[3]) test_pred.i_check_feature_selection(self, example[6], example[4], example[7])
def test_scenario3(self): """ Scenario: Successfully building feature selection from dataset setting objective: Given I create BigML dataset uploading train "<data>" file in "<output>" And I check that the source has been created And I check that the dataset has been created And I create BigML feature selection <kfold>-fold cross-validations for "<objective>" improving "<metric>" And I check that the <kfold>-datasets have been created And I check that the <kfold>-models have been created And I check that all the <kfold>-fold cross-validations have been created Then the best feature selection is "<selection>", with "<metric>" of <metric_value> Examples: | data | objective |output | kfold | metric | selection | metric_value | | ../data/iris_2f.csv | 0 |./scenario_a_5/evaluation | 2 | r_squared| species | 0.352845 | | ../data/iris_2f.csv | 0 |./scenario_a_8/evaluation | 2 | mean_squared_error| species | 0.475200 | """ print self.test_scenario3.__doc__ examples = [ ['data/iris_2f.csv', '0', 'scenario_a_5/evaluation', '2', 'r_squared', 'species', '0.352845'], ['data/iris_2f.csv', '0', 'scenario_a_8/evaluation', '2', 'mean_squared_error', 'species', '0.475200']] for example in examples: print "\nTesting with:\n", example test_pred.i_create_dataset(self, data=example[0], output=example[2]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self) test_pred.i_create_kfold_cross_validation_objective(self, k_folds=example[3], objective=example[1], metric=example[4]) test_pred.i_check_create_kfold_datasets(self, example[3]) test_pred.i_check_create_kfold_models(self, example[3]) test_pred.i_check_create_all_kfold_cross_validations(self, example[3]) test_pred.i_check_feature_selection(self, example[5], example[4], example[6])
def test_scenario8(self): """ Scenario: Successfully building a new dataset from an existing one and analyzing it Given I create a BigML dataset from "<data>" and store logs in "<output_dir>" And I check that the source has been created And I check that the dataset has been created And I create a new BigML dataset using the specs in JSON file "<new_fields>" and a model with "<model_fields>" And I check that the new dataset has been created And I check that the model has been created And I create BigML nodes analysis from <min_nodes> to <max_nodes> by <nodes_step> with <kfold>-cross-validation improving "<metric>" And I check that the <kfold>-datasets have been created And I check that the <kfold>-models have been created And I check that all the <kfold>-fold cross-validations have been created Then the best node threshold is "<node_threshold>", with "<metric>" of <metric_value> Examples: |data |output_dir |new_fields | field | model_fields| min_nodes | max_nodes | nodes_step | kfold | metric | node_threshold | metric_value | |../data/iris.csv | ./scenario_a_10 |../data/new_fields.json| outlier? 
|petal length,outlier?,species| 3 | 14 | 2 |2 | precision | 9 | 94.71% | """ print self.test_scenario8.__doc__ examples = [ ['data/iris.csv', 'scenario_a_10', 'data/new_fields2.json', u'outlier?', u'outlier?,species', '3', '14', '2', '2', 'precision', '5', '98.21%']] for example in examples: print "\nTesting with:\n", example dataset_adv.i_create_dataset(self, data=example[0], output_dir=example[1]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self, suffix=None) dataset_adv.i_create_dataset_new_fields(self, json_file=example[2], model_fields=example[4]) test_pred.i_check_create_new_dataset(self) test_pred.i_check_create_model(self) test_pred.i_create_nodes_analysis(self, min_nodes=example[5], max_nodes=example[6], nodes_step=example[7], k_fold=example[8], metric=example[9]) test_pred.i_check_create_kfold_datasets(self, example[8]) test_pred.i_check_create_kfold_models(self, example[8]) test_pred.i_check_create_all_kfold_cross_validations(self, example[8]) test_pred.i_check_node_threshold(self, example[10], example[9], example[11])
def test_scenario7(self): """ Scenario: Successfully building feature selection for a category from dataset: Given I create BigML dataset uploading train "<data>" file with attributes "<attributes>" in "<output>" And I check that the source has been created And I check that the dataset has been created And I create BigML feature selection <kfold>-fold cross-validations improving "<metric>" for category "<category>" And I check that the <kfold>-datasets have been created And I check that the <kfold>-models have been created And I check that all the <kfold>-fold cross-validations have been created Then the best feature selection is "<selection>", with "<metric>" of <metric_value> Examples: | data | attributes | output | kfold | metric | category | selection | metric_value | ../data/spam.csv | ../data/spam_attributes.json |./scenario_a_9/evaluation | 2 | recall | spam | Message | 61.24% """ print self.test_scenario7.__doc__ examples = [ ['data/spam.csv', 'data/spam_attributes.json', 'scenario_a_9/evaluation', '2', 'recall', 'spam', 'Message', '61.24%']] for example in examples: print "\nTesting with:\n", example test_pred.i_create_dataset_with_attributes(self, data=example[0], attributes=example[1], output=example[2]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self) test_pred.i_create_kfold_cross_validation_metric_category(self, k_folds=example[3], metric=example[4], category=example[5]) test_pred.i_check_create_kfold_datasets(self, example[3]) test_pred.i_check_create_kfold_models(self, example[3]) test_pred.i_check_create_all_kfold_cross_validations(self, example[3]) test_pred.i_check_feature_selection(self, example[6], example[4], example[7])
def test_scenario9(self): """ Scenario: Successfully building random fields analysis from dataset: Given I create BigML dataset uploading train "<data>" file in "<output>" And I check that the source has been created And I check that the dataset has been created And I create BigML random fields analysis with <kfold>-cross-validation improving "<metric>" And I check that the <kfold>-datasets have been created And I check that the <kfold>-random trees have been created And I check that all the <kfold>-fold cross-validations have been created Then the best random candidates number is "<random_candidates>", with "<metric>" of <metric_value> Examples: | data | output | kfold | metric | random_candidates | metric_value | | ../data/iris.csv | ./scenario_a_11/evaluation |2 | precision | 4 | 96.09% | """ print self.test_scenario9.__doc__ examples = [ ['data/iris.csv', 'scenario_a_11/evaluation', '2', 'precision', '4', '96.09%']] for example in examples: print "\nTesting with:\n", example test_pred.i_create_dataset(self, data=example[0], output=example[1]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self) test_pred.i_create_random_analysis(self, k_fold=example[2], metric=example[3]) test_pred.i_check_create_kfold_datasets(self, example[2]) test_pred.i_check_create_kfold_random_forest(self, example[2]) test_pred.i_check_create_all_kfold_cross_validations(self, example[2]) test_pred.i_check_random_candidates(self, example[4], example[3], example[5])
def test_scenario2(self): """ Scenario: Successfully building feature selection from dataset: Given I create BigML dataset uploading train "<data>" file in "<output>" And I check that the source has been created And I check that the dataset has been created And I create BigML feature selection <kfold>-fold cross-validations improving "<metric>" And I check that the <kfold>-datasets have been created And I check that the <kfold>-models have been created And I check that all the <kfold>-fold cross-validations have been created Then the best feature selection is "<selection>", with "<metric>" of <metric_value> And I generate a report from the output directory And a symlink file is generated in the reports directory Examples: | data | output | kfold | metric | selection | metric_value | ../data/iris_2f.csv | ./scenario_a_2/evaluation | 2 | accuracy | petal width | 100.00% | ../data/iris_2f.csv | ./scenario_a_3/evaluation | 2 | phi | petal width | 1 """ print self.test_scenario2.__doc__ examples = [ ['data/iris_2f.csv', 'scenario_a_2/evaluation', '2', 'accuracy', 'petal width', '100.00%'], ['data/iris_2f.csv', 'scenario_a_3/evaluation', '2', 'phi', 'petal width', '1']] for example in examples: print "\nTesting with:\n", example test_pred.i_create_dataset(self, data=example[0], output=example[1]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self) test_pred.i_create_kfold_cross_validation_metric(self, k_folds=example[2], metric=example[3]) test_pred.i_check_create_kfold_datasets(self, example[2]) test_pred.i_check_create_kfold_models(self, example[2]) test_pred.i_check_create_all_kfold_cross_validations(self, example[2]) test_pred.i_check_feature_selection(self, example[4], example[3], example[5]) test_pred.i_generate_report(self) test_pred.is_symlink(self)
def test_scenario11(self): """ Scenario: Successfully building feature selection from dataset setting objective: Given I create BigML dataset uploading train "<data>" file in "<output>" And I check that the source has been created And I check that the dataset has been created And I create BigML feature selection <kfold>-fold cross-validation with options "<options>" And I check that the <kfold>-datasets have been created And I check that the <kfold>-ensembles have been created And I check that all the <kfold>-fold cross-validations have been created Then the predictions file "<predictions_file>" is like "<estimated_file>" Examples: | data |output | kfold | options | predictions_file | estimated_file | ../data/iris.csv |./scenario_a_14/evaluation | 2 | --exclude-features="species,petal length" --predictions.csv --number-of-models 2| scenario_a_14/kfold2_pred/predictions.csv | check_files/analyze_predictions_iris_e.csv """ print self.test_scenario11.__doc__ examples = [ ['data/iris.csv', 'scenario_a_14/evaluation', '2', ' --exclude-features="petal length,sepal length" --predictions-csv --number-of-models 2','scenario_a_14/test/kfold2_pred/predictions.csv', 'check_files/analyze_predictions_iris_e.csv']] for example in examples: print "\nTesting with:\n", example test_pred.i_create_dataset(self, data=example[0], output=example[1]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self) test_pred.i_create_kfold_cross_validation_options(self, k_folds=example[2], options=example[3]) test_pred.i_check_create_kfold_datasets(self, example[2]) test_pred.i_check_create_kfold_ensembles(self, example[2]) test_pred.i_check_create_all_kfold_cross_validations(self, example[2]) test_pred.i_check_predictions_file(self, example[4], example[5])
def test_scenario4(self): """ Scenario: Successfully building a multi-dataset Given I create a BigML dataset from "<data>" and store logs in "<output_dir>" And I check that the source has been created And I check that the dataset has been created And I create a BigML dataset from previous source and store logs in "<output_dir>" And I check that the dataset has been created And I create a multi-dataset from the datasets file and store logs in "<output_dir2>" And I check that the multi-dataset has been created Then I check that the multi-dataset's origin are the datasets in "<output_dir>" Examples: |data |output_dir |output_dir2 | |../data/iris.csv | ./scenario_d_4 | ./scenario_d_4a| """ print self.test_scenario4.__doc__ examples = [['data/iris.csv', 'scenario_d_4', 'scenario_d_4a']] for example in examples: print "\nTesting with:\n", example dataset_adv.i_create_dataset(self, data=example[0], output_dir=example[1]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self, suffix=None) dataset_adv.i_create_dataset_from_source(self, output_dir=example[1]) test_pred.i_check_create_dataset(self, suffix=None) dataset_adv.i_create_multi_dataset(self, example[2]) dataset_adv.i_check_create_multi_dataset(self) dataset_adv.i_check_multi_dataset_origin(self, output_dir=example[1])
def test_scenario1(self): """ Scenario: Successfully building k-fold cross-validation from dataset: Given I create BigML dataset uploading train "<data>" file in "<output>" And I check that the source has been created And I check that the dataset has been created And I create BigML <kfold>-fold cross-validation And I check that the <kfold>-datasets have been created And I check that the <kfold>-models have been created And I check that the <kfold>-fold cross-validation has been created Then the evaluation file is like "<json_evaluation_file>" Examples: | data | output | kfold | json_evaluation_file | | ../data/iris.csv | ./scenario_a_1/evaluation | 2 | ./check_files/evaluation_kfold.json | """ print self.test_scenario1.__doc__ examples = [ ['data/iris.csv', 'scenario_a_1/evaluation', '2', 'check_files/evaluation_kfold.json']] for example in examples: print "\nTesting with:\n", example test_pred.i_create_dataset(self, data=example[0], output=example[1]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self) test_pred.i_create_kfold_cross_validation(self, k_folds=example[2]) test_pred.i_check_create_kfold_datasets(self, example[2]) test_pred.i_check_create_kfold_models(self, example[2]) test_pred.i_check_create_kfold_cross_validation(self, example[2]) evaluation.then_the_evaluation_file_is_like(self, example[3])
def setup_scenario1(self): """ Scenario: Successfully building multi-label test predictions from start: Given I create BigML multi-label resources tagged as "<tag>" with "<label_separator>" label separator and <number_of_labels> labels uploading train "<data>" file with "<training_separator>" field separator and <number_of_models> models ensembles to test "<test>" and log predictions in "<output>" And I check that the source has been created And I check that the dataset has been created And I check that the models in the ensembles have been created Then I check that the predictions are ready Examples: |tag |label_separator |number_of_labels | data |training_separator |number_of_models | test | output | |my_multilabel_1|:|7| ../data/multilabel.csv |,|10| ../data/test_multilabel.csv | ./scenario_mle_1/predictions.csv """ print self.setup_scenario1.__doc__ examples = [[ 'my_multilabel_1%s' % PY3, ':', '7', 'data/multilabel.csv', ',', '10', 'data/test_multilabel.csv', 'scenario_mle_1/predictions.csv' ]] for example in examples: print "\nTesting with:\n", example ml_pred.i_create_all_ml_resources_and_ensembles( self, tag=example[0], label_separator=example[1], number_of_labels=example[2], data=example[3], training_separator=example[4], number_of_models=example[5], test=example[6], output=example[7]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self) test_pred.i_check_create_models_in_ensembles(self, in_ensemble=True) test_pred.i_check_create_predictions(self)
def test_scenario2(self): """ Scenario: Successfully building feature selection from dataset: Given I create BigML dataset uploading train "<data>" file in "<output>" And I check that the source has been created And I check that the dataset has been created And I create BigML feature selection <kfold>-fold cross-validations improving "<metric>" And I check that the <kfold>-datasets have been created And I check that the <kfold>-models have been created And I check that all the <kfold>-fold cross-validations have been created Then the best feature selection is "<selection>", with "<metric>" of <metric_value> And I generate a report from the output directory And a symlink file is generated in the reports directory Examples: | data | output | kfold | metric | selection | metric_value | ../data/iris_2f.csv | ./scenario_a_2/evaluation | 2 | accuracy | petal width | 100.00% | ../data/iris_2f.csv | ./scenario_a_3/evaluation | 2 | phi | petal width | 1 """ print self.test_scenario2.__doc__ examples = [ ['data/iris_2f.csv', 'scenario_a_2/evaluation', '2', 'accuracy', 'petal width', '100.00%'], ['data/iris_2f.csv', 'scenario_a_3/evaluation', '2', 'phi', 'petal width', '1']] for example in examples: print "\nTesting with:\n", example test_pred.i_create_dataset(self, data=example[0], output=example[1]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self) test_pred.i_create_kfold_cross_validation_metric(self, k_folds=example[2], metric=example[3]) test_pred.i_check_create_kfold_datasets(self, example[2]) test_pred.i_check_create_kfold_models(self, example[2]) test_pred.i_check_create_all_kfold_cross_validations(self, example[2]) test_pred.i_check_feature_selection(self, example[4], example[3], example[5]) test_pred.i_generate_report(self) test_pred.is_symlink(self)
def test_scenario2(self): """ Scenario: Successfully associating resources to an existing project: Given I create a BigML source uploading train "<data>" file and associate it to a new project named "<project>" storing results in "<output_dir>" And I check that the project has been created And I check that the source has been created And I create a BigML source uploading train "<data>" file and associate it to the last created project id storing results in "<output_dir2>" Then the source is associated to the project Examples: | data | project | output_dir | output_dir2 | ../data/iris.csv | My new project | ./scenario_p_2 | ./scenario_p_2_1 """ print self.test_scenario2.__doc__ examples = [[ 'data/iris.csv', 'My new project', 'scenario_p_2', 'scenario_p_2_1' ]] for example in examples: print "\nTesting with:\n", example test_project.i_create_source_with_project(self, data=example[0], project=example[1], output_dir=example[2]) test_project.i_check_create_project(self) test_pred.i_check_create_source(self) test_project.i_create_source_with_project_id(self, data=example[0], output_dir=example[3]) test_project.check_source_in_project(self)
def test_scenario11(self): """ Scenario: Successfully building feature selection from dataset setting objective: Given I create BigML dataset uploading train "<data>" file in "<output>" And I check that the source has been created And I check that the dataset has been created And I create BigML feature selection <kfold>-fold cross-validation with options "<options>" And I check that the <kfold>-datasets have been created And I check that the <kfold>-ensembles have been created And I check that all the <kfold>-fold cross-validations have been created Then the predictions file "<predictions_file>" is like "<estimated_file>" Examples: | data |output | kfold | options | predictions_file | estimated_file | ../data/iris.csv |./scenario_a_14/evaluation | 2 | --exclude-features="species,petal length" --predictions.csv --number-of-models 2| scenario_a_14/kfold2_pred/predictions.csv | check_files/analyze_predictions_iris_e.csv """ print self.test_scenario11.__doc__ examples = [ ['data/iris.csv', 'scenario_a_14/evaluation', '2', ' --exclude-features="petal length,sepal length" --predictions-csv --number-of-models 2','scenario_a_14/test/kfold2_pred/predictions.csv', 'check_files/analyze_predictions_iris_e.csv']] for example in examples: print "\nTesting with:\n", example test_pred.i_create_dataset(self, data=example[0], output=example[1]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self) test_pred.i_create_kfold_cross_validation_options(self, k_folds=example[2], options=example[3]) test_pred.i_check_create_kfold_datasets(self, example[2]) test_pred.i_check_create_kfold_ensembles(self, example[2]) test_pred.i_check_create_all_kfold_cross_validations(self, example[2]) test_pred.i_check_predictions_file(self, example[4], example[5])
def test_scenario6(self): """ Scenario: Successfully exporting fields summary from a dataset Given I create a BigML dataset from "<data>" and a summary file "<summary_file>" for its fields and store logs in "<output_dir>" And I check that the source has been created And I check that the dataset has been created Then the expected field "<expected_file>" is like "<summary_file>" Examples: |data |output_dir | summary_file | expected_file |../data/iris.csv | ./scenario_d_6 | fields_summary.csv | check_files/fields_summary.csv """ print self.test_scenario6.__doc__ examples = [[ 'data/iris.csv', 'scenario_d_6', 'fields_summary.csv', 'check_files/fields_summary.csv' ]] for example in examples: print "\nTesting with:\n", example dataset_adv.i_create_dataset_with_summary(self, data=example[0], summary_file=example[2], output_dir=example[1]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self, suffix=None) dataset_adv.i_files_equal(self, example[2], example[3])
def test_scenario5(self): """ Scenario: Successfully building evaluation from model and test file with data map Given I have previously executed "<scenario>" or reproduce it with arguments <kwargs> And I create BigML resources using test file "<test>" and a fields map "<fields_map>" to evaluate a model and log evaluation in "<output>" And I check that the source has been created And I check that the dataset has been created And I check that the evaluation has been created Then the evaluation file is like "<json_evaluation_file>" Examples: |scenario | kwargs | test | fields_map | output | json_evaluation_file | | scenario_e1| {"data": "../data/iris.csv", "output": "./scenario_e1/predictions.csv"} | ../data/iris_permuted.csv | ../data/fields_map.csv | ./scenario_e7/evaluation | ./check_files/evaluation_iris2.json | """ print self.test_scenario5.__doc__ examples = [[ 'scenario_e1', '{"data": "data/iris.csv", "output": "scenario_e1/predictions.csv"}', 'data/iris_permuted.csv', 'data/fields_map.csv', 'scenario_e7/evaluation', 'check_files/evaluation_iris2.json' ]] for example in examples: print "\nTesting with:\n", example test_pred.i_have_previous_scenario_or_reproduce_it( self, example[0], example[1]) evaluation.i_create_all_resources_to_evaluate_with_model_and_map( self, data=example[2], fields_map=example[3], output=example[4]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self, suffix=None) test_pred.i_check_create_evaluation(self) evaluation.then_the_evaluation_file_is_like(self, example[5])
def test_scenario11(self): """ Scenario: Successfully building association from a sampled dataset Given I create a BigML dataset from "<data>" and store logs in "<output_dir>" And I check that the source has been created And I check that the dataset has been created And I create a BigML association with params "<params>" from dataset in "<output_dir>" And I check that the association has been created And the association params are "<params_json>" Examples: |data |output_dir | params | params_json |../data/iris.csv | ./scenario_d_11 | "--sample-rate 0.2 --replacement" | {"sample-rate": 0.2, "replacement": true} """ print self.test_scenario11.__doc__ examples = [[ 'data/iris.csv', 'scenario_d_11', '--sample-rate 0.2 --replacement', '{"sample_rate": 0.2, "replacement": true}' ]] for example in examples: print "\nTesting with:\n", example dataset_adv.i_create_dataset(self, data=example[0], output_dir=example[1]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self, suffix=None) dataset_adv.i_create_association_with_params_from_dataset( \ self, params=example[2], output_dir=example[1]) test_pred.i_check_create_association(self) dataset_adv.i_check_association_params(self, params_json=example[3])
def setup_scenario1(self): """ Scenario: Successfully building evaluations from start: Given I create BigML resources uploading train "<data>" file to evaluate and log evaluation in "<output>" And I check that the source has been created And I check that the dataset has been created And I check that the model has been created And I check that the evaluation has been created Then the evaluation file is like "<json_evaluation_file>" Examples: | data | output | json_evaluation_file | | ../data/iris.csv | ./scenario_e1/evaluation | ./check_files/evaluation_iris.json | """ print self.setup_scenario1.__doc__ examples = [[ 'data/iris.csv', 'scenario_e1/evaluation', 'check_files/evaluation_iris.json' ]] for example in examples: print "\nTesting with:\n", example test_pred.i_create_all_resources_to_evaluate(self, data=example[0], output=example[1]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self, suffix=None) test_pred.i_check_create_model(self) test_pred.i_check_create_evaluation(self) evaluation.then_the_evaluation_file_is_like(self, example[2])
def test_scenario1(self): """ Scenario: Successfully building test centroid predictions from scratch: Given I create BigML resources uploading train "<data>" file to find centroids for "<test>" remotely with mapping file "<fields_map>" and log predictions in "<output>" And I check that the source has been created And I check that the dataset has been created And I check that the cluster has been created And I check that the source has been created from the test file And I check that the dataset has been created from the test file And I check that the batch centroid prediction has been created And I check that the centroids are ready Then the local centroids file is like "<predictions_file>" Examples: | data | test | fields_map | output |predictions_file | | ../data/grades.csv | ../data/grades_perm.csv | ../data/grades_fields_map_perm.csv | ./scenario_cb_1_r/centroids.csv | ./check_files/centroids_grades.csv | """ print self.test_scenario1.__doc__ examples = [ ['data/grades.csv', 'data/grades_perm.csv', 'data/grades_fields_map_perm.csv', 'scenario_cb_1_r/centroids.csv', 'check_files/centroids_grades.csv']] for example in examples: print "\nTesting with:\n", example test_cluster.i_create_all_cluster_resources_with_mapping(self, data=example[0], test=example[1], fields_map=example[2], output=example[3]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self, suffix=None) test_pred.i_check_create_cluster(self) test_pred.i_check_create_test_source(self) test_pred.i_check_create_test_dataset(self) batch_pred.i_check_create_batch_centroid(self) test_cluster.i_check_create_centroids(self) test_pred.i_check_predictions(self, example[4])
def test_scenario1(self): """ Scenario: Successfully building test predictions from dataset specifying objective field and model fields Given I create a BigML dataset from "<data>" and store logs in "<output_dir>" And I check that the source has been created And I check that the dataset has been created And I create BigML resources using dataset, objective field <objective> and model fields <fields> to test "<test>" and log predictions in "<output>" And I check that the model has been created And I check that the predictions are ready Then the local prediction file is like "<predictions_file>" Examples: |data | output_dir | test | output |predictions_file | objective | fields | | ../data/iris_2fb.csv| ./scénario1 | ../data/test_iris2fb.csv | ./scénario1/predictions.csv | ./check_files/predictions_iris_2fb.csv | spécies | "pétal width" | """ print self.test_scenario1.__doc__ examples = [ ['data/iris_2fb.csv', u'scénario1', 'data/test_iris2fb.csv', u'scénario1/predictions.csv', 'check_files/predictions_iris_2fb.csv', u'spécies', u'"pétal width"']] for example in examples: print "\nTesting with:\n", example dataset_adv.i_create_dataset(self, data=example[0], output_dir=example[1]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self, suffix=None) test_pred.i_create_resources_from_dataset_objective_model(self, objective=example[5], fields=example[6], test=example[2], output=example[3]) test_pred.i_check_create_model(self) test_pred.i_check_create_predictions(self) test_pred.i_check_predictions(self, example[4])
def test_scenario1(self): """ Scenario: Successfully building test centroids from scratch: Given I create BigML resources uploading train "<data>" file to create centroids for "<test>" and log predictions in "<output>" And I check that the source has been created And I check that the dataset has been created And I check that the cluster has been created And I check that the centroids are ready Then the local centroids file is like "<predictions_file>" Examples: | data | test | output |predictions_file | | ../data/grades.csv | ../data/grades.csv | ./scenario_c_1_r/centroids.csv | ./check_files/centroids_grades.csv | | ../data/diabetes.csv | ../data/diabetes.csv | ./scenario_c_1/centroids.csv | ./check_files/centroids_diabetes.csv | """ print self.test_scenario1.__doc__ examples = [ ['data/grades.csv', 'data/grades.csv', 'scenario_c_1_r/centroids.csv', 'check_files/centroids_grades.csv'], ['data/diabetes.csv', 'data/diabetes.csv', 'scenario_c_1/centroids.csv', 'check_files/centroids_diabetes.csv']] for example in examples: print "\nTesting with:\n", example test_cluster.i_create_all_cluster_resources(self, data=example[0], test=example[1], output=example[2]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self, suffix=None) test_pred.i_check_create_cluster(self) test_cluster.i_check_create_centroids(self) test_pred.i_check_predictions(self, example[3])
def test_scenario1(self): """ Scenario: Successfully building test predictions with missing-splits model: Given I create BigML resources uploading train "<data>" file to test "<test>" with a missing-splits model and log predictions in "<output>" And I check that the source has been created And I check that the dataset has been created And I check that the model has been created And I check that the predictions are ready Then the local prediction file is like "<predictions_file>" Examples: | data | test | output |predictions_file | | ../data/iris_missing.csv | ../data/test_iris_missing.csv | ./scenario_mspl_1/predictions.csv | ./check_files/predictions_iris_missing.csv | """ print self.test_scenario1.__doc__ examples = [ ['data/iris_missing.csv', 'data/test_iris_missing.csv', 'scenario_mspl_1/predictions.csv', 'check_files/predictions_iris_missing.csv']] for example in examples: print "\nTesting with:\n", example test_pred.i_create_all_resources_missing_splits(self, data=example[0], test=example[1], output=example[2]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self, suffix=None) test_pred.i_check_create_model(self) test_pred.i_check_create_predictions(self) test_pred.i_check_predictions(self, example[3])
def test_scenario2(self): """ Given I create BigML resources uploading train "<data>" file to test "<test>" remotely with proportional missing strategy and log predictions in "<output>" And I check that the source has been created And I check that the dataset has been created And I check that the model has been created And I check that the source has been created from the test file And I check that the dataset has been created from the test file And I check that the batch prediction has been created And I check that the predictions are ready Then the local prediction file is like "<predictions_file>" Examples: | data | test | output |predictions_file | | ../data/iris.csv | ../data/test_iris_nulls.csv | ./scenario_mis_2/predictions.csv | ./check_files/predictions_iris_nulls.csv """ print self.test_scenario2.__doc__ examples = [ ['data/iris.csv', 'data/test_iris_nulls.csv', 'scenario_mis_2/predictions.csv', 'check_files/predictions_iris_nulls.csv']] for example in examples: print "\nTesting with:\n", example test_pred.i_create_all_resources_remote_proportional(self, data=example[0], test=example[1], output=example[2]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self, suffix=None) test_pred.i_check_create_model(self) test_pred.i_check_create_test_source(self) test_pred.i_check_create_test_dataset(self) test_pred.i_check_create_batch_prediction(self) test_pred.i_check_create_predictions(self) test_pred.i_check_predictions(self, example[3])
def test_scenario7(self): """ Scenario: Successfully building anomalous dataset test predictions from anomaly Given I create BigML anomaly detector from data <data> with options <options> and generate a new dataset of anomalies in "<output_dir>" And I check that the source has been created And I check that the dataset has been created And I check that the anomaly detector has been created Then I check that the new top anomalies dataset has been created And the top anomalies in the anomaly detector are <top_anomalies> And the forest size in the anomaly detector is <forest_size> And the number of records in the top anomalies dataset is <top_anomalies> Examples: | data | options | output_dir | top_anomalies | forest_size | | data/tiny_kdd.csv" | --top-anomalies 15 --forest-size 40 | scenario_an_7 | 15 | 40 | """ print self.test_scenario7.__doc__ examples = [ ['data/tiny_kdd.csv', '--top-n 15 --forest-size 40 ', 'scenario_an_7', '15', '40']] for example in examples: print "\nTesting with:\n", example test_anomaly.i_create_anomaly_resources_with_options(self, example[0], example[1], output_dir=example[2]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self) test_anomaly.i_check_create_anomaly(self) test_pred.i_check_create_dataset(self, suffix='gen ') test_anomaly.i_check_top_anomalies(self, example[3]) test_anomaly.i_check_forest_size(self, example[4]) test_anomaly.i_check_dataset_lines_number(self, example[3])
def test_scenario4(self): """ Scenario: Successfully building feature selection from filtered dataset setting objective: Given I create BigML dataset uploading train "<data>" file in "<output>" And I check that the source has been created And I check that the dataset has been created And I filter out field "<field>" from dataset and log to "<output_dir>" And I check that the new dataset has been created And I create BigML feature selection <kfold>-fold cross-validations for "<objective>" improving "<metric>" And I check that the <kfold>-datasets have been created And I check that the <kfold>-models have been created And I check that all the <kfold>-fold cross-validations have been created Then the best feature selection is "<selection>", with "<metric>" of <metric_value> Examples: | data | field | objective |output | output_dir | kfold | metric | selection | metric_value | | ../data/iris_2fd.csv | sepal length | species |./scenario_a_6/evaluation |./scenario_a_6 | 2 | recall | petal width | 100.00% | """ print self.test_scenario4.__doc__ examples = [ ['data/iris_2fd.csv', 'sepal length', 'species', 'scenario_a_6/evaluation', 'scenario_a_6', '2', 'recall', 'petal width', '100.00%']] for example in examples: print "\nTesting with:\n", example test_pred.i_create_dataset(self, data=example[0], output=example[3]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self) dataset.i_filter_field_from_dataset(self, field=example[1], output_dir=example[4]) test_pred.i_check_create_new_dataset(self) test_pred.i_create_kfold_cross_validation_objective(self, k_folds=example[5], objective=example[2], metric=example[6]) test_pred.i_check_create_kfold_datasets(self, example[5]) test_pred.i_check_create_kfold_models(self, example[5]) test_pred.i_check_create_all_kfold_cross_validations(self, example[5]) test_pred.i_check_feature_selection(self, example[7], example[6], example[8])
def setup_scenario1(self): """ Scenario: Successfully building test anomaly scores from scratch: Given I create BigML resources uploading train "<data>" file to create anomaly scores for "<test>" and log predictions in "<output>" And I check that the source has been created And I check that the dataset has been created And I check that the anomaly detector has been created And I check that the anomaly scores are ready Then the local anomaly scores file is like "<predictions_file>" Examples: | data | test | output |predictions_file | | ../data/tiny_kdd.csv | ../data/test_kdd.csv | ./scenario_an_1/anomaly_scores.csv | ./check_files/anomaly_scores_kdd.csv | """ print self.setup_scenario1.__doc__ examples = [ ['data/tiny_kdd.csv', 'data/test_kdd.csv', 'scenario_an_1/anomaly_scores.csv', 'check_files/anomaly_scores_kdd.csv']] for example in examples: print "\nTesting with:\n", example test_anomaly.i_create_all_anomaly_resources(self, data=example[0], test=example[1], output=example[2]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self) test_anomaly.i_check_create_anomaly(self) test_anomaly.i_check_create_anomaly_scores(self) test_anomaly.i_check_anomaly_scores(self, example[3])
def test_scenario6(self): """ Scenario: Successfully building feature selection from dataset excluding features: Given I create BigML dataset uploading train "<data>" file in "<output>" And I check that the source has been created And I check that the dataset has been created And I create BigML feature selection <kfold>-fold cross-validations excluding "<features>" with separator "<args_separator>" improving "<metric>" And I check that the <kfold>-datasets have been created And I check that the <kfold>-models have been created And I check that all the <kfold>-fold cross-validations have been created Then the best feature selection is "<selection>", with "<metric>" of <metric_value> Examples: | data | output | kfold | features | args_separator | metric | selection | metric_value | | ../data/iris.csv | ./scenario_a_7/evaluation | 2 | petal length!sepal width | ! | accuracy | petal width | 95.33% | """ print self.test_scenario6.__doc__ examples = [ ['data/iris.csv', 'scenario_a_7/evaluation', '2', 'petal length!sepal width', '!', 'accuracy', 'petal width', '95.33%']] for example in examples: print "\nTesting with:\n", example test_pred.i_create_dataset(self, data=example[0], output=example[1]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self) test_pred.i_create_kfold_cross_validation_separator_metric_no_fields(self, k_folds=example[2], features=example[3], args_separator=example[4], metric=example[5]) test_pred.i_check_create_kfold_datasets(self, example[2]) test_pred.i_check_create_kfold_models(self, example[2]) test_pred.i_check_create_all_kfold_cross_validations(self, example[2]) test_pred.i_check_feature_selection(self, example[6], example[5], example[7])
def test_scenario4(self): """ Scenario: Successfully building test anomaly score predictions from training set as a dataset: Given I create BigML resources uploading train "<data>" file to find anomaly scores for the training set remotely saved to dataset with no CSV output and log resources in "<output_dir>" And I check that the source has been created And I check that the dataset has been created And I check that the anomaly detector has been created And I check that the batch anomaly scores prediction has been created Then I check that the batch anomaly scores dataset exists And no local CSV file is created Examples: | data | output_dir | | ../data/iris.csv | ./scenario_ab_4 | """ print self.test_scenario3.__doc__ examples = [ ['data/iris.csv', 'scenario_ab_4']] for example in examples: print "\nTesting with:\n", example test_anomaly.i_create_all_anomaly_resources_without_test_split(self, data=example[0], output_dir=example[1]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self) test_anomaly.i_check_create_anomaly(self) test_batch.i_check_create_batch_anomaly_scores(self) test_anomaly.i_check_create_batch_anomaly_score_dataset(self) test_anomaly.i_check_no_local_CSV(self)
def test_scenario8(self):
    """
    Scenario: Successfully building a new dataset from an existing one and analyzing it
        Given I create a BigML dataset from "<data>" and store logs in "<output_dir>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I create a new BigML dataset using the specs in JSON file "<new_fields>" and a model with "<model_fields>"
        And I check that the new dataset has been created
        And I check that the model has been created
        And I create BigML nodes analysis from <min_nodes> to <max_nodes> by <nodes_step> with <kfold>-cross-validation improving "<metric>"
        And I check that the <kfold>-datasets have been created
        And I check that the <kfold>-models have been created
        And I check that all the <kfold>-fold cross-validations have been created
        Then the best node threshold is "<node_threshold>", with "<metric>" of <metric_value>

        Examples:
        |data |output_dir |new_fields | field | model_fields| min_nodes | max_nodes | nodes_step | kfold | metric | node_threshold | metric_value |
        |../data/iris.csv | ./scenario_a_10 |../data/new_fields.json| outlier? |petal length,outlier?,species| 3 | 14 | 2 |2 | precision | 9 | 94.71% |
    """
    print self.test_scenario8.__doc__
    # NOTE(review): the Examples table in the docstring is stale relative to
    # the data below (new_fields.json vs new_fields2.json, model_fields,
    # node_threshold 9 vs 5, 94.71% vs 98.21%) -- confirm which is current.
    examples = [
        ['data/iris.csv', 'scenario_a_10', 'data/new_fields2.json',
         u'outlier?', u'outlier?,species', '3', '14', '2', '2', 'precision',
         '5', '98.21%']]
    for example in examples:
        print "\nTesting with:\n", example
        # Build the base dataset from the uploaded CSV.
        dataset_adv.i_create_dataset(self, data=example[0],
                                     output_dir=example[1])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self, suffix=None)
        # Derive a new dataset from the JSON field specs and model it.
        # example[3] (the generated field name) is not asserted here.
        dataset_adv.i_create_dataset_new_fields(self, json_file=example[2],
                                                model_fields=example[4])
        test_pred.i_check_create_new_dataset(self)
        test_pred.i_check_create_model(self)
        # Sweep node_threshold from min_nodes to max_nodes by nodes_step,
        # evaluating each setting with k-fold cross-validation.
        test_pred.i_create_nodes_analysis(
            self, min_nodes=example[5], max_nodes=example[6],
            nodes_step=example[7], k_fold=example[8], metric=example[9])
        test_pred.i_check_create_kfold_datasets(self, example[8])
        test_pred.i_check_create_kfold_models(self, example[8])
        test_pred.i_check_create_all_kfold_cross_validations(self, example[8])
        test_pred.i_check_node_threshold(self, example[10], example[9],
                                         example[11])
def test_scenario1(self): """ Scenario: Successfully building test anomaly score predictions from scratch: Given I create BigML resources uploading train "<data>" file to find anomaly scores for "<test>" remotely with mapping file "<fields_map>" and log predictions in "<output>" And I check that the source has been created And I check that the dataset has been created And I check that the anomaly detector has been created And I check that the source has been created from the test file And I check that the dataset has been created from the test file And I check that the batch anomaly scores prediction has been created And I check that the anomaly scores are ready Then the local anomaly scores file is like "<predictions_file>" Examples: | data | test | fields_map | output |predictions_file | | ../data/grades.csv | ../data/grades_perm.csv | ../data/grades_fields_map_perm.csv | ./scenario_ab_1_r/anomalies.csv | ./check_files/anomaly_scores_grades.csv | """ print self.test_scenario1.__doc__ examples = [ ['data/grades.csv', 'data/grades_perm.csv', 'data/grades_fields_map_perm.csv', 'scenario_ab_1_r/anomalies.csv', 'check_files/anomaly_scores_grades.csv']] for example in examples: print "\nTesting with:\n", example test_anomaly.i_create_all_anomaly_resources_with_mapping(self, data=example[0], test=example[1], fields_map=example[2], output=example[3]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self) test_anomaly.i_check_create_anomaly(self) test_pred.i_check_create_test_source(self) test_pred.i_check_create_test_dataset(self) test_batch.i_check_create_batch_anomaly_scores(self) test_anomaly.i_check_create_anomaly_scores(self) test_anomaly.i_check_anomaly_scores(self, example[4])
def test_scenario1(self): """ Scenario: Successfully building k-fold cross-validation from dataset: Given I create BigML dataset uploading train "<data>" file in "<output>" And I check that the source has been created And I check that the dataset has been created And I create BigML <kfold>-fold cross-validation And I check that the <kfold>-datasets have been created And I check that the <kfold>-models have been created And I check that the <kfold>-fold cross-validation has been created Then the evaluation file is like "<json_evaluation_file>" Examples: | data | output | kfold | json_evaluation_file | | ../data/iris.csv | ./scenario_a_1/evaluation | 2 | ./check_files/evaluation_kfold.json | """ print self.test_scenario1.__doc__ examples = [ ['data/iris.csv', 'scenario_a_1/evaluation', '2', 'check_files/evaluation_kfold.json']] for example in examples: print "\nTesting with:\n", example test_pred.i_create_dataset(self, data=example[0], output=example[1]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self) test_pred.i_create_kfold_cross_validation(self, k_folds=example[2]) test_pred.i_check_create_kfold_datasets(self, example[2]) test_pred.i_check_create_kfold_models(self, example[2]) test_pred.i_check_create_kfold_cross_validation(self, example[2]) evaluation.then_the_evaluation_file_is_like(self, example[3])
def test_scenario2(self): """ Scenario: Successfully building test anomaly score predictions from test split: Given I create BigML resources uploading train "<data>" file to find anomaly scores with test split "<test_split>" remotely and log predictions in "<output>" And I check that the source has been created And I check that the dataset has been created And I check that the anomaly detector has been created And I check that the train dataset has been created And I check that the dataset has been created from the test file And I check that the batch anomaly scores prediction has been created And I check that the anomaly scores are ready Then the local anomaly scores file is like "<predictions_file>" Examples: | data | test_split | output |predictions_file | | ../data/iris.csv | 0.2 | ./scenario_ab_2/anomalies.csv | ./check_files/anomaly_scores_iris.csv | """ print self.test_scenario2.__doc__ examples = [ ['data/iris.csv', '0.2', 'scenario_ab_2/anomalies.csv', 'check_files/anomaly_scores_iris.csv']] for example in examples: print "\nTesting with:\n", example test_anomaly.i_create_all_anomaly_resources_with_test_split(self, data=example[0], test_split=example[1], output=example[2]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self) test_anomaly.i_check_create_anomaly(self) test_pred.i_check_create_dataset(self, suffix='train ') test_pred.i_check_create_dataset(self, suffix='test ') test_batch.i_check_create_batch_anomaly_scores(self) test_anomaly.i_check_create_anomaly_scores(self) test_anomaly.i_check_anomaly_scores(self, example[3])
def test_scenario6(self): """ Scenario: Sucessfully deleting a source in a time range: Given I create a BigML source from file "<data>" storing results in "<output_dir>" And I check that the source has been created And I store the source id as lower And I create a BigML source from file "<data>" storing results in "<output_dir2>" And I check that the source exists And I store the source id as reference And I create a BigML source from file "<data>" storing results in "<output_dir3>" And I check that the source has been created And I store the source id as upper And I delete the source using --older-than and --newer-than storing results in "<output_dir>" Then I check that the reference source doesn't exist Examples: | data | output_dir | output_dir2 | output_dir3 | ../data/iris.csv | ./scenario_del_6 | ./scenario_del_6_2 | ./scenario_del_6_3 """ print self.test_scenario6.__doc__ examples = [["data/iris.csv", "scenario_del_6", "scenario_del_6_2", "scenario_del_6_3"]] for example in examples: print "\nTesting with:\n", example test_delete.i_create_source_from_file(self, data=example[0], output_dir=example[1]) test_pred.i_check_create_source(self) test_delete.i_store_source_id_as_bound(self, which="lower") test_delete.i_create_source_from_file(self, data=example[0], output_dir=example[2]) test_delete.i_check_source_exists(self) test_delete.i_store_source_id_as_bound(self, which="reference") test_delete.i_create_source_from_file(self, data=example[0], output_dir=example[3]) test_pred.i_check_create_source(self) test_delete.i_store_source_id_as_bound(self, which="upper") test_delete.i_delete_source_older_newer(self, output_dir=example[3]) test_delete.i_check_source_does_not_exist(self, source_id=world.source_reference)
def test_scenario2(self): """ Scenario: Successfully building remote test centroid predictions from scratch to dataset: Given I create BigML resources uploading train "<data>" file to find centroids for "<test>" remotely to dataset with no CSV and log resources in "<output_dir>" And I check that the source has been created And I check that the dataset has been created And I check that the cluster has been created And I check that the source has been created from the test file And I check that the dataset has been created from the test file And I check that the batch centroid prediction has been created Then I check that the batch centroids dataset exists And no local CSV file is created Examples: | data | test | output_dir | | ../data/grades.csv | ../data/test_grades.csv | ./scenario_cb_2 | """ print self.test_scenario2.__doc__ examples = [[ 'data/grades.csv', 'data/test_grades.csv', 'scenario_cb_2' ]] for example in examples: print "\nTesting with:\n", example test_cluster.i_create_all_cluster_resources_to_dataset( self, data=example[0], test=example[1], output_dir=example[2]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self, suffix=None) test_pred.i_check_create_cluster(self) test_pred.i_check_create_test_source(self) test_pred.i_check_create_test_dataset(self) batch_pred.i_check_create_batch_centroid(self) batch_pred.i_check_create_batch_centroids_dataset(self) test_anomaly.i_check_no_local_CSV(self)
def test_scenario2(self): """ Scenario: Successfully building test predictions from scratch: Given I create BigML resources uploading train "<data>" file to test "<test>" remotely with a missing-splits model and log predictions in "<output>" And I check that the source has been created And I check that the dataset has been created And I check that the model has been created And I check that the source has been created from the test file And I check that the dataset has been created from the test file And I check that the batch prediction has been created And I check that the predictions are ready Then the local prediction file is like "<predictions_file>" Examples: | data | test | output |predictions_file | | ../data/iris_missing.csv | ../data/test_iris_missing.csv | ./scenario_mspl_2/predictions.csv | ./check_files/predictions_iris_missing.csv """ print self.test_scenario2.__doc__ examples = [[ 'data/iris_missing.csv', 'data/test_iris_missing.csv', 'scenario_mspl_2/predictions.csv', 'check_files/predictions_iris_missing.csv' ]] for example in examples: print "\nTesting with:\n", example test_pred.i_create_all_resources_remote_missing_splits( self, data=example[0], test=example[1], output=example[2]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self, suffix=None) test_pred.i_check_create_model(self) test_pred.i_check_create_test_source(self) test_pred.i_check_create_test_dataset(self) test_pred.i_check_create_batch_prediction(self) test_pred.i_check_create_predictions(self) test_pred.i_check_predictions(self, example[3])
def setup_scenario02(self): """ Scenario: Successfully building test predictions from start: Given I create BigML logistic regression resources uploading train "<data>" file to test "<test>" and log predictions in "<output>" And I check that the source has been created And I check that the dataset has been created And I check that the model has been created And I check that the predictions are ready Then the local prediction file is like "<predictions_file>" Examples: | data | test | output |predictions_file | | ../data/iris.csv | ../data/test_iris.csv | ./scenario1_lr/predictions.csv | ./check_files/predictions_iris_lr.csv | """ print self.setup_scenario02.__doc__ examples = [ ['data/iris.csv', 'data/test_iris.csv', 'scenario1_lr/predictions.csv', 'check_files/predictions_iris_lr.csv']] for example in examples: print "\nTesting with:\n", example lr_pred.i_create_all_lr_resources(self, example[0], example[1], example[2]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self, suffix=None) lr_pred.i_check_create_lr_model(self) test_pred.i_check_create_predictions(self) test_pred.i_check_predictions(self, example[3])
def test_scenario5(self): """ Scenario: Successfully building a filtered dataset from a dataset Given I create a BigML dataset from "<data>" and store logs in "<output_dir>" And I check that the source has been created And I check that the dataset has been created And I create a BigML filtered dataset with filter "<filter_exp>" from previous dataset and store logs in "<output_dir>" And I check that the dataset has been created And the number of records in the dataset is <filtered_records> Examples: |data |output_dir | filtered_records | filter_exp |../data/iris.csv | ./scenario_d_5 | 50 | (= (f "000004") "Iris-setosa") """ print self.test_scenario5.__doc__ examples = [[ 'data/iris.csv', 'scenario_d_5', '50', '(= (f "000004") "Iris-setosa")' ]] for example in examples: print "\nTesting with:\n", example dataset_adv.i_create_dataset(self, data=example[0], output_dir=example[1]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self, suffix=None) dataset_adv.i_create_filtered_dataset_from_dataset( self, filter_exp=example[3], output_dir=example[1]) test_pred.i_check_create_dataset(self, suffix='gen ') test_anomaly.i_check_dataset_lines_number(self, example[2])
def test_scenario6(self):
    """
    Scenario: Successfully extending the multi-label source file:
        Given I create BigML a multi-label source with "<label_separator>" label separator and <number_of_labels> labels from train "<data>" file with "<training_separator>" field separator and "<ml_fields>" as multi-label fields and objective "<objective>" and output in "<output_dir>"
        And I check that the source has been created
        Then I check the extended file "<local_file>" has been created
        And the headers of the local extended file are "<headers>"
        And the first row of the local extended file is "<first_row>"

        Examples:
        |label_separator |number_of_labels | data |training_separator | ml_fields | objective | output_dir |local_file | headers | first_row |
        |:|7| ../data/multilabel_multi.csv |, | type,class | class | ./scenario_mlm_6 | ./scenario_mlm_6/extended_multilabel_multi.csv |color,year,price,first_name,last_name,sex,class,type,class - Adult,class - Child,class - Pensioner,class - Retired,class - Student,class - Teenager,class - Worker,type - A,type - C,type - P,type - R,type - S,type - T,type - W | Blue,1992,"1208,6988040134",John,Higgins,Male,Worker:Adult,W:A:C:S:T:R:P,1,0,0,0,0,0,1,1,1,1,1,1,1,1
        |:|7| ../data/multilabel_multi2.csv |, | Colors,Movies,Hobbies | Hobbies | ./scenario_mlm_7 | ./scenario_mlm_7/extended_multilabel_multi2.csv |Registration Date,Age Range,Gender,Height,Weight,Points,Colors,Movies,Hobbies,Colors - Black,Colors - Blue,Colors - Green,Colors - Grey,Colors - Orange,Colors - Pink,Colors - Purple,Colors - Red,Colors - White,Colors - Yellow,Movies - Action,Movies - Adventure,Movies - Comedy,Movies - Crime,Movies - Erotica,Movies - Fantasy,Movies - Horror,Movies - Mystery,Movies - Philosophical,Movies - Political,Movies - Romance,Movies - Satire,Movies - Thriller,Hobbies - Barbacue,Hobbies - Books,Hobbies - Chat,Hobbies - Cooking,Hobbies - Dance,Hobbies - Disco,Hobbies - Dolls,Hobbies - Family,Hobbies - Films,Hobbies - Fishing,Hobbies - Friends,Hobbies - Jogging,Hobbies - Music,Hobbies - Soccer,Hobbies - Toys,Hobbies - Travel,Hobbies - Videogames,Hobbies - Walking |2011-02-06,19-30,Female,140,47,11,White:Red,Comedy:Romance,Friends:Music,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
    """
    print self.test_scenario6.__doc__
    # Each example: label_separator, number_of_labels, data,
    # training_separator, ml_fields, objective, output_dir, local_file,
    # expected headers, expected first row. The multi-label fields are
    # expanded into one 0/1 column per label in the extended file.
    examples = [
        [':', '7', 'data/multilabel_multi.csv', ',', 'type,class', 'class',
         'scenario_mlm_6',
         'scenario_mlm_6/extended_multilabel_multi.csv',
         'color,year,price,first_name,last_name,sex,class,type,class - Adult,class - Child,class - Pensioner,class - Retired,class - Student,class - Teenager,class - Worker,type - A,type - C,type - P,type - R,type - S,type - T,type - W',
         'Blue,1992,"1208,6988040134",John,Higgins,Male,Worker:Adult,W:A:C:S:T:R:P,1,0,0,0,0,0,1,1,1,1,1,1,1,1'],
        [':', '7', 'data/multilabel_multi2.csv', ',',
         'Colors,Movies,Hobbies', 'Hobbies', 'scenario_mlm_7',
         'scenario_mlm_7/extended_multilabel_multi2.csv',
         'Registration Date,Age Range,Gender,Height,Weight,Points,Colors,Movies,Hobbies,Colors - Black,Colors - Blue,Colors - Green,Colors - Grey,Colors - Orange,Colors - Pink,Colors - Purple,Colors - Red,Colors - White,Colors - Yellow,Movies - Action,Movies - Adventure,Movies - Comedy,Movies - Crime,Movies - Erotica,Movies - Fantasy,Movies - Horror,Movies - Mystery,Movies - Philosophical,Movies - Political,Movies - Romance,Movies - Satire,Movies - Thriller,Hobbies - Barbacue,Hobbies - Books,Hobbies - Chat,Hobbies - Cooking,Hobbies - Dance,Hobbies - Disco,Hobbies - Dolls,Hobbies - Family,Hobbies - Films,Hobbies - Fishing,Hobbies - Friends,Hobbies - Jogging,Hobbies - Music,Hobbies - Soccer,Hobbies - Toys,Hobbies - Travel,Hobbies - Videogames,Hobbies - Walking',
         '2011-02-06,19-30,Female,140,47,11,White:Red,Comedy:Romance,Friends:Music,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0']
    ]
    for example in examples:
        print "\nTesting with:\n", example
        # Create the multi-label source; this also writes the locally
        # extended CSV with the per-label 0/1 columns.
        ml_pred.i_create_ml_source(self, label_separator=example[0],
                                   number_of_labels=example[1],
                                   data=example[2],
                                   training_separator=example[3],
                                   multi_label_fields=example[4],
                                   objective=example[5],
                                   output_dir=example[6])
        test_pred.i_check_create_source(self)
        # Verify the extended file exists and its header/first data row
        # match the expected expansion.
        ml_pred.i_check_local_file(self, path=example[7])
        ml_pred.i_check_headers_file(self, headers=example[8])
        ml_pred.i_check_first_row_file(self, first_row=example[9])
def setup_scenario02(self): """ Scenario: Successfully building test predictions from start: Given I create BigML logistic regression resources uploading train "<data>" file to test "<test>" and log predictions in "<output>" And I check that the source has been created And I check that the dataset has been created And I check that the model has been created And I check that the predictions are ready Then the local prediction file is like "<predictions_file>" Examples: | data | test | output |predictions_file | | ../data/iris.csv | ../data/test_iris.csv | ./scenario1_lr/predictions.csv | ./check_files/predictions_iris_lr.csv | """ print self.setup_scenario02.__doc__ examples = [[ 'data/iris.csv', 'data/test_iris.csv', 'scenario1_lr/predictions.csv', 'check_files/predictions_iris_lr.csv' ]] for example in examples: print "\nTesting with:\n", example lr_pred.i_create_all_lr_resources(self, example[0], example[1], example[2]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self, suffix=None) lr_pred.i_check_create_lr_model(self) test_pred.i_check_create_predictions(self) test_pred.i_check_predictions(self, example[3])
def setup_scenario1(self): """ Scenario: Successfully building multi-label test predictions from start: Given I create BigML multi-label resources tagged as "<tag>" with "<label_separator>" label separator and <number_of_labels> labels uploading train "<data>" file with "<training_separator>" field separator and "<ml_fields>" as multi-label fields using model_fields "<model_fields>" and objective "<objective>" to test "<test>" and log predictions in "<output>" And I check that the source has been created And I check that the dataset has been created And I check that the models have been created And I check that the predictions are ready Then the local prediction file is like "<predictions_file>" Examples: |tag |label_separator |number_of_labels | data |training_separator | ml_fields | model_fields | objective | test | output |predictions_file | |my_multilabelm_1|:|7| ../data/multilabel_multi.csv |, | type,class | -type,-type - W,-type - A,-type - C,-type - S,-type - R,-type - T,-type - P | class |../data/test_multilabel.csv | ./scenario_mlm_1/predictions.csv | ./check_files/predictions_ml.csv | """ print self.setup_scenario1.__doc__ examples = [ ['my_multilabelm_1', ':', '7', 'data/multilabel_multi.csv', ',', 'type,class', '-type,-type - W,-type - A,-type - C,-type - S,-type - R,-type - T,-type - P', 'class', 'data/test_multilabel.csv', 'scenario_mlm_1/predictions.csv', 'check_files/predictions_ml.csv']] for example in examples: print "\nTesting with:\n", example ml_pred.i_create_all_mlm_resources(self, tag=example[0], label_separator=example[1], number_of_labels=example[2], data=example[3], training_separator=example[4], ml_fields=example[5], model_fields=example[6], objective=example[7], test=example[8], output=example[9]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self) test_pred.i_check_create_models(self) test_pred.i_check_create_predictions(self) test_pred.i_check_predictions(self, example[10])
def test_scenario2(self): """ Scenario: Successfully building predictions for data streamed to stdin: Given I create BigML resources uploading train "<data>" file to test "<test>" read from stdin with name "<name>" and log predictions in "<output>" And I check that the source has been created And I check that the dataset has been created And I check that the model has been created And I check that the predictions are ready Then the local prediction file is like "<predictions_file>" Examples: | data | test | output |predictions_file | name | | ../data/iris.csv | ../data/test_iris.csv | ./scenario_st_2/predictions.csv | ./check_files/predictions_iris.csv | Source name: áéí | """ print self.test_scenario2.__doc__ examples = [ ['data/iris.csv', 'data/test_iris.csv', 'scenario_st_2/predictions.csv', 'check_files/predictions_iris.csv', 'Source name: áéí']] for example in examples: print "\nTesting with:\n", example stdin.i_create_all_resources_to_test_from_stdin(self, data=example[0], test=example[1], name=example[4], output=example[2]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self, suffix=None) test_pred.i_check_create_model(self) test_pred.i_check_create_predictions(self) test_pred.i_check_predictions(self, example[3])
def test_scenario1(self): """ Scenario 1: Successfully building test predictions from scratch: Given I create BigML resources uploading train "<data>" file to test "<test>" remotely with mapping file "<fields_map>" and log predictions in "<output>" And I check that the source has been created And I check that the dataset has been created And I check that the model has been created And I check that the source has been created from the test file And I check that the dataset has been created from the test file And I check that the batch prediction has been created And I check that the predictions are ready Then the local prediction file is like "<predictions_file>" Examples: | data | test | fields_map | output |predictions_file | | ../data/grades.csv | ../data/test_grades.csv | ../data/grades_fields_map.csv | ./scenario_r1_r/predictions.csv | ./check_files/predictions_grades.csv | """ print self.test_scenario1.__doc__ examples = [ ['data/grades.csv', 'data/test_grades.csv', 'data/grades_fields_map.csv', 'scenario_r1_r/predictions.csv', 'check_files/predictions_grades.csv']] for example in examples: print "\nTesting with:\n", example test_pred.i_create_all_resources_batch_map(self, data=example[0], test=example[1], fields_map=example[2], output=example[3]) test_pred.i_check_create_source(self) test_pred.i_check_create_dataset(self, suffix=None) test_pred.i_check_create_model(self) test_batch_pred.i_check_create_test_source(self) test_batch_pred.i_check_create_test_dataset(self) test_batch_pred.i_check_create_batch_prediction(self) test_pred.i_check_create_predictions(self) test_pred.i_check_predictions(self, example[4])