def test_train_product_classifier_from_embeddings():
    """Train the products classifier from an embeddings input through the CLI.

    Invokes ``wtsp --work-dir <dir> train products --model classifier`` with
    the transformed-embeddings fixture as input, then checks that the command
    exits with code 0 and that the expected artifacts exist under
    ``<dir>/products/models/classifier``.
    """
    runner = CliRunner()
    output_path = common.get_full_path(tests_path, common.TEST_WORK_DIR_PATH)
    embeddings_path = common.get_full_path(
        tests_path, common.TRANSFORMED_EMBEDDINGS_PATH)
    # we are going to assume the working directory already has an embeddings
    # model trained
    params = (
        "from_embeddings=True,label_col=categories,doc_col=document,classes=10,test_size=0.3,"
        "lr=0.01,epochs=10,vec_size=300"
    )
    expected_artifacts = (
        "category_encoder.model",
        "prod_classifier-def.yaml",
        "prod_classifier-weights.h5",
        "training_history.png",
        "classification_report.png",
    )
    try:
        result = runner.invoke(cli.wtsp, [
            "--work-dir", output_path,
            "train", "products",
            "--model", "classifier",
            "--params", params,
            embeddings_path,
        ])
        assert result.exit_code == 0
        # validate the existence of the output directory
        result_dir = f"{output_path}/products/models/classifier"
        assert os.path.exists(result_dir)
        # and the content
        for artifact in expected_artifacts:
            assert os.path.exists(f"{result_dir}/{artifact}")
    finally:
        # Clean up even when an assertion fails: the work dir is shared by the
        # other tests, and stale artifacts could skew their existence checks.
        if os.path.exists(output_path):
            common.delete_path(output_path)
def test_describe_tweets_no_filters_should_fail():
    """``describe tweets`` without ``--filters`` exits non-zero and names the missing option."""
    tweets_file = common.get_full_path(tests_path, common.RAW_TWEETS_PATH)
    work_dir = common.get_full_path(tests_path, common.TEST_WORK_DIR_PATH)
    # --filters is deliberately left out of the argument list.
    cli_args = [
        "tweets",
        "--output-dir", work_dir,
        "--min-count", 10,
        tweets_file,
    ]

    outcome = CliRunner().invoke(cli.describe, cli_args)

    assert outcome.exit_code != 0
    assert "Error: Missing option '-f' / '--filters'" in outcome.output
def test_transform_no_params_should_fail():
    """``predict clusters`` without ``--params`` exits non-zero and names the missing option."""
    tweets_file = common.get_full_path(tests_path, common.RAW_TWEETS_PATH)
    work_dir = common.get_full_path(tests_path, common.TEST_WORK_DIR_PATH)
    # --params is deliberately left out of the argument list.
    cli_args = [
        "--work-dir", work_dir,
        "predict", "clusters",
        "--filters", "place_name='Los Angeles'",
        tweets_file,
    ]

    outcome = CliRunner().invoke(cli.wtsp, cli_args)

    assert outcome.exit_code != 0
    assert "Error: Missing option '-p' / '--params'" in outcome.output
def test_train_products_no_params_should_fail():
    """``train products`` without ``--params`` exits non-zero and names the missing option."""
    products_file = common.get_full_path(tests_path, common.RAW_PRODUCTS_PATH)
    work_dir = common.get_full_path(tests_path, common.TEST_WORK_DIR_PATH)
    # --params is deliberately left out of the argument list.
    cli_args = [
        "--work-dir", work_dir,
        "train", "products",
        "--model", "embeddings",
        products_file,
    ]

    outcome = CliRunner().invoke(cli.wtsp, cli_args)

    assert outcome.exit_code != 0
    assert "Error: Missing option '-p' / '--params'" in outcome.output
def test_train_tweets_no_params_should_fail():
    """The ``train_tweets`` command without ``--params`` exits non-zero and names the option."""
    tweets_file = common.get_full_path(tests_path, common.RAW_TWEETS_PATH)
    work_dir = common.get_full_path(tests_path, common.TEST_WORK_DIR_PATH)
    # --params is deliberately left out of the argument list.
    cli_args = [
        "--filters", "place_name='Los Angeles'",
        "--output-dir", work_dir,
        tweets_file,
    ]

    outcome = CliRunner().invoke(cli.train_tweets, cli_args)

    assert outcome.exit_code != 0
    assert "Error: Missing option '-p' / '--params'" in outcome.output
def test_transform_no_filters_should_fail():
    """``predict clusters`` without ``--filters`` exits non-zero and names the missing option."""
    tweets_file = common.get_full_path(tests_path, common.RAW_TWEETS_PATH)
    work_dir = common.get_full_path(tests_path, common.TEST_WORK_DIR_PATH)
    cluster_params = "center='34;-118',eps=0.04,n_neighbors=2,location_column=location_geometry,min_score=0.1"
    # --filters is deliberately left out of the argument list.
    cli_args = [
        "--work-dir", work_dir,
        "predict", "clusters",
        "--params", cluster_params,
        tweets_file,
    ]

    outcome = CliRunner().invoke(cli.wtsp, cli_args)

    assert outcome.exit_code != 0
    assert "Error: Missing option '-f' / '--filters'" in outcome.output
def test_train_tweets_no_filters_should_fail():
    """The ``train_tweets`` command without ``--filters`` exits non-zero and names the option."""
    tweets_file = common.get_full_path(tests_path, common.RAW_TWEETS_PATH)
    work_dir = common.get_full_path(tests_path, common.TEST_WORK_DIR_PATH)
    # --filters is deliberately left out of the argument list.
    cli_args = [
        "--model", "nearest-neighbors",
        "--params", "n_neighbors=10,location_column=location_geometry",
        "--output-dir", work_dir,
        tweets_file,
    ]

    outcome = CliRunner().invoke(cli.train_tweets, cli_args)

    assert outcome.exit_code != 0
    assert "Error: Missing option '-f' / '--filters'" in outcome.output
def test_train_products_no_model_should_fail():
    """``train products`` without ``--model`` exits non-zero and names the missing option."""
    products_file = common.get_full_path(tests_path, common.RAW_PRODUCTS_PATH)
    work_dir = common.get_full_path(tests_path, common.TEST_WORK_DIR_PATH)
    training_params = "label_col=category,doc_col=document,lr=0.0002,epochs=10,vec_size=300,alpha=0.025,min_alpha=0.00025"
    # --model is deliberately left out of the argument list.
    cli_args = [
        "--work-dir", work_dir,
        "train", "products",
        "--params", training_params,
        products_file,
    ]

    outcome = CliRunner().invoke(cli.wtsp, cli_args)

    assert outcome.exit_code != 0
    assert "Error: Missing option '-m' / '--model'" in outcome.output
def test_describe_tweets():
    """Run ``describe tweets`` filtered by ``country_code=US`` and check its output.

    The command must exit with code 0 and leave ``counts.csv`` and
    ``bar_chart.png`` under ``<output-dir>/tweets/country_code=US``.
    """
    runner = CliRunner()
    input_data = common.get_full_path(tests_path, common.RAW_TWEETS_PATH)
    output_path = common.get_full_path(tests_path, common.TEST_WORK_DIR_PATH)
    tweets_describe_result = f"{output_path}/tweets/country_code=US"
    try:
        result = runner.invoke(cli.describe, [
            "tweets",
            "--filters", "country_code=US",
            "--output-dir", output_path,
            "--min-count", 10,
            input_data,
        ])
        assert result.exit_code == 0
        # validate the existence of the output directory
        assert os.path.exists(tweets_describe_result)
        # and the content
        assert os.path.exists(f"{tweets_describe_result}/counts.csv")
        assert os.path.exists(f"{tweets_describe_result}/bar_chart.png")
    finally:
        # Remove the result directory even when an assertion fails, so a
        # partial run cannot satisfy the existence checks of a later run.
        if os.path.exists(tweets_describe_result):
            common.delete_path(tweets_describe_result)
def test_train_product_embeddings():
    """Train the products embeddings model through the CLI and check its output.

    Invokes ``wtsp --work-dir <dir> train products --model embeddings`` on the
    raw products fixture; the command must exit with code 0 and write
    ``d2v_model.model`` under ``<dir>/products/models/embeddings``.
    """
    runner = CliRunner()
    input_data = common.get_full_path(tests_path, common.RAW_PRODUCTS_PATH)
    output_path = common.get_full_path(tests_path, common.TEST_WORK_DIR_PATH)
    params = "label_col=categories,doc_col=document,lr=0.0002,epochs=10,vec_size=300,alpha=0.025,min_alpha=0.00025,min_count=1"
    try:
        result = runner.invoke(cli.wtsp, [
            "--work-dir", output_path,
            "train", "products",
            "--model", "embeddings",
            "--params", params,
            input_data,
        ])
        assert result.exit_code == 0
        # validate the existence of the output directory
        result_dir = f"{output_path}/products/models/embeddings"
        assert os.path.exists(result_dir)
        # and the content
        assert os.path.exists(f"{result_dir}/d2v_model.model")
    finally:
        # Clean up even when an assertion fails: the work dir is shared by the
        # other tests, and stale artifacts could skew their existence checks.
        if os.path.exists(output_path):
            common.delete_path(output_path)
def test_describe_products_with_explode():
    """Run ``describe products`` grouped by ``categories`` with ``--explode``.

    The command must exit with code 0 and leave ``counts.csv`` and
    ``bar_chart.png`` under ``<output-dir>/documents``.
    """
    runner = CliRunner()
    input_data = common.get_full_path(tests_path, common.RAW_PRODUCTS_PATH)
    output_path = common.get_full_path(tests_path, common.TEST_WORK_DIR_PATH)
    products_describe_result = f"{output_path}/documents"
    try:
        result = runner.invoke(cli.describe, [
            "products",
            "--output-dir", output_path,
            "--groupby", "categories",
            "--min-count", 10,
            "--explode",
            input_data,
        ])
        assert result.exit_code == 0
        # validate the existence of the output directory
        assert os.path.exists(products_describe_result)
        # and the content
        assert os.path.exists(f"{products_describe_result}/counts.csv")
        assert os.path.exists(f"{products_describe_result}/bar_chart.png")
    finally:
        # Remove the result directory even when an assertion fails, so a
        # partial run cannot satisfy the existence checks of a later run.
        if os.path.exists(products_describe_result):
            common.delete_path(products_describe_result)
def test_describe_with_invalid_filter_value():
    """``Describer.describe`` raises ``DescribeException`` when built with the filter ``lalala=US``."""
    bogus_filter = "lalala=US"
    tweets_file = common.get_full_path(tests_path, common.RAW_TWEETS_PATH)
    subject = Describer("", "place_name", "tweet", "tweets", bogus_filter)

    with pytest.raises(DescribeException) as exc_info:
        subject.describe(tweets_file)

    # Checked after the ``with`` block: a statement placed after the raising
    # call inside the block would never run.
    message = str(exc_info.value)
    assert "There is a problem processing the data, see the error message" in message
def test_describe_products_no_input_data_should_fail():
    """``describe products`` without the input argument exits non-zero and reports it."""
    work_dir = common.get_full_path(tests_path, common.TEST_WORK_DIR_PATH)
    # The positional input data argument is deliberately left out.
    cli_args = [
        "products",
        "--output-dir", work_dir,
        "--min-count", 10,
    ]

    outcome = CliRunner().invoke(cli.describe, cli_args)

    assert outcome.exit_code != 0
    assert "Error: Missing argument 'INPUT_DATA'" in outcome.output
def test_train_tweets_n_neighbors():
    """Run the ``train_tweets`` command with the nearest-neighbors model.

    With the ``place_name='Los Angeles'`` filter the command must exit with
    code 0 and write ``nearest_neighbors.png`` and ``scatter_plot.png`` under
    ``<output-dir>/tweets/place_name=Los Angeles``.
    """
    runner = CliRunner()
    input_data = common.get_full_path(tests_path, common.RAW_TWEETS_PATH)
    output_path = common.get_full_path(tests_path, common.TEST_WORK_DIR_PATH)
    try:
        result = runner.invoke(cli.train_tweets, [
            "--model", "nearest-neighbors",
            "--filters", "place_name='Los Angeles'",
            "--params", "n_neighbors=10,location_column=location_geometry",
            "--output-dir", output_path,
            input_data,
        ])
        assert result.exit_code == 0
        # validate the existence of the output directory
        result_dir = f"{output_path}/tweets/place_name=Los Angeles"
        assert os.path.exists(result_dir)
        # and the content
        assert os.path.exists(f"{result_dir}/nearest_neighbors.png")
        assert os.path.exists(f"{result_dir}/scatter_plot.png")
    finally:
        # Clean up even when an assertion fails: the work dir is shared by the
        # other tests, and stale artifacts could skew their existence checks.
        if os.path.exists(output_path):
            common.delete_path(output_path)
def test_train_products_no_input_data_should_fail():
    """``train products`` without the input argument exits non-zero and reports it."""
    work_dir = common.get_full_path(tests_path, common.TEST_WORK_DIR_PATH)
    training_params = "label_col=category,doc_col=document,lr=0.0002,epochs=10,vec_size=300,alpha=0.025,min_alpha=0.00025"
    # The positional input data argument is deliberately left out.
    cli_args = [
        "--work-dir", work_dir,
        "train", "products",
        "--model", "embeddings",
        "--params", training_params,
    ]

    outcome = CliRunner().invoke(cli.wtsp, cli_args)

    assert outcome.exit_code != 0
    assert "Error: Missing argument 'INPUT_DATA'" in outcome.output
def test_transform_no_input_data_should_fail():
    """``predict clusters`` without the input argument exits non-zero and reports it."""
    work_dir = common.get_full_path(tests_path, common.TEST_WORK_DIR_PATH)
    cluster_params = "center='34;-118',eps=0.04,n_neighbors=2,location_column=location_geometry,min_score=0.1"
    # The positional input data argument is deliberately left out.
    cli_args = [
        "--work-dir", work_dir,
        "predict", "clusters",
        "--params", cluster_params,
        "--filters", "place_name='Los Angeles'",
    ]

    outcome = CliRunner().invoke(cli.wtsp, cli_args)

    assert outcome.exit_code != 0
    assert "Error: Missing argument 'INPUT_DATA'" in outcome.output
def test_train_tweets_no_input_data_should_fail():
    """The ``train_tweets`` command without the input argument exits non-zero and reports it."""
    work_dir = common.get_full_path(tests_path, common.TEST_WORK_DIR_PATH)
    # The positional input data argument is deliberately left out.
    cli_args = [
        "--model", "nearest-neighbors",
        "--filters", "place_name='Los Angeles'",
        "--params", "n_neighbors=10,location_column=location_geometry",
        "--output-dir", work_dir,
    ]

    outcome = CliRunner().invoke(cli.train_tweets, cli_args)

    assert outcome.exit_code != 0
    assert "Error: Missing argument 'INPUT_DATA'" in outcome.output
def test_transform_embeddings():
    """Run ``predict embeddings`` through the CLI and check the files it writes.

    The products model assets are first copied into
    ``<dir>/products/models/``. The command must then exit with code 0 and
    produce ``document_embeddings.npz`` and ``category_encoder.save`` under
    ``<dir>/embeddings``.
    """
    runner = CliRunner()
    input_path = common.get_full_path(tests_path, common.RAW_PRODUCTS_PATH)
    output_path = common.get_full_path(tests_path, common.TEST_WORK_DIR_PATH)
    models_path = f"{output_path}/products/models/"
    model_assets_path = common.get_full_path(tests_path, common.ASSETS_PATH)
    try:
        # we are going to assume the working directory already has an
        # embeddings model trained, so the stored assets are copied in first
        copy_folder_recursively(f"{model_assets_path}/products", models_path)
        result = runner.invoke(cli.wtsp, [
            "--work-dir", output_path,
            "predict", "embeddings",
            input_path,
        ])
        assert result.exit_code == 0
        # validate the existence of the output files
        result_embeddings = f"{output_path}/embeddings/document_embeddings.npz"
        assert os.path.exists(result_embeddings)
        result_cat_encoder = f"{output_path}/embeddings/category_encoder.save"
        assert os.path.exists(result_cat_encoder)
    finally:
        # Clean up even when the copy or an assertion fails: the copied assets
        # and any outputs live in the work dir shared by the other tests.
        if os.path.exists(output_path):
            common.delete_path(output_path)
def test_transform_where_to_sell_products():
    """Run ``predict clusters`` through the CLI and check the files it writes.

    The products model assets are first copied into
    ``<dir>/products/models/``. With the ``place_name='Los Angeles'`` filter
    the command must exit with code 0 and produce ``classified_clusters.csv``
    and ``classified_clusters.html`` under
    ``<dir>/where_to_sell_in/place_name=Los Angeles``.
    """
    runner = CliRunner()
    input_data = common.get_full_path(tests_path, common.RAW_TWEETS_PATH)
    output_path = common.get_full_path(tests_path, common.TEST_WORK_DIR_PATH)
    models_path = f"{output_path}/products/models/"
    model_assets_path = common.get_full_path(tests_path, common.ASSETS_PATH)
    params = "center='34;-118',eps=0.04,n_neighbors=2,location_column=location_geometry,min_score=0.1"
    try:
        # we are going to assume the working directory already has an
        # embeddings model trained, so the stored assets are copied in first
        copy_folder_recursively(f"{model_assets_path}/products", models_path)
        result = runner.invoke(cli.wtsp, [
            "--work-dir", output_path,
            "predict", "clusters",
            "--filters", "place_name='Los Angeles'",
            "--params", params,
            input_data,
        ])
        assert result.exit_code == 0
        # validate the existence of the output directory
        result_dir = f"{output_path}/where_to_sell_in/place_name=Los Angeles"
        assert os.path.exists(result_dir)
        # and the content
        assert os.path.exists(f"{result_dir}/classified_clusters.csv")
        assert os.path.exists(f"{result_dir}/classified_clusters.html")
    finally:
        # Clean up even when the copy or an assertion fails: the copied assets
        # and any outputs live in the work dir shared by the other tests.
        if os.path.exists(output_path):
            common.delete_path(output_path)