def test_should_normalize():
    """Check ``dataset.normalize`` on features extracted from the CSV fixture.

    The six nutrition columns are passed for normalization. Every expected
    cell is written as ``value / divisor`` (or its literal result) so the
    arithmetic behind it stays visible; the ``id`` column is expected to
    come back unchanged and the other columns renamed with a
    ``normalized_`` prefix.
    """
    nutrition_columns = [
        'calories', 'total_fat', 'sugar', 'sodium', 'protein', 'saturated_fat'
    ]
    expected_columns = [
        'id',
        'normalized_calories',
        'normalized_total_fat',
        'normalized_sugar',
        'normalized_sodium',
        'normalized_protein',
        'normalized_saturated_fat',
    ]
    expected_rows = [
        [137739.0, 51.5 / 269.8, 0.0 / 22.0, 13.0 / 32.0, 0.0, 2.0 / 39.0,
         0.0],
        [31490.0, 173.4 / 269.8, 18.0 / 22.0, 0.0, 17.0 / 48.0, 22.0 / 39.0,
         35.0 / 35.0],
        [112140.0, 269.8 / 269.8, 22.0 / 22.0, 1.0, 48.0 / 48.0, 39.0 / 39.0,
         27.0 / 35.0],
    ]
    expected = pd.DataFrame(expected_rows, columns=expected_columns)

    raw = pd.read_csv('tests/integration/resources/raw_recipes_dump.csv')
    features = dataset.extract_features(raw)
    actual = dataset.normalize(nutrition_columns, features)

    pdt.assert_frame_equal(expected, actual)
def test_should_explode_nutrition_column_ignore_other_columns():
    """Check that ``extract_features`` drops columns it does not know about.

    The input carries an extra ``foo`` column next to ``id`` and the
    stringified ``nutrition`` list. The expected frame keeps ``id`` plus six
    named nutrition columns; ``foo`` and the seventh nutrition value are
    absent from it.
    """
    input_columns = ['id', 'foo', 'nutrition']
    input_rows = [[20.0, 'bar', '[1, 2.0, 12.0, 42.0, 3.7, 1337.0, 5.0]']]
    source = pd.DataFrame(input_rows, columns=input_columns)

    actual = dataset.extract_features(source)

    expected_columns = [
        'id', 'calories', 'total_fat', 'sugar', 'sodium', 'protein',
        'saturated_fat'
    ]
    expected_row = [20.0, 1.0, 2.0, 12.0, 42.0, 3.7, 1337.0]
    expected = pd.DataFrame([expected_row], columns=expected_columns)

    pdt.assert_frame_equal(expected, actual)
def test_should_extract_features_from_file():
    """Check ``extract_features`` against the three-recipe CSV fixture.

    The fixture is loaded with ``pd.read_csv`` and the result must equal a
    frame holding ``id`` followed by six nutrition columns, one row per
    recipe.
    """
    expected_columns = [
        'id', 'calories', 'total_fat', 'sugar', 'sodium', 'protein',
        'saturated_fat'
    ]
    expected_rows = [
        [137739.0, 51.5, 0.0, 13.0, 0.0, 2.0, 0.0],
        [31490.0, 173.4, 18.0, 0.0, 17.0, 22.0, 35.0],
        [112140.0, 269.8, 22.0, 32.0, 48.0, 39.0, 27.0],
    ]
    expected = pd.DataFrame(expected_rows, columns=expected_columns)

    raw = pd.read_csv('tests/integration/resources/raw_recipes_dump.csv')
    actual = dataset.extract_features(raw)

    pdt.assert_frame_equal(expected, actual)
def test_should_explode_nutrition_column():
    """Check that the stringified ``nutrition`` list becomes named columns.

    A single-row frame with ``id`` and a seven-element ``nutrition`` string
    is passed to ``dataset.extract_features``. The expected frame keeps
    ``id`` and the first six nutrition values under their column names.
    """
    data = [[20.0, '[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]']]
    dataframe = pd.DataFrame(data, columns=['id', 'nutrition'])

    actual = dataset.extract_features(dataframe)

    expected = pd.DataFrame(
        [[20.0, 269.8, 22.0, 32.0, 48.0, 39.0, 27.0]],
        columns=[
            'id', 'calories', 'total_fat', 'sugar', 'sodium', 'protein',
            'saturated_fat'
        ])

    # No debug prints: assert_frame_equal already reports the differing
    # values when the frames do not match.
    pdt.assert_frame_equal(expected, actual)
#!/usr/bin/env python3
"""Job script: load a raw recipes CSV, extract features, save them as CSV.

Usage: <script> DATASET_PATH OUTPUT_PATH

DATASET_PATH is the CSV file to read; the result is written to
``recipe_features.csv`` under OUTPUT_PATH.
"""
import logging
import os
import sys

import pandas as pd

from recipes_recommendation import dataset

LOG = logging.getLogger(__name__)

if __name__ == "__main__":
    # Without a configured handler the INFO records below are silently
    # dropped (the last-resort handler only emits WARNING and above).
    logging.basicConfig(level=logging.INFO)

    # Fail with a usage message instead of a bare IndexError traceback.
    if len(sys.argv) < 3:
        sys.exit("usage: %s DATASET_PATH OUTPUT_PATH" % sys.argv[0])

    dataset_path = str(sys.argv[1])
    output_path = str(sys.argv[2])

    LOG.info("Starting job: %s", __name__)

    LOG.info("Loading dataset from: %s", dataset_path)
    raw_dataframe = pd.read_csv(dataset_path)
    LOG.info("Done loading dataset")

    LOG.info("Start extracting features")
    features_dataframe = dataset.extract_features(raw_dataframe)
    LOG.info("Done extracting features")

    LOG.info("Start saving features to: %s", output_path)
    # os.path.join yields the same path as plain concatenation when
    # output_path already ends with a separator, and inserts the missing
    # separator otherwise (plain '+' would write '<dir>recipe_features.csv').
    features_dataframe.to_csv(
        os.path.join(output_path, 'recipe_features.csv'), index=False)
    LOG.info("Done saving features")

    LOG.info("Finishing job")