示例#1
0
def test_should_normalize():
    """Check ``dataset.normalize`` against the raw recipes fixture.

    The fixture is loaded, run through ``dataset.extract_features``, and
    the six nutrition column names are handed to ``dataset.normalize``.
    The expected frame keeps ``id`` unchanged and stores, under
    ``normalized_``-prefixed names, each nutrition value divided by the
    largest value appearing in that column of the expected data.
    """
    raw_recipes = pd.read_csv(
        'tests/integration/resources/raw_recipes_dump.csv')
    features = dataset.extract_features(raw_recipes)
    nutrition_columns = [
        'calories',
        'total_fat',
        'sugar',
        'sodium',
        'protein',
        'saturated_fat',
    ]

    actual = dataset.normalize(nutrition_columns, features)

    # One list per recipe: id first, then the six normalized values.
    first_recipe = [
        137739.0, 51.5 / 269.8, 0.0 / 22.0, 13.0 / 32.0, 0.0, 2.0 / 39.0, 0.0,
    ]
    second_recipe = [
        31490.0, 173.4 / 269.8, 18.0 / 22.0, 0.0,
        17.0 / 48.0, 22.0 / 39.0, 35.0 / 35.0,
    ]
    third_recipe = [
        112140.0, 269.8 / 269.8, 22.0 / 22.0, 1.0,
        48.0 / 48.0, 39.0 / 39.0, 27.0 / 35.0,
    ]
    expected_columns = [
        'id',
        'normalized_calories',
        'normalized_total_fat',
        'normalized_sugar',
        'normalized_sodium',
        'normalized_protein',
        'normalized_saturated_fat',
    ]
    expected = pd.DataFrame(
        [first_recipe, second_recipe, third_recipe],
        columns=expected_columns,
    )

    pdt.assert_frame_equal(expected, actual)
def test_should_explode_nutrition_column_ignore_other_columns():
    """Check that unrelated columns are left out of the extracted features.

    The input row carries an extra ``foo`` column next to ``id`` and the
    stringified ``nutrition`` list. The expected frame contains only
    ``id`` and six named nutrition columns, all as floats; the seventh
    nutrition entry does not appear in it.
    """
    input_columns = ['id', 'foo', 'nutrition']
    input_rows = [[20.0, 'bar', '[1, 2.0, 12.0, 42.0, 3.7, 1337.0, 5.0]']]
    recipes = pd.DataFrame(input_rows, columns=input_columns)

    actual = dataset.extract_features(recipes)

    feature_columns = [
        'id', 'calories', 'total_fat', 'sugar', 'sodium', 'protein',
        'saturated_fat',
    ]
    expected = pd.DataFrame(
        [[20.0, 1.0, 2.0, 12.0, 42.0, 3.7, 1337.0]],
        columns=feature_columns,
    )

    pdt.assert_frame_equal(expected, actual)
示例#3
0
def test_should_extract_features_from_file():
    """Check ``dataset.extract_features`` on the raw recipes CSV fixture.

    The fixture is read with pandas and passed straight to
    ``dataset.extract_features``; the result must match three rows of
    ``id`` plus six named nutrition columns.
    """
    raw_recipes = pd.read_csv(
        'tests/integration/resources/raw_recipes_dump.csv')

    actual = dataset.extract_features(raw_recipes)

    feature_columns = [
        'id', 'calories', 'total_fat', 'sugar', 'sodium', 'protein',
        'saturated_fat',
    ]
    feature_rows = [
        [137739.0, 51.5, 0.0, 13.0, 0.0, 2.0, 0.0],
        [31490.0, 173.4, 18.0, 0.0, 17.0, 22.0, 35.0],
        [112140.0, 269.8, 22.0, 32.0, 48.0, 39.0, 27.0],
    ]
    expected = pd.DataFrame(feature_rows, columns=feature_columns)

    pdt.assert_frame_equal(expected, actual)
def test_should_explode_nutrition_column():
    """Check that the ``nutrition`` string is split into named columns.

    The input holds a single row with ``id`` and a stringified list of
    seven nutrition values. The expected frame has ``id`` followed by six
    named nutrition columns filled with the first six values; the seventh
    value (``5.0``) does not appear in it.
    """
    data = [[20.0, '[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]']]
    dataframe = pd.DataFrame(data, columns=['id', 'nutrition'])

    actual = dataset.extract_features(dataframe)
    expected = pd.DataFrame(
        [[20.0, 269.8, 22.0, 32.0, 48.0, 39.0, 27.0]],
        columns=[
            'id', 'calories', 'total_fat', 'sugar',
            'sodium', 'protein', 'saturated_fat',
        ],
    )

    # No debug printing here: assert_frame_equal already describes the
    # mismatch in its AssertionError when the frames differ.
    pdt.assert_frame_equal(expected, actual)
示例#5
0
#!/usr/bin/env python3
import logging
import sys

import pandas as pd

from recipes_recommendation import dataset

LOG = logging.getLogger(__name__)

if __name__ == "__main__":
    # Job entry point: read the raw recipes CSV given as the first CLI
    # argument, extract features from it, and write them to
    # ``recipe_features.csv`` inside the directory given as the second
    # CLI argument.
    import os

    # Without a configured handler the INFO records below are discarded
    # (the root logger defaults to WARNING), so set one up for the job.
    logging.basicConfig(level=logging.INFO)

    # Fail with a readable usage message instead of a bare IndexError.
    if len(sys.argv) < 3:
        sys.exit("usage: %s DATASET_PATH OUTPUT_DIR" % sys.argv[0])

    dataset_path = str(sys.argv[1])
    output_path = str(sys.argv[2])
    LOG.info("Starting job: %s", __name__)

    LOG.info("Loading dataset from: %s", dataset_path)
    raw_dataframe = pd.read_csv(dataset_path)
    LOG.info("Done loading dataset")

    LOG.info("Start extracting features")
    features_dataframe = dataset.extract_features(raw_dataframe)
    LOG.info("Done extracting features")

    LOG.info("Start saving features to: %s", output_path)
    # Join instead of concatenating so the file lands inside output_path
    # whether or not the argument ends with a path separator.
    features_file = os.path.join(output_path, 'recipe_features.csv')
    features_dataframe.to_csv(features_file, index=False)
    LOG.info("Done saving features")

    LOG.info("Finishing job")