Example #1
def test_rename(pipeline_executor):
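    # Renames colA -> col_a and colB -> col_b; the untouched colC column is kept as-is.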
    df, _ = pipeline_executor(
        Pipeline(steps=[
            {
                'name': 'domain',
                'domain': 'domain_a'
            },
            {
                'name': 'rename',
                'toRename': [['colA', 'col_a'], ['colB', 'col_b']]
            },
        ]))

    assert_dataframes_equals(
        df,
        pd.DataFrame({
            'col_a': ['toto', 'tutu', 'tata'],
            'col_b': [1, 2, 3],
            'colC': [100, 50, 25]
        }),
    )
Example #2
def test_simple_condition_strings():
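    # 'eq' on a string column: matching rows get "foo", the rest "bar" (the escaped quotes mark string literals).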
    sample_df = DataFrame({'a_str': ["test", "test", "autre chose"]})
    result_df = IfthenelseStep(
        **{
            'name': 'ifthenelse',
            'newColumn': 'test',
            'if': {
                'column': 'a_str',
                'value': "test",
                'operator': 'eq'
            },
            'then': "\"foo\"",
            'else': "\"bar\"",
        }).execute(sample_df)

    expected_df = DataFrame({
        'a_str': ["test", "test", "autre chose"],
        'test': ["foo", "foo", "bar"]
    })

    assert_dataframes_equals(result_df, expected_df)
Example #3
def test_concatenate():
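    # Concatenates NAME, AGE and SCORE into a single 'newcol' string, joined with ' - '.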
    sample_df = DataFrame({'NAME': ['foo', 'bar'], 'AGE': [42, 43], 'SCORE': [100, 200]})

    step = ConcatenateStep(
        name='concatenate',
        columns=['NAME', 'AGE', 'SCORE'],
        separator=' - ',
        new_column_name='newcol',
    )

    df_result = execute_concatenate(step, sample_df)

    expected_result = DataFrame(
        {
            'NAME': ['foo', 'bar'],
            'AGE': [42, 43],
            'SCORE': [100, 200],
            'newcol': ['foo - 42 - 100', 'bar - 43 - 200'],
        }
    )
    assert_dataframes_equals(df_result, expected_result)
Example #4
def test_isnull():
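    # 'isnull' matches only the missing value; then/else produce 1 for null, 0 otherwise.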
    df = DataFrame({'a_bool': [True, False, None]})
    step = IfthenelseStep(
        **{
            "name": "ifthenelse",
            "if": {
                "column": "a_bool",
                "operator": "isnull",
                "value": None
            },
            "newColumn": "test",
            "then": "1",
            "else": "0",
        })

    result = step.execute(df)
    assert_dataframes_equals(
        result, DataFrame({
            'a_bool': [True, False, None],
            'test': [0, 0, 1]
        }))
Example #5
def test_rollup(sample_df: DataFrame):
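    # Rolls up summed VALUEs along CONTINENT > COUNTRY > CITY, producing one row per node with its label, level and parent.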
    df_result = RollupStep(
        name='rollup',
        hierarchy=['CONTINENT', 'COUNTRY', 'CITY'],
        aggregations=[
            {
                'newcolumns': ['VALUE'],
                'aggfunction': 'sum',
                'columns': ['VALUE']
            },
        ],
    ).execute(sample_df)

    columns = [
        'CITY', 'COUNTRY', 'CONTINENT', 'label', 'level', 'parent', 'VALUE'
    ]
    expected_data = [
        [None, None, 'Europe', 'Europe', 'CONTINENT', None, 64],
        [None, None, 'North America', 'North America', 'CONTINENT', None, 112],
        [None, 'France', 'Europe', 'France', 'COUNTRY', 'Europe', 36],
        [None, 'Spain', 'Europe', 'Spain', 'COUNTRY', 'Europe', 28],
        [
            None, 'Canada', 'North America', 'Canada', 'COUNTRY',
            'North America', 40
        ],
        [None, 'USA', 'North America', 'USA', 'COUNTRY', 'North America', 72],
        ['Bordeaux', 'France', 'Europe', 'Bordeaux', 'CITY', 'France', 13],
        ['Paris', 'France', 'Europe', 'Paris', 'CITY', 'France', 23],
        ['Barcelona', 'Spain', 'Europe', 'Barcelona', 'CITY', 'Spain', 19],
        ['Madrid', 'Spain', 'Europe', 'Madrid', 'CITY', 'Spain', 9],
        [
            'Montreal', 'Canada', 'North America', 'Montreal', 'CITY',
            'Canada', 20
        ],
        ['Ottawa', 'Canada', 'North America', 'Ottawa', 'CITY', 'Canada', 20],
        ['Boston', 'USA', 'North America', 'Boston', 'CITY', 'USA', 27],
        ['New-York', 'USA', 'North America', 'New-York', 'CITY', 'USA', 45],
    ]
    expected_result = DataFrame(expected_data, columns=columns)
    assert_dataframes_equals(df_result, expected_result)
Example #6
def test_unpivot_with_dropna_false(sample_df: DataFrame):
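    # With dropna=False, unpivoted rows with a missing VALUE are kept instead of being dropped.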
    step = UnpivotStep(
        name='unpivot',
        keep=['COMPANY', 'COUNTRY'],
        unpivot=['NB_CLIENTS', 'REVENUES'],
        unpivot_column_name='KPI',
        value_column_name='VALUE',
        dropna=False,
    )
    result = execute_unpivot(step,
                             sample_df,
                             domain_retriever=None,
                             execute_pipeline=None)
    expected_result = DataFrame({
        'COMPANY': ['Company 1'] * 2 + ['Company 2'] * 2 + ['Company 1'] * 2 +
        ['Company 2'] * 2,
        'COUNTRY': ['France'] * 4 + ['USA'] * 4,
        'KPI': ['NB_CLIENTS', 'REVENUES'] * 4,
        'VALUE': [7, 10, 2, None, 12, 6, 1, 3],
    })
    assert_dataframes_equals(result.sort_values(['COUNTRY', 'COMPANY', 'KPI']),
                             expected_result)
Example #7
def test_append_with_domain_name(
    sample_df: DataFrame,
    mock_domain_retriever: DomainRetriever,
    mock_execute_pipeline: PipelineExecutor,
):
    """
    It should accept a domain name instead of a complete pipeline
    """
    df_result = AppendStep(
        name='append',
        pipelines=['miam'],
    ).execute(sample_df,
              domain_retriever=mock_domain_retriever,
              execute_pipeline=mock_execute_pipeline)

    expected_result = DataFrame({
        'name': ['foo', 'bar', 'miam'],
        'age': [42, 43, None],
        'score': [100, 200, 999],
        'lambda': [None, None, 'p'],
    })
    assert_dataframes_equals(df_result, expected_result)
Example #8
def test_missing_date(today):
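    # Days 10 and 11 are absent from the input; the step should insert them with a None value.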
    dates = [today + timedelta(days=nb_day) for nb_day in list(range(1, 10)) + list(range(12, 20))]
    missing_dates = [today + timedelta(days=10), today + timedelta(days=11)]

    values = [idx for (idx, value) in enumerate(dates)]
    df = pd.DataFrame(
        {
            'date': dates,
            'value': values,
        }
    )

    step = AddMissingDatesStep(
        name='addmissingdates', datesColumn='date', datesGranularity='day', groups=[]
    )

    result = step.execute(df)
    expected_result = pd.concat(
        [df, pd.DataFrame({'date': missing_dates, 'value': [None, None]})]
    ).sort_values(by='date')

    assert_dataframes_equals(result, expected_result)
Example #9
def test_with_original_granularity(sample_df):
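    # keepOriginalGranularity=True preserves every input row and adds the per-Group sum as a 'Total' column.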
    df_result = AggregateStep(
        name='aggregate',
        keepOriginalGranularity=True,
        on=['Group'],
        aggregations=[
            Aggregation(aggfunction='sum', columns=['Value1'], newcolumns=['Total']),
        ],
    ).execute(sample_df)

    assert_dataframes_equals(
        df_result,
        DataFrame(
            {
                'Label': ['Label 1', 'Label 2', 'Label 3', 'Label 4', 'Label 5', 'Label 6'],
                'Group': ['Group 1'] * 3 + ['Group 2'] * 3,
                'Value1': [13, 7, 20, 1, 10, 5],
                'Total': [40] * 3 + [16] * 3,
                'Value2': [10, 21, 4, 17, 12, 2],
            }
        ),
    )
Example #10
def test_keep_less_columns(sample_df):
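    # Splits 'Label' on '-' but keeps only the first 2 of the 3 resulting parts.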
    result_df = SplitStep(name='split',
                          column='Label',
                          delimiter='-',
                          number_cols_to_keep=2).execute(sample_df)
    expected_df = pd.DataFrame({
        'Label': [
            'Label 1 - Groupe 1 - France',
            'Label 2 - Groupe 1 - Spain',
            'Label 3 - Groupe 1 - USA',
            'Label 4 - Groupe 2 - France',
            'Label 5 - Groupe 2 - Spain',
            'Label 6 - Groupe 2 - USA',
        ],
        'Label_1': [
            'Label 1 ', 'Label 2 ', 'Label 3 ', 'Label 4 ', 'Label 5 ',
            'Label 6 '
        ],
        'Label_2': [' Groupe 1 '] * 3 + [' Groupe 2 '] * 3,
        'Values': [13, 7, 20, 1, 10, 5],
    })
    assert_dataframes_equals(result_df, expected_df)
Example #11
def test_then_should_support_formulas():
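    # then/else are evaluated as formulas, so they can reference other columns.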
    base_df = DataFrame({'a_bool': [True, False, True], 'a_number': [1, 2, 3]})
    result_df = IfthenelseStep(
        **{
            'name': 'ifthenelse',
            'newColumn': 'result',
            'if': {
                'column': 'a_bool',
                'value': True,
                'operator': 'eq'
            },
            'then': 'a_number',
            'else': 'a_number * -1',
        }).execute(base_df)

    expected_df = DataFrame({
        'a_bool': [True, False, True],
        'a_number': [1, 2, 3],
        'result': [1, -2, 3]
    })

    assert_dataframes_equals(result_df, expected_df)
Example #12
def test_duration(time_delta_parameters: Dict[str, int], duration_in: str,
                  expected_result: float):
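    # The duration between START_DATE and END_DATE is expressed in the unit given by duration_in.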
    step = DurationStep(
        name='duration',
        newColumnName='DURATION',
        startDateColumn='START_DATE',
        endDateColumn='END_DATE',
        durationIn=duration_in,
    )

    now = datetime.now()
    delta = timedelta(**time_delta_parameters)
    sample_df = pd.DataFrame({'START_DATE': [now], 'END_DATE': [now + delta]})

    result_df = execute_duration(step, sample_df)

    expected_df = pd.DataFrame({
        'START_DATE': [now],
        'END_DATE': [now + delta],
        'DURATION': [expected_result],
    })

    assert_dataframes_equals(result_df, expected_df)
Example #13
def test_or_logical_conditions(sample_df):
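    # Keeps rows matching either condition: colA == 'toto' OR colC < 33.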
    step = FilterStep(
        name='filter',
        condition={
            'or': [
                {
                    'column': 'colA',
                    'operator': 'eq',
                    'value': 'toto',
                },
                {
                    'column': 'colC',
                    'operator': 'lt',
                    'value': 33,
                },
            ]
        },
    )
    df_result = execute_filter(step, sample_df)

    assert_dataframes_equals(
        df_result, DataFrame({'colA': ['toto', 'tata'], 'colB': [1, 3], 'colC': [100, 25]})
    )
Example #14
def test_missing_date_with_groups_various_length(today):
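    # Each country covers a different number of rows; the missing months (5-7) must be filled per group with None values.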
    dates = [
        datetime.datetime(year=2020, month=nb_month, day=1)
        for nb_month in list(range(1, 5)) + list(range(8, 10))
    ]

    missing_dates = [datetime.datetime(year=2020, month=nb_month, day=1) for nb_month in [5, 6, 7]]

    values = [idx for (idx, value) in enumerate(dates)]
    df = pd.DataFrame(
        {
            'date': dates + dates[0:-1],
            'country': ['France'] * len(dates) + ['USA'] * (len(dates) - 1),
            'value': values + values[0:-1],
        }
    )

    step = AddMissingDatesStep(
        name='addmissingdates', datesColumn='date', datesGranularity='month', groups=['country']
    )
    result = step.execute(df)
    expected_result = pd.concat(
        [
            df,
            pd.DataFrame(
                {
                    'country': cast(
                        List[Optional[Any]],
                        ['France'] * len(missing_dates) + ['USA'] * len(missing_dates),
                    ),
                    'date': missing_dates * 2,
                    'value': [None] * len(missing_dates) * 2,
                }
            ),
        ]
    ).sort_values(by=['country', 'date'])
    assert_dataframes_equals(result, expected_result)
Example #15
def test_join_left(
    sample_df: DataFrame,
    mock_domain_retriever: DomainRetriever,
    mock_execute_pipeline: PipelineExecutor,
):
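    # Left join on NAME == name: left rows without a match keep None in the right-hand columns.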
    step = JoinStep(
        name='join',
        right_pipeline=[{'name': 'domain', 'domain': 'buzz'}],
        on=[
            ['NAME', 'name'],
        ],
        type='left',
    )
    df_result = execute_join(
        step,
        sample_df,
        domain_retriever=mock_domain_retriever,
        execute_pipeline=mock_execute_pipeline,
    )

    expected_result = DataFrame(
        {'NAME': ['foo', 'bar'], 'name': [None, 'bar'], 'AGE': [42, 43], 'score': [None, 100]}
    )
    assert_dataframes_equals(df_result, expected_result)
Example #16
def test_cumsum_with_groups():
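    # Cumulative sum of 'value' computed independently within each country, ordered by 'date'.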
    sample_df = DataFrame({
        'date':
        ['2019-01', '2019-02', '2019-03', '2019-04', '2019-05', '2019-06'] * 2,
        'country': ['France'] * 6 + ['USA'] * 6,
        'value': [2, 5, 3, 8, 9, 6] + [10, 6, 6, 4, 8, 7],
    })

    df_result = CumSumStep(
        name='cumsum',
        valueColumn='value',
        referenceColumn='date',
        groupby=['country'],
        newColumn='my_cumsum',
    ).execute(sample_df)

    expected_result = DataFrame({
        'date':
        ['2019-01', '2019-02', '2019-03', '2019-04', '2019-05', '2019-06'] * 2,
        'country': ['France'] * 6 + ['USA'] * 6,
        'value': [2, 5, 3, 8, 9, 6] + [10, 6, 6, 4, 8, 7],
        'my_cumsum': [2, 7, 10, 18, 27, 33] + [10, 16, 22, 26, 34, 41],
    })
    assert_dataframes_equals(df_result, expected_result.sort_values('date'))
Example #17
def test_simple_with_aggregation():
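    # Waterfall from milestone 2018 to 2019: duplicate city rows are aggregated and contributions sorted by value, descending.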
    sample_df = pd.DataFrame({
        'city': ['Bordeaux', 'Boston', 'New-York', 'Paris', 'Paris'] * 2,
        'year': [2019] * 5 + [2018] * 5,
        'revenue': [135, 275, 115, 450, 10, 98, 245, 103, 385, 10],
    })
    result_df = WaterfallStep(
        name='waterfall',
        valueColumn='revenue',
        milestonesColumn='year',
        start=2018,
        end=2019,
        labelsColumn='city',
        sortBy='value',
        order='desc',
    ).execute(sample_df)

    expected_df = pd.DataFrame({
        'LABEL_waterfall':
        ['2018', 'Paris', 'Bordeaux', 'Boston', 'New-York', '2019'],
        'TYPE_waterfall': [None, 'Parent', 'Parent', 'Parent', 'Parent', None],
        'revenue': [841, 65, 37, 30, 12, 985],
    })
    assert_dataframes_equals(result_df, expected_df)
Example #18
def test_moving_average_with_groups():
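    # 3-point moving average computed per country; the first two rows of each group have no average yet.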
    df = DataFrame({
        'country': ['France'] * 6 + ['USA'] * 6,
        'date': [f'2018-01-0{i}' for i in range(1, 7)] * 2,
        'value': [75, 80, 82, 83, 80, 86] + [69, 73, 73, 75, 70, 76],
    })
    df['date'] = pd.to_datetime(df['date'])

    step = MovingAverageStep(
        name='movingaverage',
        valueColumn='value',
        columnToSort='date',
        movingWindow=3,
        groups=['country'],
        newColumnName='rolling_average',
    )
    df_result = execute_moving_average(step, df)

    expected_result = df.assign(
        **{
            'rolling_average': [None, None, 79, 81.6667, 81.6667, 83] +
            [None, None, 71.6667, 73.6667, 72.6667, 73.6667]
        })
    assert_dataframes_equals(df_result, expected_result)
Example #19
def test_waterfall_bug_drill():
    """
    Tuple (label, parent) should be unique only among one "group by" sub-df.
    """
    base_df = pd.DataFrame({
        'grand parent': ['Food', 'Vegetarian', 'Fruits'] * 2,
        'parent': ['Vegetarian', 'Fruits', 'Berries'] * 2,
        'label': ['Fruits', 'Berries', 'Blueberries'] * 2,
        'variable': ['A'] * 3 + ['B'] * 3,
        'value': [1, 2, 3, 11, 12, 13],
    })
    step = WaterfallStep(
        name='waterfall',
        valueColumn='value',
        milestonesColumn='variable',
        start='A',
        end='B',
        labelsColumn='label',
        parentsColumn='parent',
        groupby=['grand parent'],
        sortBy='label',
        order='asc',
    )
    result = execute_waterfall(step, base_df)
    assert_dataframes_equals(
        result,
        pd.DataFrame({
            'grand parent': [
                'Food',
                'Vegetarian',
                'Fruits',
                'Vegetarian',
                'Fruits',
                'Fruits',
                'Food',
                'Vegetarian',
                'Food',
                'Food',
                'Vegetarian',
                'Fruits',
            ],
            'LABEL_waterfall': ['A'] * 3 + [
                'Berries', 'Berries', 'Blueberries', 'Fruits', 'Fruits',
                'Vegetarian'
            ] + ['B'] * 3,
            'value': [1, 2, 3] + [10] * 6 + [11, 12, 13],
            'GROUP_waterfall': ['A'] * 3 + [
                'Fruits', 'Berries', 'Berries', 'Vegetarian', 'Fruits',
                'Vegetarian'
            ] + ['B'] * 3,
            'TYPE_waterfall': [
                None,
                None,
                None,
                'child',
                'parent',
                'child',
                'child',
                'parent',
                'parent',
                None,
                None,
                None,
            ],
        }),
    )
Example #20
def test_complex_rollup(sample_df: DataFrame):
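    # Rollup grouped by YEAR, with two aggregations (sum and avg) and custom label/level/parent column names.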
    sample_df = sample_df.assign(COUNT=1)
    step = RollupStep(
        name='rollup',
        hierarchy=['CONTINENT', 'COUNTRY', 'CITY'],
        aggregations=[
            {
                'newcolumns': ['VALUE-sum', 'COUNT'],
                'aggfunction': 'sum',
                'columns': ['VALUE', 'COUNT'],
            },
            {'newcolumns': ['VALUE-avg'], 'aggfunction': 'avg', 'columns': ['VALUE']},
        ],
        groupby=['YEAR'],
        labelCol='MY_LABEL',
        levelCol='MY_LEVEL',
        parentLabelCol='MY_PARENT',
    )
    df_result = execute_rollup(step, sample_df)

    columns = [
        'CITY',
        'COUNTRY',
        'CONTINENT',
        'YEAR',
        'MY_LABEL',
        'MY_LEVEL',
        'MY_PARENT',
        'VALUE-sum',
        'VALUE-avg',
        'COUNT',
    ]
    expected_data = [
        [None, None, 'Europe', 2018, 'Europe', 'CONTINENT', None, 26, 6.5, 4],
        [None, None, 'North America', 2018, 'North America', 'CONTINENT', None, 50, 12.5, 4],
        [None, None, 'Europe', 2019, 'Europe', 'CONTINENT', None, 38, 9.5, 4],
        [None, None, 'North America', 2019, 'North America', 'CONTINENT', None, 62, 15.5, 4],
        [None, 'France', 'Europe', 2018, 'France', 'COUNTRY', 'Europe', 15, 7.5, 2],
        [None, 'Spain', 'Europe', 2018, 'Spain', 'COUNTRY', 'Europe', 11, 5.5, 2],
        [None, 'Canada', 'North America', 2018, 'Canada', 'COUNTRY', 'North America', 17, 8.5, 2],
        [None, 'USA', 'North America', 2018, 'USA', 'COUNTRY', 'North America', 33, 16.5, 2],
        [None, 'France', 'Europe', 2019, 'France', 'COUNTRY', 'Europe', 21, 10.5, 2],
        [None, 'Spain', 'Europe', 2019, 'Spain', 'COUNTRY', 'Europe', 17, 8.5, 2],
        [None, 'Canada', 'North America', 2019, 'Canada', 'COUNTRY', 'North America', 23, 11.5, 2],
        [None, 'USA', 'North America', 2019, 'USA', 'COUNTRY', 'North America', 39, 19.5, 2],
        ['Bordeaux', 'France', 'Europe', 2018, 'Bordeaux', 'CITY', 'France', 5, 5, 1],
        ['Paris', 'France', 'Europe', 2018, 'Paris', 'CITY', 'France', 10, 10, 1],
        ['Barcelona', 'Spain', 'Europe', 2018, 'Barcelona', 'CITY', 'Spain', 8, 8, 1],
        ['Madrid', 'Spain', 'Europe', 2018, 'Madrid', 'CITY', 'Spain', 3, 3, 1],
        ['Montreal', 'Canada', 'North America', 2018, 'Montreal', 'CITY', 'Canada', 10, 10, 1],
        ['Ottawa', 'Canada', 'North America', 2018, 'Ottawa', 'CITY', 'Canada', 7, 7, 1],
        ['Boston', 'USA', 'North America', 2018, 'Boston', 'CITY', 'USA', 12, 12, 1],
        ['New-York', 'USA', 'North America', 2018, 'New-York', 'CITY', 'USA', 21, 21, 1],
        ['Bordeaux', 'France', 'Europe', 2019, 'Bordeaux', 'CITY', 'France', 8, 8, 1],
        ['Paris', 'France', 'Europe', 2019, 'Paris', 'CITY', 'France', 13, 13, 1],
        ['Barcelona', 'Spain', 'Europe', 2019, 'Barcelona', 'CITY', 'Spain', 11, 11, 1],
        ['Madrid', 'Spain', 'Europe', 2019, 'Madrid', 'CITY', 'Spain', 6, 6, 1],
        ['Montreal', 'Canada', 'North America', 2019, 'Montreal', 'CITY', 'Canada', 10, 10, 1],
        ['Ottawa', 'Canada', 'North America', 2019, 'Ottawa', 'CITY', 'Canada', 13, 13, 1],
        ['Boston', 'USA', 'North America', 2019, 'Boston', 'CITY', 'USA', 15, 15, 1],
        ['New-York', 'USA', 'North America', 2019, 'New-York', 'CITY', 'USA', 24, 24, 1],
    ]
    expected_result = DataFrame(expected_data, columns=columns)
    assert_dataframes_equals(df_result, expected_result)
Example #21
def test_date_extract_(sample_df: DataFrame):
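    # Extracts every supported date part into a dedicated 'date_*' column; missing dates yield None everywhere.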
    step = DateExtractStep(
        name='dateextract',
        column='date',
        dateInfo=[
            'year',
            'month',
            'day',
            'week',
            'quarter',
            'dayOfWeek',
            'dayOfYear',
            'isoYear',
            'isoWeek',
            'isoDayOfWeek',
            'firstDayOfYear',
            'firstDayOfMonth',
            'firstDayOfWeek',
            'firstDayOfQuarter',
            'firstDayOfIsoWeek',
            'previousDay',
            'firstDayOfPreviousYear',
            'firstDayOfPreviousMonth',
            'firstDayOfPreviousWeek',
            'firstDayOfPreviousQuarter',
            'firstDayOfPreviousIsoWeek',
            'previousYear',
            'previousMonth',
            'previousWeek',
            'previousQuarter',
            'previousIsoWeek',
            'hour',
            'minutes',
            'seconds',
            'milliseconds',
        ],
        newColumns=[
            'date_year',
            'date_month',
            'date_day',
            'date_week',
            'date_quarter',
            'date_dayOfWeek',
            'date_dayOfYear',
            'date_isoYear',
            'date_isoWeek',
            'date_isoDayOfWeek',
            'date_firstDayOfYear',
            'date_firstDayOfMonth',
            'date_firstDayOfWeek',
            'date_firstDayOfQuarter',
            'date_firstDayOfIsoWeek',
            'date_previousDay',
            'date_firstDayOfPreviousYear',
            'date_firstDayOfPreviousMonth',
            'date_firstDayOfPreviousWeek',
            'date_firstDayOfPreviousQuarter',
            'date_firstDayOfPreviousIsoWeek',
            'date_previousYear',
            'date_previousMonth',
            'date_previousWeek',
            'date_previousQuarter',
            'date_previousIsoWeek',
            'date_hour',
            'date_minutes',
            'date_seconds',
            'date_milliseconds',
        ],
    )
    df_result = execute_date_extract(step, sample_df)
    expected_result = DataFrame(
        {
            'date': to_datetime(
                [
                    '2021-03-29T00:00:00.000Z',
                    '2020-12-13T00:00:00.000Z',
                    '2020-07-29T00:00:00.000Z',
                    '2019-04-09T01:02:03.004Z',
                    '2017-01-02T00:00:00.000Z',
                    '2016-01-01T00:00:00.000Z',
                    None,
                ]
            ),
            'date_year': [2021, 2020, 2020, 2019, 2017, 2016, None],
            'date_month': [3, 12, 7, 4, 1, 1, None],
            'date_day': [29, 13, 29, 9, 2, 1, None],
            'date_week': [13, 50, 30, 14, 1, 0, None],
            'date_quarter': [1, 4, 3, 2, 1, 1, None],
            'date_dayOfWeek': [2, 1, 4, 3, 2, 6, None],
            'date_dayOfYear': [88, 348, 211, 99, 2, 1, None],
            'date_isoYear': [2021, 2020, 2020, 2019, 2017, 2015, None],
            'date_isoWeek': [13, 50, 31, 15, 1, 53, None],
            'date_isoDayOfWeek': [1, 7, 3, 2, 1, 5, None],
            'date_firstDayOfYear': to_datetime(
                [
                    "2021-01-01T00:00:00.000Z",
                    "2020-01-01T00:00:00.000Z",
                    "2020-01-01T00:00:00.000Z",
                    "2019-01-01T00:00:00.000Z",
                    "2017-01-01T00:00:00.000Z",
                    "2016-01-01T00:00:00.000Z",
                    None,
                ]
            ),
            'date_firstDayOfMonth': to_datetime(
                [
                    "2021-03-01T00:00:00.000Z",
                    "2020-12-01T00:00:00.000Z",
                    "2020-07-01T00:00:00.000Z",
                    "2019-04-01T00:00:00.000Z",
                    "2017-01-01T00:00:00.000Z",
                    "2016-01-01T00:00:00.000Z",
                    None,
                ]
            ),
            'date_firstDayOfWeek': to_datetime(
                [
                    "2021-03-28T00:00:00.000Z",
                    "2020-12-13T00:00:00.000Z",
                    "2020-07-26T00:00:00.000Z",
                    "2019-04-07T00:00:00.000Z",
                    "2017-01-01T00:00:00.000Z",
                    "2015-12-27T00:00:00.000Z",
                    None,
                ]
            ),
            'date_firstDayOfQuarter': to_datetime(
                [
                    "2021-01-01T00:00:00.000Z",
                    "2020-10-01T00:00:00.000Z",
                    "2020-07-01T00:00:00.000Z",
                    "2019-04-01T00:00:00.000Z",
                    "2017-01-01T00:00:00.000Z",
                    "2016-01-01T00:00:00.000Z",
                    None,
                ]
            ),
            'date_firstDayOfIsoWeek': to_datetime(
                [
                    "2021-03-29T00:00:00.000Z",
                    "2020-12-07T00:00:00.000Z",
                    "2020-07-27T00:00:00.000Z",
                    "2019-04-08T00:00:00.000Z",
                    "2017-01-02T00:00:00.000Z",
                    "2015-12-28T00:00:00.000Z",
                    None,
                ]
            ),
            'date_previousDay': to_datetime(
                [
                    "2021-03-28T00:00:00.000Z",
                    "2020-12-12T00:00:00.000Z",
                    "2020-07-28T00:00:00.000Z",
                    "2019-04-08T00:00:00.000Z",
                    "2017-01-01T00:00:00.000Z",
                    "2015-12-31T00:00:00.000Z",
                    None,
                ]
            ),
            'date_firstDayOfPreviousYear': to_datetime(
                [
                    "2020-01-01T00:00:00.000Z",
                    "2019-01-01T00:00:00.000Z",
                    "2019-01-01T00:00:00.000Z",
                    "2018-01-01T00:00:00.000Z",
                    "2016-01-01T00:00:00.000Z",
                    "2015-01-01T00:00:00.000Z",
                    None,
                ]
            ),
            'date_firstDayOfPreviousMonth': to_datetime(
                [
                    "2021-02-01T00:00:00.000Z",
                    "2020-11-01T00:00:00.000Z",
                    "2020-06-01T00:00:00.000Z",
                    "2019-03-01T00:00:00.000Z",
                    "2016-12-01T00:00:00.000Z",
                    "2015-12-01T00:00:00.000Z",
                    None,
                ]
            ),
            'date_firstDayOfPreviousWeek': to_datetime(
                [
                    "2021-03-21T00:00:00.000Z",
                    "2020-12-06T00:00:00.000Z",
                    "2020-07-19T00:00:00.000Z",
                    "2019-03-31T00:00:00.000Z",
                    "2016-12-25T00:00:00.000Z",
                    "2015-12-20T00:00:00.000Z",
                    None,
                ]
            ),
            'date_firstDayOfPreviousQuarter': to_datetime(
                [
                    "2020-10-01T00:00:00.000Z",
                    "2020-07-01T00:00:00.000Z",
                    "2020-04-01T00:00:00.000Z",
                    "2019-01-01T00:00:00.000Z",
                    "2016-10-01T00:00:00.000Z",
                    "2015-10-01T00:00:00.000Z",
                    None,
                ]
            ),
            'date_firstDayOfPreviousIsoWeek': to_datetime(
                [
                    "2021-03-22T00:00:00.000Z",
                    "2020-11-30T00:00:00.000Z",
                    "2020-07-20T00:00:00.000Z",
                    "2019-04-01T00:00:00.000Z",
                    "2016-12-26T00:00:00.000Z",
                    "2015-12-21T00:00:00.000Z",
                    None,
                ]
            ),
            'date_previousYear': [2020, 2019, 2019, 2018, 2016, 2015, None],
            'date_previousMonth': [2, 11, 6, 3, 12, 12, None],
            'date_previousQuarter': [4, 3, 2, 1, 4, 4, None],
            'date_previousWeek': [12, 49, 29, 13, 52, 51, None],
            'date_previousIsoWeek': [12, 49, 30, 14, 52, 52, None],
            'date_hour': [0, 0, 0, 1, 0, 0, None],
            'date_minutes': [0, 0, 0, 2, 0, 0, None],
            'date_seconds': [0, 0, 0, 3, 0, 0, None],
            'date_milliseconds': [0, 0, 0, 4, 0, 0, None],
        }
    )
    assert_dataframes_equals(df_result, expected_result)

    # Ensure there are no unsigned int types in result:
    assert UInt32Dtype() not in list(df_result.dtypes)
Example #22
def test_duplicate():
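    # Copies column 'x' into a new column 'y'.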
    input_df = DataFrame({'x': [100, 200]})
    step = DuplicateStep(name='duplicate', column='x', new_column_name='y')
    df_result = execute_duplicate(step, input_df)
    expected_result = DataFrame({'x': [100, 200], 'y': [100, 200]})
    assert_dataframes_equals(df_result, expected_result)
Example #23
def test_convert_to_text(sample_df: DataFrame):
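    # Converting to text stringifies every value, including None and the non-numeric 'meh'.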
    step = ConvertStep(name='convert', columns=['value'], data_type='text')
    df_result = execute_convert(step, sample_df)

    expected_result = DataFrame({'value': ['41', '42', '43.5', '43.6', 'None', 'meh']})
    assert_dataframes_equals(df_result, expected_result)
Example #24
def test_convert_to_integer(sample_df: DataFrame):
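    # Converting to integer truncates floats and maps non-convertible values to None.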
    step = ConvertStep(name='convert', columns=['value'], data_type='integer')
    df_result = execute_convert(step, sample_df)

    expected_result = DataFrame({'value': [41, 42, 43, 43, None, None]})
    assert_dataframes_equals(df_result, expected_result)
Example #25
def test_convert_to_float(sample_df: DataFrame):
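    # Converting to float keeps numeric values and maps non-convertible values to None.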
    step = ConvertStep(name='convert', columns=['value'], data_type='float')
    df_result = execute_convert(step, sample_df)

    expected_result = DataFrame({'value': [41.0, 42.0, 43.5, 43.6, None, None]})
    assert_dataframes_equals(df_result, expected_result)