def test_process_parquet_add_twice(setup_queue_event, create_table_mock,
                                   test_data1, test_data2):
    event1 = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0), data=test_data1))

    process_handler(event1, None)

    event2 = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0), data=test_data2))

    process_handler(event2, None)

    create_table_mock.assert_table_data_column(
        'google_sheets_metadata', 'uploaded_by_user',
        pd.Series(['[email protected]', '[email protected]']))
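
# The tests in this collection lean on shared pytest fixtures
# (setup_queue_event, create_table_mock, s3_bucket, athena, ...) defined in a
# conftest.py that is not shown here. Below is a minimal sketch of what
# setup_queue_event might look like, assuming the handlers consume SQS-style
# events whose record body carries the serialized schema.Data payload; the
# event envelope and the to_json() call are assumptions, not taken from these
# snippets.
import pytest


@pytest.fixture
def setup_queue_event():
    def make_event(data):
        # Hypothetical envelope: a single-record SQS event whose body holds
        # the serialized payload.
        return {'Records': [{'body': data.to_json()}]}

    return make_event
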
def test_insert_data_quiz_process(setup_queue_event, test_data_quiz,
                                  create_table_mock):
    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0),
                    data=test_data_quiz))

    processHandler(event, None)

    create_table_mock.assert_table_data_column(
        'google_forms_data', 'type',
        pd.Series([
            'SCALE', 'MULTIPLE_CHOICE', 'TEXT', 'CHECKBOX', 'MULTIPLE_CHOICE',
            'LIST', 'DATE', 'TIME'
        ]))

    create_table_mock.assert_table_data_column(
        'google_forms_data', 'form_name',
        pd.Series([
            'test_quiz', 'test_quiz', 'test_quiz', 'test_quiz', 'test_quiz',
            'test_quiz', 'test_quiz', 'test_quiz'
        ]))

    create_table_mock.assert_table_data_column(
        'google_forms_data', 'uploaded_by_user',
        pd.Series([
            'test_person', 'test_person', 'test_person', 'test_person',
            'test_person', 'test_person', 'test_person', 'test_person'
        ]))

    create_table_mock.assert_table_data_column(
        'google_forms_data', 'is_quiz',
        pd.Series([True, True, True, True, True, True, True, True]))
def test_s3_put_data(s3_bucket):
    s3 = aws.S3()
    key = s3.put(
        schema.Data(metadata=schema.Metadata(timestamp=0), data='test'))

    res = schema.Data.from_json(s3_bucket.Object(key).get()['Body'].read())
    assert res.data == 'test'
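
# test_s3_put_data above assumes an s3_bucket fixture that yields a boto3
# Bucket backed by a fake S3 endpoint. moto is a common way to build such a
# fixture, but whether this project uses moto, and the bucket name, are
# assumptions; treat this as a sketch, not the project's conftest.
import boto3
import pytest
from moto import mock_s3  # moto < 5.x; newer releases expose mock_aws instead


@pytest.fixture
def s3_bucket():
    with mock_s3():
        resource = boto3.resource('s3', region_name='us-east-1')
        # Hypothetical bucket name; the real one comes from project config.
        yield resource.create_bucket(Bucket='test-datalake')
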
def test_handler_call_process_overwrite_all_versions_empty_historical_data(
        s3_bucket, setup_queue_event):
    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0), data=''))
    process_handler = handler.ProcessHandler()

    @process_handler.process(partitions={},
                             overwrite=False,
                             overwrite_all_versions=True,
                             historical_tables=[])
    def test_process(data, events):
        return {'test': pd.DataFrame({'a': [1, 1, 1], 'b': [1, 2, 3]})}

    process_handler(event)
    process_handler(event)  # Called twice

    keys_in_s3 = [
        x.key for x in s3_bucket.objects.all() if 'structured' in x.key
    ]
    expected_keys = [
        'data/test/structured/test/_common_metadata',
        'data/test/structured/test/_metadata',
        'data/test/structured/test/part.0.parquet',
    ]
    assert keys_in_s3 == expected_keys
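# Note on the test above: with overwrite=False but overwrite_all_versions=True,
# calling the handler twice still leaves exactly one part.0.parquet, i.e. the
# second run replaces every previously written version of the 'test' dataset
# instead of appending a part.1.parquet alongside it.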
def test_insert_data_multiple_respondents(setup_queue_event,
                                          test_data_form_multiple_respondents,
                                          create_table_mock):
    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0),
                    data=test_data_form_multiple_respondents))

    processHandler(event, None)

    create_table_mock.assert_table_data_column(
        'google_forms_data', 'type',
        pd.Series([
            'TEXT',
            'PARAGRAPH_TEXT',
            'TEXT',
            'PARAGRAPH_TEXT',
        ]))

    create_table_mock.assert_table_data_column(
        'google_forms_data', 'responder',
        pd.Series([
            'test_user1',
            'test_user1',
            'test_user2',
            'test_user2',
        ]))
    create_table_mock.assert_table_data_column(
        'google_forms_data', 'is_quiz', pd.Series([False, False, False,
                                                   False]))
def test_handler_call_process_s3_parquet_partitioned_with_None_content_string(
        s3_bucket, setup_queue_event):
    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0), data=''))
    process_handler = handler.ProcessHandler()

    @process_handler.process(partitions={'test': ['a']})
    def test_process(data, events):
        return {
            'test': pd.DataFrame({
                'a': ['name0', 'name0', None],
                'b': [1, 2, 3]
            })
        }

    process_handler(event)

    keys_in_s3 = [
        x.key for x in s3_bucket.objects.all() if 'structured' in x.key
    ]
    expected_keys = [
        'data/test/structured/test/_common_metadata',
        'data/test/structured/test/_metadata',
        'data/test/structured/test/a=name0/part.0.parquet',
        'data/test/structured/test/a=undefined/part.0.parquet'
    ]

    assert keys_in_s3 == expected_keys
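# Note on the test above: a None value in the partition column does not drop
# the row; it is written to a literal 'a=undefined' partition directory, so
# the Hive-style layout stays complete even when the partition key is missing.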
def test_process_table_content_missing_born_date(setup_queue_event, test_data,
                                                 create_table_mock,
                                                 dynamodb_resource):
    tmp_data = test_data['data']
    tmp_data[0]['cv'].pop('born_year', None)

    event = setup_queue_event(
        schema.Data(
            metadata=schema.Metadata(timestamp=0),
            data=tmp_data))

    handler(event, None)
    create_table_mock.assert_table_data_contains_df(
        'cv_partner_employees',
        pd.DataFrame({
            'user_id': ['user_id_1', 'user_id_2'],
            'guid': ['20dbbfa18380233aa643575720b893fac5137699', '491b9fa9bfac17563882b0fdc6f3a8a97417bd99'],
            'default_cv_id': ['user_id_1_cv_id', 'user_id_2_cv_id'],
            'link': ["link1", "link2"],
            'navn': ['Test Testerson', 'Test Testerson 2'],
            'email': ['*****@*****.**', '*****@*****.**'],
            'telefon': ['+123456', '+123456'],
            'born_year': [-1, 1985],
            'nationality': ["Norwegian", "Swedish"],
            'place_of_residence': ['Oslo', 'Oslo'],
            'twitter': ["", "twitter2"]
        }))
def test_handler_call_process_s3_parquet_append_partitioned(
        s3_bucket, setup_queue_event):
    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0), data=''))
    process_handler = handler.ProcessHandler()

    @process_handler.process(partitions={'test': ['a']})
    def test_process(data, events):
        return {'test': pd.DataFrame({'a': [1, 1, 2], 'b': [1, 2, 3]})}

    process_handler(event)
    process_handler(event)  # Called twice

    keys_in_s3 = [
        x.key for x in s3_bucket.objects.all() if 'structured' in x.key
    ]
    expected_keys = [
        'data/test/structured/test/_common_metadata',
        'data/test/structured/test/_metadata',
        'data/test/structured/test/a=1/part.0.parquet',
        'data/test/structured/test/a=1/part.1.parquet',
        'data/test/structured/test/a=2/part.0.parquet',
        'data/test/structured/test/a=2/part.1.parquet',
    ]

    assert keys_in_s3 == expected_keys
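# Note on the test above: with no overwrite flags, each handler invocation
# appends a fresh part file (part.0.parquet, part.1.parquet, ...) inside every
# Hive-style partition directory (a=1, a=2) rather than replacing the ones
# already there.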
def test_s3_key(path, expected_key_pattern):
    s3 = aws.S3(access_path='abc/123')
    key = s3.put(schema.Data(metadata=schema.Metadata(timestamp=1234),
                             data=''),
                 path=path)

    assert re.fullmatch(expected_key_pattern, key)
def test_process_mixed_data(setup_queue_event, test_data3, create_table_mock):
    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0), data=test_data3))

    process_handler(event, None)

    create_table_mock.assert_table_data_column('test_test_b_enkel_test_ark_3',
                                               'c', pd.Series([5, 'a', 5]))
def test_insert_data_invalid_type(setup_queue_event, test_data_quiz_invalid,
                                  create_table_mock):
    invalid_file_quiz_event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0),
                    data=test_data_quiz_invalid))

    with pytest.raises(KeyError):
        processHandler(invalid_file_quiz_event, None)
def test_process_data_table_created(mocker, create_table_mock,
                                    setup_queue_event, test_data):
    tmp_data = test_data['data']

    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0), data=tmp_data))

    handler(event, None)
    create_table_mock.assert_table_created('google_calendar_events')
def test_process_data(create_table_mock, setup_queue_event, test_data):
    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0),
                    data=test_data['data']))

    handler(event, None)

    create_table_mock.assert_table_data_column('ubw_fagtimer', 'reg_period',
                                               pd.Series(['201817', '201907']))
def test_process_no_responses_no_data_added(setup_queue_event, test_data_empty,
                                            create_table_mock):
    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0),
                    data=test_data_empty))

    processHandler(event, None)

    create_table_mock.assert_table_not_created('google_forms_data')
def test_process_data_poller(setup_queue_event, test_data_poller,
                             create_table_mock):
    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0),
                    data=test_data_poller['data']))

    handler(event, None)

    create_table_mock.assert_table_data_column('github_knowit_repos', 'id',
                                               pd.Series([4672898, 4730463]))
def test_handler_call_process(mocker, setup_queue_event):
    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0), data='hello test'))

    process_handler = handler.ProcessHandler()

    process_handler.wrapped_func['process'] = mocker.MagicMock(return_value={})
    process_handler(event)

    process_handler.wrapped_func['process'].assert_called_once()
    assert process_handler.wrapped_func['process'].call_args[0][0][0].json(
    )['data'] == 'hello test'
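# Note on the test above: ProcessHandler evidently keeps its registered stages
# in a wrapped_func mapping, so swapping wrapped_func['process'] for a
# MagicMock lets the test intercept the call and inspect the deserialized
# schema.Data objects handed to the process stage.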
def test_process_parquet_get_user_data(create_table_mock, setup_queue_event,
                                       test_data):
    event = setup_queue_event(
        schema.Data(
            metadata=schema.Metadata(timestamp=0),
            data=test_data['data']))

    handler(event, None)

    create_table_mock.assert_table_data_column(
        'twitter_account_status_update',
        'user_id',
        pd.Series([1234, 1234, 1234, 1234]))
def test_process_data(create_table_mock, setup_queue_event, test_data):
    event = setup_queue_event(
        schema.Data(
            metadata=schema.Metadata(timestamp=0),
            data=test_data['data']))

    handler(event, None)

    create_table_mock.assert_table_created(
        'twitter_tweets',
        'twitter_timeline',
        'twitter_account_status_update')
def test_process_data(create_table_mock, setup_queue_event, test_data,
                      dynamodb_resource):
    event = setup_queue_event(
        schema.Data(
            metadata=schema.Metadata(timestamp=0),
            data=test_data['data']))

    handler(event, None)

    create_table_mock.assert_table_data_column(
        'ubw_customer_per_resource',
        'reg_period',
        pd.Series(['202053', '202053', '202053']))
def test_handler_call_process_s3_parquet_schema_partition_change(
        s3_bucket, old_partitions, new_partitions, expected_dep_keys,
        expected_new_keys, setup_queue_event):
    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0), data=''))
    old_process_handler = handler.ProcessHandler()
    new_process_handler = handler.ProcessHandler()

    @old_process_handler.process(partitions={'test': old_partitions})
    def test_old_process(data, events):
        return {
            'test': pd.DataFrame({
                'a': [1, 1, 2],
                'b': [2, 2, 3],
                'c': [1, 2, 3]
            })
        }

    @new_process_handler.process(partitions={'test': new_partitions})
    def test_new_process(data, events):
        return {
            'test': pd.DataFrame({
                'a': [1, 1, 2],
                'b': [2, 2, 3],
                'c': [1, 2, 3]
            })
        }

    old_process_handler(event)
    new_process_handler(event)

    dep_keys_in_s3 = [
        x.key for x in s3_bucket.objects.all()
        if 'structured/deprecated' in x.key
    ]
    new_keys_in_s3 = [
        x.key for x in s3_bucket.objects.all()
        if 'structured' in x.key and 'deprecated' not in x.key
    ]

    expected_dep_keys = [
        r'data/test/structured/deprecated'
        r'/\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}/test/' + x
        for x in ['_common_metadata', '_metadata'] + expected_dep_keys
    ]

    expected_new_keys = [
        f'data/test/structured/test/{x}'
        for x in ['_common_metadata', '_metadata'] + expected_new_keys
    ]

    assert new_keys_in_s3 == expected_new_keys
    assert len(dep_keys_in_s3) == len(expected_dep_keys)
    assert all(
        re.fullmatch(expected_dep_keys[i], dep_keys_in_s3[i])
        for i in range(len(dep_keys_in_s3)))
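# Note on the test above: when the declared partition columns change between
# runs, the previously written dataset is not deleted; it is moved under
# structured/deprecated/<timestamp>/ before the new layout is written, which
# is why the old keys are matched with a timestamped regex rather than
# literal strings.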
def test_handler_call_process_s3_parquet_overwrite(s3_bucket,
                                                   setup_queue_event):
    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0), data=''))

    process_handler = handler.ProcessHandler()

    def decorate_process_function(count):
        if (count == 0):

            @process_handler.process(partitions={'test': ['a']},
                                     overwrite=True)
            def test_process(data, events):
                return {'test': pd.DataFrame({'a': [1, 1, 1], 'c': [1, 2, 3]})}
        else:

            @process_handler.process(partitions={'test': ['a']},
                                     overwrite=True)
            def test_process(data, events):
                return {'test': pd.DataFrame({'a': [2, 2, 2], 'c': [1, 2, 3]})}

    decorate_process_function(0)
    process_handler(event)
    keys_in_s3_first_time = [
        x.key for x in s3_bucket.objects.all() if 'structured' in x.key
    ]
    expected_keys = [
        'data/test/structured/test/_common_metadata',
        'data/test/structured/test/_metadata',
        'data/test/structured/test/a=1/part.0.parquet',
    ]

    assert keys_in_s3_first_time == expected_keys

    decorate_process_function(1)
    process_handler(event)

    keys_in_s3_second_time = [
        x.key for x in s3_bucket.objects.all() if 'structured' in x.key
    ]
    expected_keys = [
        'data/test/structured/test/_common_metadata',
        'data/test/structured/test/_metadata',
        'data/test/structured/test/a=2/part.0.parquet',
    ]

    assert keys_in_s3_second_time == expected_keys
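# Note on the test above: overwrite=True replaces the whole dataset on every
# run, so after the second invocation the a=1 partition written by the first
# run is gone and only the freshly written a=2 partition remains.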
def test_process_parquet_get_data(mocker, create_table_mock, setup_queue_event,
                                  test_data):
    tmp_data = test_data['data']

    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0), data=tmp_data))

    handler(event, None)

    create_table_mock.assert_table_data_column('google_calendar_events',
                                               'event_id',
                                               pd.Series(['1235', '1236']))
def test_handler_process_data(s3_bucket, create_table_mock, setup_queue_event,
                              test_data2):
    event = setup_queue_event(
        schema.Data(
            metadata=schema.Metadata(timestamp=0),
            data=test_data2['data']))

    handler(event, None)

    create_table_mock.assert_table_data_column(
        'yr_weather',
        'location',
        pd.Series(['Norway/Oslo/Oslo/Lakkegata']*24))
def test_process_data_skip_existing(setup_queue_event, athena,
                                    test_data_poller, create_table_mock):
    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0),
                    data=test_data_poller['data']))

    athena.on_query('SELECT "id" FROM "github_knowit_repos"',
                    pd.DataFrame({'id': [4672898]}))

    handler(event, None)

    create_table_mock.assert_table_data_column('github_knowit_repos', 'id',
                                               pd.Series([4730463]))
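# Note on the test above: the athena fixture stubs query results, so
# registering a canned DataFrame for SELECT "id" FROM "github_knowit_repos"
# makes the processor treat id 4672898 as already stored and insert only the
# previously unseen id 4730463.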
def test_process_clean_data(setup_queue_event, test_data2, create_table_mock):
    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0), data=test_data2))

    process_handler(event, None)

    create_table_mock.assert_table_data_column('test2_test_b_enkel_test_ark_2',
                                               'a', pd.Series([1, 5, 1]))

    create_table_mock.assert_table_data_column('test2_test_b_enkel_test_ark_2',
                                               'b', pd.Series([4, 8, 4]))

    create_table_mock.assert_table_data_column('test2_test_b_enkel_test_ark_2',
                                               'c', pd.Series([5, 13, 5]))
def test_process_data(setup_queue_event, test_data, create_table_mock):
    tmp_data = test_data['data']

    event = setup_queue_event(
        schema.Data(
            metadata=schema.Metadata(timestamp=0),
            data=tmp_data))

    handler(event, None)

    create_table_mock.assert_table_data_column(
        'knowit_labs_blog_posts',
        'medium_id',
        pd.Series(['asdf', '1234']))
def test_process_data_skip_existing(athena, create_table_mock,
                                    setup_queue_event, test_data):
    athena.on_query(
        'SELECT "reg_period" FROM "dev_test_database"."ubw_fagtimer"',
        pd.DataFrame({'reg_period': ['201817']}))

    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0),
                    data=test_data['data']))

    handler(event, None)

    create_table_mock.assert_table_data_column('ubw_fagtimer', 'reg_period',
                                               pd.Series(['201907']))
def test_set_guid_from_ad_data(s3_bucket, setup_queue_event, test_data,
                               dynamodb_resource):
    tmp_data = test_data['data']
    tmp_data[1]['cv']['email'] = "*****@*****.**"
    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0), data=tmp_data))

    handler(event, None)

    cv_partner_employees_object = s3_bucket.Object(
        "data/test/structured/cv_partner_employees/part.0.parquet")
    cv_partner_employees = pd.read_parquet(
        BytesIO(cv_partner_employees_object.get()['Body'].read()))
    assert cv_partner_employees.loc[
        cv_partner_employees['user_id'] == "user_id_1", 'guid'
    ].iloc[0] == "20dbbfa18380233aa643575720b893fac5137699"
    assert len(cv_partner_employees) == 1
def test_insert_data_two_respondents(setup_queue_event, test_data_form,
                                     test_data_form2, create_table_mock):
    event1 = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0),
                    data=test_data_form))

    processHandler(event1, None)

    event2 = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0),
                    data=test_data_form2))

    processHandler(event2, None)

    create_table_mock.assert_table_data_column(
        'google_forms_data', 'type',
        pd.Series([
            'PARAGRAPH_TEXT', 'TEXT', 'MULTIPLE_CHOICE', 'CHECKBOX',
            'CHECKBOX', 'LIST', 'SCALE', 'GRID', 'GRID', 'CHECKBOX_GRID',
            'CHECKBOX_GRID', 'DATE', 'TIME', 'FILE_UPLOAD', 'PARAGRAPH_TEXT',
            'TEXT', 'MULTIPLE_CHOICE', 'CHECKBOX', 'LIST', 'SCALE', 'GRID',
            'GRID', 'CHECKBOX_GRID', 'CHECKBOX_GRID', 'DATE', 'TIME',
            'FILE_UPLOAD'
        ]))
def test_handler_call_process_overwrite_historical_data_overwrite_versions_person_data(
        s3_bucket, setup_queue_event, dynamodb_resource):
    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0), data=''))
    process_handler = person_data_handler(PersonIdentifierType.ALIAS)

    @process_handler.process(partitions={},
                             person_data_tables=['test'],
                             overwrite=False,
                             overwrite_all_versions=True,
                             historical_tables=['test_2'])
    def test_process(data, events):
        return {
            'test':
            pd.DataFrame({
                'alias': ['olanord', 'karnord', 'lisnord'],
                'b': [1, 2, 3]
            }),
            'test_2':
            pd.DataFrame({
                'a': [1, 1, 1],
                'b': [1, 2, 3]
            })
        }

    process_handler(event)
    process_handler(event)  # Called twice

    keys_in_s3 = [
        x.key for x in s3_bucket.objects.all() if 'structured' in x.key
    ]

    expected_keys = [
        'data/test/structured/test/_common_metadata',
        'data/test/structured/test/_metadata',
        'data/test/structured/test/part.0.parquet',
        'data/test/structured/test_2/_common_metadata',
        'data/test/structured/test_2/_metadata',
        'data/test/structured/test_2/part.0.parquet',
        'data/test/structured/test_2/part.1.parquet',
    ]

    assert keys_in_s3 == expected_keys
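# Note on the test above: 'test' is registered in person_data_tables with
# overwrite_all_versions=True, so repeated runs keep a single part.0.parquet
# for it, while the historical table 'test_2' accumulates one part file per
# run (part.0.parquet and part.1.parquet).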