def test_process_parquet_add_twice(setup_queue_event, create_table_mock,
                                   test_data1, test_data2):
    event1 = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0), data=test_data1))
    process_handler(event1, None)
    event2 = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0), data=test_data2))
    process_handler(event2, None)
    create_table_mock.assert_table_data_column(
        'google_sheets_metadata', 'uploaded_by_user',
        pd.Series(['[email protected]', '[email protected]']))

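# All tests in this section build their input events through a shared
# `setup_queue_event` fixture. For orientation, here is a minimal sketch of
# what such a fixture could look like, assuming an SQS-style event whose
# record body carries the serialized `schema.Data` payload. The record layout
# and the `to_json()` call are assumptions, not the project's actual fixture.
import pytest


@pytest.fixture
def sketch_setup_queue_event():
    def make_event(data):
        # Hypothetical shape: a single SQS record with the JSON payload.
        return {'Records': [{'body': data.to_json()}]}

    return make_event
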
def test_insert_data_quiz_process(setup_queue_event, test_data_quiz,
                                  create_table_mock):
    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0),
                    data=test_data_quiz))
    processHandler(event, None)
    create_table_mock.assert_table_data_column(
        'google_forms_data', 'type',
        pd.Series([
            'SCALE', 'MULTIPLE_CHOICE', 'TEXT', 'CHECKBOX', 'MULTIPLE_CHOICE',
            'LIST', 'DATE', 'TIME'
        ]))
    create_table_mock.assert_table_data_column(
        'google_forms_data', 'form_name', pd.Series(['test_quiz'] * 8))
    create_table_mock.assert_table_data_column(
        'google_forms_data', 'uploaded_by_user',
        pd.Series(['test_person'] * 8))
    create_table_mock.assert_table_data_column(
        'google_forms_data', 'is_quiz', pd.Series([True] * 8))

def test_s3_put_data(s3_bucket):
    s3 = aws.S3()
    key = s3.put(
        schema.Data(metadata=schema.Metadata(timestamp=0), data='test'))
    res = schema.Data.from_json(s3_bucket.Object(key).get()['Body'].read())
    assert res.data == 'test'

def test_handler_call_process_overwrite_all_versions_empty_historical_data(
        s3_bucket, setup_queue_event):
    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0), data=''))
    process_handler = handler.ProcessHandler()

    @process_handler.process(partitions={},
                             overwrite=False,
                             overwrite_all_versions=True,
                             historical_tables=[])
    def test_process(data, events):
        return {'test': pd.DataFrame({'a': [1, 1, 1], 'b': [1, 2, 3]})}

    process_handler(event)
    process_handler(event)  # Called twice

    keys_in_s3 = [
        x.key for x in s3_bucket.objects.all() if 'structured' in x.key
    ]
    expected_keys = [
        'data/test/structured/test/_common_metadata',
        'data/test/structured/test/_metadata',
        'data/test/structured/test/part.0.parquet',
    ]
    assert keys_in_s3 == expected_keys

def test_insert_data_multiple_respondents(setup_queue_event,
                                          test_data_form_multiple_respondents,
                                          create_table_mock):
    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0),
                    data=test_data_form_multiple_respondents))
    processHandler(event, None)
    create_table_mock.assert_table_data_column(
        'google_forms_data', 'type',
        pd.Series(['TEXT', 'PARAGRAPH_TEXT', 'TEXT', 'PARAGRAPH_TEXT']))
    create_table_mock.assert_table_data_column(
        'google_forms_data', 'responder',
        pd.Series(['test_user1', 'test_user1', 'test_user2', 'test_user2']))
    create_table_mock.assert_table_data_column(
        'google_forms_data', 'is_quiz', pd.Series([False] * 4))

def test_handler_call_process_s3_parquet_partitioned_with_None_content_string(
        s3_bucket, setup_queue_event):
    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0), data=''))
    process_handler = handler.ProcessHandler()

    @process_handler.process(partitions={'test': ['a']})
    def test_process(data, events):
        return {
            'test': pd.DataFrame({
                'a': ['name0', 'name0', None],
                'b': [1, 2, 3]
            })
        }

    process_handler(event)

    keys_in_s3 = [
        x.key for x in s3_bucket.objects.all() if 'structured' in x.key
    ]
    expected_keys = [
        'data/test/structured/test/_common_metadata',
        'data/test/structured/test/_metadata',
        'data/test/structured/test/a=name0/part.0.parquet',
        'data/test/structured/test/a=undefined/part.0.parquet',
    ]
    assert keys_in_s3 == expected_keys

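# The `a=undefined` key above suggests the handler substitutes a placeholder
# for null partition values before writing, since Hive-style paths cannot
# encode None. A sketch of that substitution under that assumption (the real
# implementation may use a different mechanism):
def sketch_fill_partition_nulls(df, partition_cols):
    # Replace None/NaN in the partition columns with the literal 'undefined'.
    return df.fillna({col: 'undefined' for col in partition_cols})
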
def test_process_table_content_missing_born_date(setup_queue_event, test_data,
                                                 create_table_mock,
                                                 dynamodb_resource):
    tmp_data = test_data['data']
    tmp_data[0]['cv'].pop('born_year', None)
    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0), data=tmp_data))
    handler(event, None)
    create_table_mock.assert_table_data_contains_df(
        'cv_partner_employees',
        pd.DataFrame({
            'user_id': ['user_id_1', 'user_id_2'],
            'guid': [
                '20dbbfa18380233aa643575720b893fac5137699',
                '491b9fa9bfac17563882b0fdc6f3a8a97417bd99'
            ],
            'default_cv_id': ['user_id_1_cv_id', 'user_id_2_cv_id'],
            'link': ['link1', 'link2'],
            'navn': ['Test Testerson', 'Test Testerson 2'],
            'email': ['*****@*****.**', '*****@*****.**'],
            'telefon': ['+123456', '+123456'],
            'born_year': [-1, 1985],
            'nationality': ['Norwegian', 'Swedish'],
            'place_of_residence': ['Oslo', 'Oslo'],
            'twitter': ['', 'twitter2']
        }))

def test_handler_call_process_s3_parquet_append_partitioned(
        s3_bucket, setup_queue_event):
    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0), data=''))
    process_handler = handler.ProcessHandler()

    @process_handler.process(partitions={'test': ['a']})
    def test_process(data, events):
        return {'test': pd.DataFrame({'a': [1, 1, 2], 'b': [1, 2, 3]})}

    process_handler(event)
    process_handler(event)  # Called twice

    keys_in_s3 = [
        x.key for x in s3_bucket.objects.all() if 'structured' in x.key
    ]
    expected_keys = [
        'data/test/structured/test/_common_metadata',
        'data/test/structured/test/_metadata',
        'data/test/structured/test/a=1/part.0.parquet',
        'data/test/structured/test/a=1/part.1.parquet',
        'data/test/structured/test/a=2/part.0.parquet',
        'data/test/structured/test/a=2/part.1.parquet',
    ]
    assert keys_in_s3 == expected_keys

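# The `a=<value>/part.N.parquet` keys asserted above follow Hive-style
# partitioning: each distinct value of the partition column becomes its own
# directory, and each append adds a new part file. As a standalone
# illustration with plain pandas/pyarrow (not the handler's own code, which
# also controls the part file names):
def sketch_partitioned_write(tmp_path):
    df = pd.DataFrame({'a': [1, 1, 2], 'b': [1, 2, 3]})
    # Writes <tmp_path>/a=1/*.parquet and <tmp_path>/a=2/*.parquet.
    df.to_parquet(tmp_path, partition_cols=['a'])
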
def test_s3_key(path, expected_key_pattern):
    s3 = aws.S3(access_path='abc/123')
    key = s3.put(schema.Data(metadata=schema.Metadata(timestamp=1234),
                             data=''),
                 path=path)
    assert re.fullmatch(expected_key_pattern, key)

def test_process_mixed_data(setup_queue_event, test_data3, create_table_mock):
    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0), data=test_data3))
    process_handler(event, None)
    create_table_mock.assert_table_data_column('test_test_b_enkel_test_ark_3',
                                               'c', pd.Series([5, 'a', 5]))

def test_insert_data_invalid_type(setup_queue_event, test_data_quiz_invalid,
                                  create_table_mock):
    invalid_file_quiz_event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0),
                    data=test_data_quiz_invalid))
    with pytest.raises(KeyError):
        processHandler(invalid_file_quiz_event, None)

def test_process_data_table_created(mocker, create_table_mock,
                                    setup_queue_event, test_data):
    tmp_data = test_data['data']
    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0), data=tmp_data))
    handler(event, None)
    create_table_mock.assert_table_created('google_calendar_events')

def test_process_data(create_table_mock, setup_queue_event, test_data):
    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0),
                    data=test_data['data']))
    handler(event, None)
    create_table_mock.assert_table_data_column(
        'ubw_fagtimer', 'reg_period', pd.Series(['201817', '201907']))

def test_process_no_responses_no_data_added(setup_queue_event,
                                            test_data_empty,
                                            create_table_mock):
    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0),
                    data=test_data_empty))
    processHandler(event, None)
    create_table_mock.assert_table_not_created('google_forms_data')

def test_process_data_poller(setup_queue_event, test_data_poller,
                             create_table_mock):
    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0),
                    data=test_data_poller['data']))
    handler(event, None)
    create_table_mock.assert_table_data_column('github_knowit_repos', 'id',
                                               pd.Series([4672898, 4730463]))

def test_handler_call_process(mocker, setup_queue_event):
    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0),
                    data='hello test'))
    process_handler = handler.ProcessHandler()
    process_handler.wrapped_func['process'] = mocker.MagicMock(
        return_value={})
    process_handler(event)
    process_handler.wrapped_func['process'].assert_called_once()
    first_data_arg = process_handler.wrapped_func['process'].call_args[0][0][0]
    assert first_data_arg.json()['data'] == 'hello test'

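# The test above replaces `wrapped_func['process']` with a MagicMock, which
# implies `ProcessHandler.process` simply registers the decorated function
# under that key for `__call__` to invoke later. A minimal sketch of that
# registration pattern (the real class also performs the S3 parquet writes,
# partitioning and deprecation exercised elsewhere in this section):
class SketchProcessHandler:
    def __init__(self):
        self.wrapped_func = {}

    def process(self, **kwargs):
        def decorator(func):
            # Store the processing function so the handler can look it up.
            self.wrapped_func['process'] = func
            return func

        return decorator
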
def test_process_parquet_get_user_data(create_table_mock, setup_queue_event,
                                       test_data):
    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0),
                    data=test_data['data']))
    handler(event, None)
    create_table_mock.assert_table_data_column(
        'twitter_account_status_update', 'user_id', pd.Series([1234] * 4))

def test_process_data(create_table_mock, setup_queue_event, test_data):
    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0),
                    data=test_data['data']))
    handler(event, None)
    create_table_mock.assert_table_created('twitter_tweets',
                                           'twitter_timeline',
                                           'twitter_account_status_update')

def test_process_data(create_table_mock, setup_queue_event, test_data,
                      dynamodb_resource):
    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0),
                    data=test_data['data']))
    handler(event, None)
    create_table_mock.assert_table_data_column(
        'ubw_customer_per_resource', 'reg_period', pd.Series(['202053'] * 3))

def test_handler_call_process_s3_parquet_schema_partition_change(
        s3_bucket, old_partitions, new_partitions, expected_dep_keys,
        expected_new_keys, setup_queue_event):
    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0), data=''))
    old_process_handler = handler.ProcessHandler()
    new_process_handler = handler.ProcessHandler()

    @old_process_handler.process(partitions={'test': old_partitions})
    def test_old_process(data, events):
        return {
            'test': pd.DataFrame({
                'a': [1, 1, 2],
                'b': [2, 2, 3],
                'c': [1, 2, 3]
            })
        }

    @new_process_handler.process(partitions={'test': new_partitions})
    def test_new_process(data, events):
        return {
            'test': pd.DataFrame({
                'a': [1, 1, 2],
                'b': [2, 2, 3],
                'c': [1, 2, 3]
            })
        }

    old_process_handler(event)
    new_process_handler(event)

    dep_keys_in_s3 = [
        x.key for x in s3_bucket.objects.all()
        if 'structured/deprecated' in x.key
    ]
    new_keys_in_s3 = [
        x.key for x in s3_bucket.objects.all()
        if 'structured' in x.key and 'deprecated' not in x.key
    ]
    # Deprecated keys carry a timestamped prefix, so they are matched as
    # regular expressions; re.escape guards the '.' in the file names.
    expected_dep_patterns = [
        r'data/test/structured/deprecated/'
        r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}/test/' + re.escape(x)
        for x in ['_common_metadata', '_metadata'] + expected_dep_keys
    ]
    expected_new_keys = [
        f'data/test/structured/test/{x}'
        for x in ['_common_metadata', '_metadata'] + expected_new_keys
    ]
    assert new_keys_in_s3 == expected_new_keys
    assert len(dep_keys_in_s3) == len(expected_dep_patterns)
    assert all(
        re.fullmatch(pattern, key)
        for pattern, key in zip(expected_dep_patterns, dep_keys_in_s3))

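# Illustrative only: a deprecated key with a made-up timestamp that the
# patterns built in the test above would accept.
def sketch_deprecated_key_pattern():
    pattern = (r'data/test/structured/deprecated/'
               r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}/test/' +
               re.escape('_metadata'))
    example = ('data/test/structured/deprecated/'
               '2021-01-01T12:00:00/test/_metadata')
    assert re.fullmatch(pattern, example)
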
def test_handler_call_process_s3_parquet_overwrite(s3_bucket,
                                                   setup_queue_event):
    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0), data=''))
    process_handler = handler.ProcessHandler()

    def decorate_process_function(count):
        if count == 0:

            @process_handler.process(partitions={'test': ['a']},
                                     overwrite=True)
            def test_process(data, events):
                return {
                    'test': pd.DataFrame({'a': [1, 1, 1], 'c': [1, 2, 3]})
                }
        else:

            @process_handler.process(partitions={'test': ['a']},
                                     overwrite=True)
            def test_process(data, events):
                return {
                    'test': pd.DataFrame({'a': [2, 2, 2], 'c': [1, 2, 3]})
                }

    decorate_process_function(0)
    process_handler(event)
    keys_in_s3_first_time = [
        x.key for x in s3_bucket.objects.all() if 'structured' in x.key
    ]
    expected_keys = [
        'data/test/structured/test/_common_metadata',
        'data/test/structured/test/_metadata',
        'data/test/structured/test/a=1/part.0.parquet',
    ]
    assert keys_in_s3_first_time == expected_keys

    decorate_process_function(1)
    process_handler(event)
    keys_in_s3_second_time = [
        x.key for x in s3_bucket.objects.all() if 'structured' in x.key
    ]
    expected_keys = [
        'data/test/structured/test/_common_metadata',
        'data/test/structured/test/_metadata',
        'data/test/structured/test/a=2/part.0.parquet',
    ]
    assert keys_in_s3_second_time == expected_keys

def test_process_parquet_get_data(mocker, create_table_mock,
                                  setup_queue_event, test_data):
    tmp_data = test_data['data']
    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0), data=tmp_data))
    handler(event, None)
    create_table_mock.assert_table_data_column('google_calendar_events',
                                               'event_id',
                                               pd.Series(['1235', '1236']))

def test_handler_process_data(s3_bucket, create_table_mock,
                              setup_queue_event, test_data2):
    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0),
                    data=test_data2['data']))
    handler(event, None)
    create_table_mock.assert_table_data_column(
        'yr_weather', 'location',
        pd.Series(['Norway/Oslo/Oslo/Lakkegata'] * 24))

def test_process_data_skip_existing(setup_queue_event, athena,
                                    test_data_poller, create_table_mock):
    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0),
                    data=test_data_poller['data']))
    athena.on_query('SELECT "id" FROM "github_knowit_repos"',
                    pd.DataFrame({'id': [4672898]}))
    handler(event, None)
    create_table_mock.assert_table_data_column('github_knowit_repos', 'id',
                                               pd.Series([4730463]))

def test_process_clean_data(setup_queue_event, test_data2, create_table_mock):
    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0), data=test_data2))
    process_handler(event, None)
    create_table_mock.assert_table_data_column(
        'test2_test_b_enkel_test_ark_2', 'a', pd.Series([1, 5, 1]))
    create_table_mock.assert_table_data_column(
        'test2_test_b_enkel_test_ark_2', 'b', pd.Series([4, 8, 4]))
    create_table_mock.assert_table_data_column(
        'test2_test_b_enkel_test_ark_2', 'c', pd.Series([5, 13, 5]))

def test_process_data(setup_queue_event, test_data, create_table_mock):
    tmp_data = test_data['data']
    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0), data=tmp_data))
    handler(event, None)
    create_table_mock.assert_table_data_column('knowit_labs_blog_posts',
                                               'medium_id',
                                               pd.Series(['asdf', '1234']))

def test_process_data_skip_existing(athena, create_table_mock,
                                    setup_queue_event, test_data):
    athena.on_query(
        'SELECT "reg_period" FROM "dev_test_database"."ubw_fagtimer"',
        pd.DataFrame({'reg_period': ['201817']}))
    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0),
                    data=test_data['data']))
    handler(event, None)
    create_table_mock.assert_table_data_column('ubw_fagtimer', 'reg_period',
                                               pd.Series(['201907']))

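# `athena.on_query` in the two skip-existing tests above is a test double
# that pins the result returned for a given SQL string, so the handler's
# deduplication can run without a real Athena. A minimal sketch of such a
# stub (hypothetical; the project's fixture may differ):
class SketchAthenaStub:
    def __init__(self):
        self._responses = {}

    def on_query(self, sql, result_df):
        # Register the DataFrame to return when `sql` is later executed.
        self._responses[sql] = result_df

    def execute(self, sql):
        # Unknown queries yield an empty frame, i.e. nothing pre-existing.
        return self._responses.get(sql, pd.DataFrame())
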
def test_set_guid_from_ad_data(s3_bucket, setup_queue_event, test_data,
                               dynamodb_resource):
    tmp_data = test_data['data']
    tmp_data[1]['cv']['email'] = '*****@*****.**'
    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0), data=tmp_data))
    handler(event, None)
    cv_partner_employees_object = s3_bucket.Object(
        'data/test/structured/cv_partner_employees/part.0.parquet')
    cv_partner_employees = pd.read_parquet(
        BytesIO(cv_partner_employees_object.get()['Body'].read()))
    user_1 = cv_partner_employees.loc[
        cv_partner_employees['user_id'] == 'user_id_1']
    assert user_1['guid'].iloc[0] == '20dbbfa18380233aa643575720b893fac5137699'
    assert len(cv_partner_employees) == 1

def test_insert_data_two_respondents(setup_queue_event, test_data_form,
                                     test_data_form2, create_table_mock):
    event1 = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0),
                    data=test_data_form))
    processHandler(event1, None)
    event2 = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0),
                    data=test_data_form2))
    processHandler(event2, None)
    create_table_mock.assert_table_data_column(
        'google_forms_data', 'type',
        pd.Series([
            'PARAGRAPH_TEXT', 'TEXT', 'MULTIPLE_CHOICE', 'CHECKBOX',
            'CHECKBOX', 'LIST', 'SCALE', 'GRID', 'GRID', 'CHECKBOX_GRID',
            'CHECKBOX_GRID', 'DATE', 'TIME', 'FILE_UPLOAD', 'PARAGRAPH_TEXT',
            'TEXT', 'MULTIPLE_CHOICE', 'CHECKBOX', 'LIST', 'SCALE', 'GRID',
            'GRID', 'CHECKBOX_GRID', 'CHECKBOX_GRID', 'DATE', 'TIME',
            'FILE_UPLOAD'
        ]))

def test_handler_call_process_overwrite_historical_data_overwrite_versions_person_data(
        s3_bucket, setup_queue_event, dynamodb_resource):
    event = setup_queue_event(
        schema.Data(metadata=schema.Metadata(timestamp=0), data=''))
    process_handler = person_data_handler(PersonIdentifierType.ALIAS)

    @process_handler.process(partitions={},
                             person_data_tables=['test'],
                             overwrite=False,
                             overwrite_all_versions=True,
                             historical_tables=['test_2'])
    def test_process(data, events):
        return {
            'test': pd.DataFrame({
                'alias': ['olanord', 'karnord', 'lisnord'],
                'b': [1, 2, 3]
            }),
            'test_2': pd.DataFrame({
                'a': [1, 1, 1],
                'b': [1, 2, 3]
            })
        }

    process_handler(event)
    process_handler(event)  # Called twice

    keys_in_s3 = [
        x.key for x in s3_bucket.objects.all() if 'structured' in x.key
    ]
    expected_keys = [
        'data/test/structured/test/_common_metadata',
        'data/test/structured/test/_metadata',
        'data/test/structured/test/part.0.parquet',
        'data/test/structured/test_2/_common_metadata',
        'data/test/structured/test_2/_metadata',
        'data/test/structured/test_2/part.0.parquet',
        'data/test/structured/test_2/part.1.parquet',
    ]
    assert keys_in_s3 == expected_keys