def test_with(self):
    """A string written via a context manager can be read back as a line."""
    target = MockTarget("foo")
    with target.open('w') as handle:
        handle.write("bar")
    with target.open('r') as handle:
        self.assertEqual(list(handle), ['bar'])
def test_preprocessing(self, output_mock, input_mock):
    """Run TopicModelingPreprocessCorpus on five pickled docs and check
    which docs survive and how they are tokenized."""
    # -------- SET UP MOCK DATA ------------
    output_target = MockTarget('corpus_out', format=luigi.format.Nop)
    input_target = MockTarget('corpus_in', format=luigi.format.Nop)
    output_mock.return_value = output_target
    input_mock.return_value = input_target
    with input_target.open('w') as fp:
        pickle.dump([
            # NOTE(review): the adjacent literals join without a space
            # ("Kulturinstitutin") — confirm this is intended test data.
            Doc("Ich bin der erste Post über ein Kulturinstitut"
                "in der Landeshauptstadt"),
            Doc("Ich bin der 2 Post über mit Bezug zur "
                "Landeshauptstadt. toll "),
            Doc("Trallala noch ein Post 2 zum Museum"),
            Doc("noch weitere Posts zum weitere testen."
                'Barberini toll'),
            Doc("this document is in english")
        ], fp)
    # ------- RUN TASK UNDER TEST --------
    task = TopicModelingPreprocessCorpus()
    task.run()
    # ------- INSPECT OUTPUT -------
    with output_target.open("r") as fp:
        output = pickle.load(fp)  # nosec
    # NOTE(review): 5 docs in, 2 out — presumably non-German and
    # low-content docs are filtered; confirm against the task.
    self.assertEqual(len(output), 2)
    self.assertEqual(output[0].tokens,
                     ['post', 'landeshauptstadt', 'toll'])
    self.assertEqual(output[1].tokens,
                     ['weitere', 'weitere', 'toll'])
def test_bytes(self):
    """With the Nop format, binary writes round-trip as raw bytes."""
    target = MockTarget("foo", format=Nop)
    with target.open('wb') as stream:
        stream.write(b"bar")
    with target.open('rb') as stream:
        self.assertEqual(list(stream), [b'bar'])
def test_default_mode_value(self):
    """open() without an explicit mode behaves like read mode."""
    target = MockTarget("foo")
    with target.open('w') as handle:
        handle.write("bar")
    with target.open() as handle:
        self.assertEqual(list(handle), ['bar'])
def test_unicode(self):
    """A unicode string survives a write/read round trip unchanged."""
    target = MockTarget("foo")
    with target.open('w') as sink:
        sink.write(u"bar")
    with target.open('r') as source:
        self.assertEqual(source.read(), u'bar')
def test_post_performance_transformation(self, input_mock, output_mock, request_mock):
    """Fetch IG post insights for two seeded posts (one video, one not)
    and compare the produced CSV with the expected fixture."""
    # Seed two posts; the first already has a performance row recorded.
    self.db_connector.execute(
        '''INSERT INTO ig_post (ig_post_id)
            VALUES (0123456789), (9876543210)''',
        '''INSERT INTO ig_post_performance
            VALUES (0123456789, '2019-10-04',
                5, 4, 3, 2, 1, 2, 1, 0, 1, 0)''')
    input_target = MockTarget('posts_in', format=UTF8)
    input_mock.return_value = input_target
    output_target = MockTarget('insights_out', format=UTF8)
    output_mock.return_value = output_target
    # Feed the posts fixture into the task's input target.
    with input_target.open('w') as posts_target:
        with open(f'{IG_TEST_DATA}/post_expected.csv',
                  'r', encoding='utf-8') as posts_input:
            posts_target.write(posts_input.read())
    with open(f'{IG_TEST_DATA}/post_insights_video_actual.json',
              'r', encoding='utf-8') as json_video_in:
        input_video_insights = json_video_in.read()
    with open(f'{IG_TEST_DATA}/post_insights_no_video_actual.json',
              'r', encoding='utf-8') as json_no_video_in:
        input_no_video_insights = json_no_video_in.read()
    with open(f'{IG_TEST_DATA}/post_insights_expected.csv',
              'r', encoding='utf-8') as expected_data_in:
        expected_df = pd.read_csv(expected_data_in)

    def mock_video_json():
        return json.loads(input_video_insights)

    def mock_no_video_json():
        return json.loads(input_no_video_insights)
    mock_video_response = MagicMock(ok=True, json=mock_video_json)
    mock_no_video_response = MagicMock(ok=True, json=mock_no_video_json)
    # First API call is answered with the video insights, the second
    # with the non-video insights.
    request_mock.side_effect = [
        mock_video_response,
        mock_no_video_response
    ]
    with freeze_time('2020-01-01 00:00:05'):
        # Huge timespan so no fixture post is filtered out by age.
        self.task = instagram.FetchIgPostPerformance(
            columns=[
                column[0] for column
                in instagram.IgPostPerformanceToDb().columns
            ],
            timespan=dt.timedelta(days=100000),
            table='ig_post_performance')
        self.task.run()
    with output_target.open('r') as output_data:
        output_df = pd.read_csv(output_data)
    pd.testing.assert_frame_equal(expected_df, output_df)
def test_1(self):
    """print() into a target's file handle stores the trailing newline."""
    target = MockTarget('test')
    with target.open('w') as writer:
        print('test', file=writer)
    with target.open('r') as reader:
        self.assertEqual(list(reader), ['test\n'])
def test_scrape_order_contains(self, output_mock, input_mock):
    """Scrape order-contains rows from fetched order HTML pages and
    verify each scraped row against a precomputed hash."""
    test_data = pd.read_csv(
        'tests/test_data/gomus/scrape_order_contains_data.csv')
    # Fetch one HTML page per distinct order id and remember where
    # each fetched file was stored.
    html_file_names = []
    all_order_ids = test_data['order_id'].drop_duplicates()
    for order_id in all_order_ids:
        new_html_task = FetchGomusHTML(url=f"/admin/orders/{order_id}")
        new_html_task.run()
        html_file_names.append(new_html_task.output().path)
    input_target = MockTarget('orders_htmls_out', format=UTF8)
    input_mock.return_value = input_target
    # The scraper's input is a newline-separated list of HTML paths.
    with input_target.open('w') as input_file:
        input_file.write('\n'.join(html_file_names))
    output_target = MockTarget('scraped_order_contains_out', format=UTF8)
    output_mock.return_value = output_target
    # -- execute code under test --
    ScrapeGomusOrderContains().run()
    # -- inspect results --
    with output_target.open('r') as output_file:
        actual_output = pd.read_csv(output_file)
    self.assertEqual(len(test_data.index), len(actual_output.index))
    for i in range(len(actual_output)):
        expected_row = test_data.iloc[i]
        actual_row = actual_output.iloc[i]
        # test if order stayed the same
        self.assertEqual(expected_row['order_id'],
                         actual_row['order_id'])
        # test if scraped data is correct
        # (rows are compared via a hash so the fixture need not carry
        # the raw scraped values)
        hash_string = ','.join([
            str(actual_row['article_id']),
            str(actual_row['article_type']),
            str(actual_row['ticket']),
            str(actual_row['date']),
            str(actual_row['quantity']),
            str(actual_row['price']),
            str(actual_row['is_cancelled'])
        ])
        actual_hash = mmh3.hash(hash_string, seed=self.hash_seed)
        # Row contents are only printed on an interactive terminal to
        # avoid leaking order data into CI logs.
        self.assertEqual(
            actual_hash,
            expected_row['expected_hash'],
            msg=f"Scraper got wrong values:\n\
{str(actual_row) if sys.stdin.isatty() else 'REDACTED ON NON-TTY'}")
def test_exhibition_times(self, output_mock, requests_get_mock):
    """FetchExhibitionTimes converts the API JSON into the expected CSV."""
    output_target = MockTarget('exhibition_out', format=UTF8)
    output_mock.return_value = output_target

    with open('tests/test_data/gomus/exhibitions/exhibitions_actual.json',
              'r', encoding='utf-8') as json_file:
        api_payload = json_file.read()
    with open('tests/test_data/gomus/exhibitions/'
              'exhibition_times_expected.csv',
              'r', encoding='utf-8') as csv_file:
        expected_csv = csv_file.read()

    # Overwrite requests 'get' return value to provide our test data.
    requests_get_mock.return_value = MagicMock(
        ok=True, json=lambda: json.loads(api_payload))

    FetchExhibitionTimes().run()

    with output_target.open('r') as produced:
        self.assertEqual(expected_csv, produced.read())
def test_empty_tweet_performance(self, output_mock, raw_tweets_mock):
    """An empty raw-tweets file yields the empty performance CSV."""
    output_target = MockTarget('perform_extracted_out', format=UTF8)
    output_mock.return_value = output_target

    with open('tests/test_data/twitter/empty_raw_tweets.csv',
              'r', encoding='utf-8') as raw_file:
        raw_tweets = raw_file.read()
    with open('tests/test_data/twitter/empty_tweet_performance.csv',
              'r', encoding='utf-8') as expected_file:
        expected_performance = expected_file.read()

    self.install_mock_target(
        raw_tweets_mock, lambda file: file.write(raw_tweets))

    ExtractTweetPerformance(table='tweet_performance').run()

    with output_target.open('r') as output_file:
        self.assertEqual(output_file.read(), expected_performance)
def test_extract_tweet_performance(self, output_mock, raw_tweets_mock):
    """Extract performance rows from raw tweets and compare them with
    the expected CSV, ignoring the extraction timestamp column."""
    # Seed the tweet table so extracted rows can reference existing ids.
    self.db_connector.execute('''
        INSERT INTO tweet (tweet_id) VALUES
            ('1234567890123456789'),
            ('111111111111111111'),
            ('2222222222222222222')
        ''')
    output_target = MockTarget('perform_extracted_out', format=UTF8)
    output_mock.return_value = output_target
    with open('tests/test_data/twitter/raw_tweets.csv',
              'r', encoding='utf-8') as data_in:
        raw_tweets = data_in.read()
    with open('tests/test_data/twitter/expected_tweet_performance.csv',
              'r', encoding='utf-8') as data_out:
        expected_performance = data_out.read()
    self.install_mock_target(raw_tweets_mock,
                             lambda file: file.write(raw_tweets))
    task = ExtractTweetPerformance(table='tweet_performance')
    task.run()
    with output_target.open('r') as output_file:
        output = output_file.read()
    # Header rows must match exactly.
    self.assertEqual(
        output.split('\n')[0],
        expected_performance.split('\n')[0])
    for i in range(1, 3):
        self.assertEqual(  # cutting away the timestamp
            output.split('\n')[i].split(';')[:-1],
            expected_performance.split('\n')[i].split(';')[:-1])
def test_post_transformation(self, fact_mock, output_mock, requests_get_mock):
    """FetchFbPosts converts the API JSON into the expected CSV."""
    fact_target = MockTarget('facts_in', format=UTF8)
    fact_mock.return_value = fact_target
    output_target = MockTarget('post_out', format=UTF8)
    output_mock.return_value = output_target

    with open(f'{FB_TEST_DATA}/post_actual.json',
              'r', encoding='utf-8') as json_in:
        raw_json = json_in.read()
    with open(f'{FB_TEST_DATA}/post_expected.csv',
              'r', encoding='utf-8') as csv_in:
        expected_csv = csv_in.read()

    # Overwrite requests 'get' return value to provide our test data.
    requests_get_mock.return_value = MagicMock(
        ok=True, json=lambda: json.loads(raw_json))

    facebook.MuseumFacts().run()
    facebook.FetchFbPosts().run()

    with output_target.open('r') as produced:
        self.assertEqual(expected_csv, produced.read())
def test_audience_gender_age_transformation(self, input_mock, output_mock, request_mock):
    """Gender/age audience data is transformed into the expected CSV."""
    fact_target = MockTarget('facts_in', format=UTF8)
    input_mock.return_value = fact_target
    output_target = MockTarget('post_out', format=UTF8)
    output_mock.return_value = output_target

    with open(f'{IG_TEST_DATA}/audience_gender_age_actual.json',
              'r', encoding='utf-8') as json_in:
        raw_json = json_in.read()
    with open(f'{IG_TEST_DATA}/audience_gender_age_expected.csv',
              'r', encoding='utf-8') as csv_in:
        expected_csv = csv_in.read()

    # Serve the fixture JSON from every request.
    request_mock.return_value = MagicMock(
        ok=True, json=lambda: json.loads(raw_json))

    instagram.MuseumFacts().run()
    with freeze_time('2020-01-01 00:00:05'):
        instagram.FetchIgAudienceGenderAge(columns=[
            col[0] for col in instagram.IgAudienceGenderAgeToDb().columns
        ]).run()

    with output_target.open('r') as produced:
        self.assertEqual(produced.read(), expected_csv)
def test_create_corpus(self, output_mock):
    """TopicModelingCreateCorpus builds a pickled corpus of Doc objects
    from tweets and facebook comments in the database."""
    # -------- SET UP MOCK DATA ------------
    output_target = MockTarget('corpus_out', format=luigi.format.Nop)
    output_mock.return_value = output_target
    self.db_connector.execute(
        '''
        INSERT INTO tweet(user_id,tweet_id,text,response_to,post_date)
        VALUES ('user_id', 'tweet_id', 'tweet text', NULL,
            '2020-05-24 10:56:21')
        ''',
        '''
        INSERT INTO fb_post_comment(post_id,comment_id,post_date,
            text,is_from_museum,response_to)
        VALUES
            ('post1','comment1','2020-05-24 10:56:21',
                'text1',false,NULL),
            ('post2','comment2','2018-05-24 10:56:21',
                'text2',true,NULL)
        ''')
    # ------- RUN TASK UNDER TEST --------
    task = TopicModelingCreateCorpus()
    task.run()
    # ------- INSPECT OUTPUT -------
    with output_target.open("r") as fp:
        corpus = pickle.load(fp)  # nosec
    # NOTE(review): 3 rows seeded, 2 docs expected — presumably the
    # museum-authored comment (is_from_museum=true) is excluded;
    # confirm against the task's query.
    self.assertEqual(len(corpus), 2)
    self.assertIsInstance(corpus[0], Doc)
    self.assertIsInstance(corpus[1], Doc)
def test_fetch_total_profile_metrics(self, fact_mock, output_mock, request_mock):
    """Total profile metrics are fetched and rendered as the fixture CSV."""
    fact_target = MockTarget('facts_in', format=UTF8)
    fact_mock.return_value = fact_target
    output_target = MockTarget('post_out', format=UTF8)
    output_mock.return_value = output_target

    with open(f'{IG_TEST_DATA}/total_profile_metrics_actual.json',
              'r', encoding='utf-8') as json_in:
        raw_json = json_in.read()
    with open(f'{IG_TEST_DATA}/total_profile_metrics_expected.csv',
              'r', encoding='utf-8') as csv_in:
        expected_csv = csv_in.read()

    # Serve the fixture JSON from every request.
    request_mock.return_value = MagicMock(
        ok=True, json=lambda: json.loads(raw_json))

    with freeze_time('2020-01-01 00:00:05'):
        self.run_task(
            instagram.FetchIgTotalProfileMetrics(columns=[
                col[0] for col
                in instagram.IgTotalProfileMetricsToDb().columns
            ]))

    with output_target.open('r') as produced:
        self.assertEqual(produced.read(), expected_csv)
def test_two(self, input_mock, output_mock):
    """Two input CSVs are concatenated in order into a single output."""
    frames = [
        pd.DataFrame([[1, 'foo'], [2, 'bar']], columns=['a', 'b']),
        pd.DataFrame([[42, 'spam'], [1337, 'häm']], columns=['a', 'b']),
    ]
    # Bind each frame as a lambda default to avoid late-binding issues.
    input_mock.return_value = [
        self.install_mock_target(
            MagicMock(),
            lambda file, frame=frame: frame.to_csv(file, index=False))
        for frame in frames
    ]
    output_target = MockTarget(str(self), format=UTF8)
    output_mock.return_value = output_target

    self.task = ConcatCsvs()
    self.run_task(self.task)

    with output_target.open('r') as output:
        result_df = pd.read_csv(output)
    expected_df = pd.DataFrame([
        [1, 'foo'],
        [2, 'bar'],
        [42, 'spam'],
        [1337, 'häm'],
    ], columns=['a', 'b'])
    pd.testing.assert_frame_equal(expected_df, result_df)
def test_fetch_twitter(self, output_mock):
    """A known historical tweet appears in the data fetched for its day."""
    output_target = MockTarget('raw_out', format=UTF8)
    output_mock.return_value = output_target

    # Dirty workaround for pandas's peculiarities regarding default values
    none = object()
    expected_tweet = {
        'tweet_id': 1225435275301654531,
        'text': "#MuseumBarberini is cool!",
        'user_id': 1189538451097608193,
        'parent_tweet_id': none,
        'timestamp': '2020-02-06 16:05:11+01:00'
    }
    with freeze_time('2020-02-06'):
        # On this day our team's account has posted a related tweet
        # See https://twitter.com/bpfn2020/status/1225435275301654531
        FetchTwitter(timespan=dt.timedelta(days=1)).run()

    with output_target.open('r') as output_file:
        candidates = pd.read_csv(output_file).fillna(none)

    for key, value in expected_tweet.items():
        candidates = candidates[candidates[key] == value]
        self.assertTrue(
            len(candidates) >= 1,
            f"Did not find any tweet with {key} = {value}")
def test_app_id(self, input_mock):
    """FetchGplayReviews reads the gplay app id from the museum facts."""
    facts_target = MockTarget('museum_facts', format=UTF8)
    input_mock.return_value = facts_target
    with facts_target.open('w') as facts_file:
        json.dump({'ids': {'gplay': {'appId': 'some ID'}}}, facts_file)

    # Clear any cached id so it must be re-resolved from the facts file.
    self.task._app_id = None
    resolved_id = FetchGplayReviews().app_id
    self.assertEqual(resolved_id, 'some ID')
def test_run(self, input_mock, output_mock, mock_fetch, mock_lang):
    """FetchGplayReviews writes the fetched reviews as CSV."""
    facts_target = MockTarget('museum_facts', format=UTF8)
    input_mock.return_value = facts_target
    with facts_target.open('w') as facts_file:
        json.dump(
            {'ids': {'gplay': {
                'appId': 'com.barberini.museum.barberinidigital'
            }}},
            facts_file
        )
    output_target = MockTarget('gplay.gplay_reviews', format=UTF8)
    output_mock.return_value = output_target

    FetchGplayReviews().run()

    expected = pd.DataFrame([RESPONSE_ELEM_1_RENAMED_COLS])
    with output_target.open('r') as output_file:
        actual = pd.read_csv(output_file)
    pd.testing.assert_frame_equal(expected, actual)
def test_one(self, input_mock, output_mock):
    """A single input CSV is passed through unchanged."""
    source_df = pd.DataFrame([[1, 'foo'], [2, 'bar']], columns=['a', 'b'])
    self.install_mock_target(
        input_mock,
        lambda file: source_df.to_csv(file, index=False))
    output_target = MockTarget(str(self))
    output_mock.return_value = output_target

    self.task = ConcatCsvs()
    self.run_task(self.task)

    with output_target.open('r') as output:
        result_df = pd.read_csv(output)
    pd.testing.assert_frame_equal(source_df, result_df)
class GomusFormatTest(DatabaseTestCase):
    """The abstract base class for gomus format tests."""

    def __init__(self, report, expected_format, *args, **kwargs):
        """Store the report name and its expected column format.

        expected_format is a sequence of (column_name, type_name)
        pairs; type_name is one of 'FLOAT', 'DATE', 'TIME' or 'STRING'.
        """
        super().__init__(*args, **kwargs)
        self.report = report
        self.expected_format = expected_format

    def prepare_output_target(self, output_mock):
        """Point the mocked task output at a fresh in-memory target."""
        self.output_target = MockTarget('data_out', format=UTF8)
        output_mock.return_value = [self.output_target]

    def fetch_gomus_report(self, suffix='_7days', sheet=None):
        """Run FetchGomusReport for this report.

        sheet defaults to [0]. A None sentinel replaces the previous
        mutable default argument ([0]), which was shared across calls.
        """
        if sheet is None:
            sheet = [0]
        self.run_task(
            FetchGomusReport(report=self.report,
                             suffix=suffix,
                             sheet_indices=sheet))

    def check_format(self, skiprows=0, skipfooter=0):
        """Check the fetched report's columns against expected_format."""
        with self.output_target.open('r') as output_file:
            df = pd.read_csv(output_file,
                             skipfooter=skipfooter,
                             skiprows=skiprows,
                             engine='python')
        for i, (column_name, column_type) in enumerate(
                self.expected_format):
            # Gomus renders empty reports with this pseudo column.
            if df.columns[i] == 'Keine Daten vorhanden':
                break
            # This checks whether the columns are named right.
            self.assertEqual(df.columns[i], column_name)
            df.apply(
                lambda row: self.check_type(
                    row[column_name], column_type),
                axis=1)

    def check_type(self, data, expected_type):
        """Assert that data can be parsed as expected_type.

        To check if the data in the columns has the right type, we try
        to convert the string into the expected type and catch a
        ValueError or TypeError if something goes wrong. As we don't
        process data from type "STRING"/just store it as text, we don't
        have to explicitly check that type.
        """
        try:
            if data == '':
                pass  # empty cells are always acceptable
            elif expected_type == 'FLOAT':
                float(data)
            elif expected_type == 'DATE':
                dt.datetime.strptime(data, '%d.%m.%Y')
            elif expected_type == 'TIME':
                dt.datetime.strptime(data, '%H:%M')
        except (ValueError, TypeError):
            self.fail(f'{data} is not from type {expected_type}')
class DummyFileWrapper(luigi.Task):
    """Dummy task to write an output file."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # One distinct in-memory target per task instance, keyed by the
        # task's hash so parallel instances don't collide.
        self.mock_target = MockTarget(
            f'DummyFileWrapperMock{hash(self)}',
            format=luigi.format.UTF8)

    # Raw CSV content that run() writes verbatim to the target.
    csv = luigi.Parameter()

    def run(self):
        with self.mock_target.open('w') as input_file:
            input_file.write(self.csv)

    def output(self):
        return self.mock_target
def test_post_transformation(self, fact_mock, output_mock, request_mock):
    """Fetched IG posts match the expected CSV as a DataFrame."""
    fact_target = MockTarget('facts_in', format=UTF8)
    fact_mock.return_value = fact_target
    output_target = MockTarget('post_out', format=UTF8)
    output_mock.return_value = output_target

    with open(f'{IG_TEST_DATA}/post_actual.json',
              'r', encoding='utf-8') as json_in:
        raw_json = json_in.read()
    expected_frame = pd.read_csv(f'{IG_TEST_DATA}/post_expected.csv')

    def fake_get(url):
        # Every request is answered with the same fixture payload.
        return MagicMock(ok=True, json=lambda: json.loads(raw_json))
    request_mock.side_effect = fake_get

    self.run_task(instagram.FetchIgPosts())

    with output_target.open('r') as produced:
        actual_frame = pd.read_csv(produced)
    pd.testing.assert_frame_equal(actual_frame, expected_frame)
def iter_task(self, mock_codes, max_index):
    """Drive self.task.run() as a generator of FetchGomusHTML deps.

    For each status code in mock_codes, answer the task's next
    dependency with a mock target (plain .html for 2xx, .html.error
    for codes the dependency declares as ignorable) and assert that
    no more than max_index + 1 dependencies are requested.
    """
    gen = self.task.run()
    dep = next(gen)
    for i, code in enumerate(mock_codes):
        self.assertIsInstance(dep, FetchGomusHTML)
        self.assertLessEqual(i, max_index)
        if 200 <= code < 300:
            target = MockTarget(f'quota_{i}.html')
        elif code in dep.ignored_status_codes:
            # Tolerated failures are materialized as ".error" files.
            target = MockTarget(f'quota_{i}.html.error')
        else:
            raise ValueError("Unhandled status code")
        # Touch the target so it exists when the task resumes.
        with target.open('w'):
            pass
        try:
            dep = gen.send(FailableTarget(target))
        except StopIteration:
            # Task finished before consuming all mock codes.
            dep = None
            break
    # The task must not request any further dependencies.
    self.assertFalse(dep)
def test_audience_origin_transformation(self, input_mock, output_mock, request_mock):
    """Audience origin data (city mode) becomes the expected CSV."""
    fact_target = MockTarget('facts_in', format=UTF8)
    input_mock.return_value = fact_target
    output_target = MockTarget('post_out', format=UTF8)
    output_mock.return_value = output_target

    with open(f'{IG_TEST_DATA}/audience_origin_actual.json',
              'r', encoding='utf-8') as json_in:
        raw_json = json_in.read()
    with open(f'{IG_TEST_DATA}/audience_origin_expected.csv',
              'r', encoding='utf-8') as csv_in:
        expected_csv = csv_in.read()

    request_mock.return_value = MagicMock(
        ok=True, json=lambda: json.loads(raw_json))

    instagram.MuseumFacts().run()
    with freeze_time('2020-01-01 00:00:05'):
        # Use city mode for testing, though the transformation is the
        # same for countries. The only difference between the two is
        # the received data, which cannot be tested here.
        instagram.FetchIgAudienceOrigin(
            columns=[
                col[0] for col in instagram.IgAudienceCityToDb().columns
            ],
            country_mode=False
        ).run()

    with output_target.open('r') as produced:
        self.assertEqual(produced.read(), expected_csv)
def prepare_post_performance_mocks(self, input_mock, output_mock, requests_get_mock, actual_json):
    """Wire up input/output targets and a stubbed API response.

    Returns the output target so callers can inspect task results.
    """
    input_target = MockTarget('posts_in', format=UTF8)
    input_mock.return_value = input_target
    output_target = MockTarget('insights_out', format=UTF8)
    output_mock.return_value = output_target

    # Copy the posts fixture into the task's input target.
    with open(f'{FB_TEST_DATA}/post_expected_single.csv',
              'r', encoding='utf-8') as posts_input:
        posts_csv = posts_input.read()
    with input_target.open('w') as posts_target:
        posts_target.write(posts_csv)

    with open(f'{FB_TEST_DATA}/{actual_json}',
              'r', encoding='utf-8') as json_in:
        input_json = json_in.read()

    requests_get_mock.return_value = MagicMock(
        ok=True, json=lambda: json.loads(input_json))
    return output_target
def test_post_transformation(self, fact_mock, output_mock, request_mock):
    """Fetched IG posts are written out exactly as the fixture CSV."""
    fact_target = MockTarget('facts_in', format=UTF8)
    fact_mock.return_value = fact_target
    output_target = MockTarget('post_out', format=UTF8)
    output_mock.return_value = output_target

    with open(f'{IG_TEST_DATA}/post_actual.json',
              'r', encoding='utf-8') as json_in:
        raw_json = json_in.read()
    with open(f'{IG_TEST_DATA}/post_expected.csv',
              'r', encoding='utf-8') as csv_in:
        expected_csv = csv_in.read()

    request_mock.return_value = MagicMock(
        ok=True, json=lambda: json.loads(raw_json))

    self.run_task(instagram.FetchIgPosts())

    with output_target.open('r') as produced:
        self.assertEqual(produced.read(), expected_csv)
def _touch(self, path):
    """Create an empty mock target at the given path."""
    with MockTarget(path).open('w'):
        pass
def test_mode_none_error(self):
    """Opening a target with mode=None raises a TypeError."""
    target = MockTarget("foo")
    with self.assertRaises(TypeError):
        with target.open(None) as handle:
            handle.write("bar")
def test_fetch_twitter(self, output_mock):
    """Integration test! We post a real tweet and then try to fetch it."""
    output_target = MockTarget('raw_out', format=UTF8)
    output_mock.return_value = output_target
    # Random marker that makes this run's tweet uniquely searchable.
    sample = "TestBarberiniAnalyticsFetchTwitter" \
        + self.generate_random_hex_string(12)
    text = (
        f"This is an automated random tweet for integration testing of "
        "BarberiniAnalytics.\n\n"
        f"{sample}\n\n"
        "For more information, see: https://github.com/Museum-Barberini/"
        "Barberini-Analytics/blob/master/tests/test_twitter.py"
    )
    # ARRANGE
    tweet = self.post_tweet(text)
    time.sleep(3)  # Wait for the tweet to be processed
    # ACT
    FetchTwitter(
        query=sample,
        timespan=dt.timedelta(days=1)
    ).run()
    with output_target.open('r') as output_file:
        output_df = pd.read_csv(output_file)
    # Dirty workaround for pandas's peculiarities regarding default values
    none = object()
    output_df = output_df.fillna(none)
    # Normalize whitespace so wrapped tweet text still compares equal.
    output_df['text'] = output_df['text'].apply(
        lambda text: re.sub(r'\s+', ' ', text)
    )
    output_df['timestamp'] = output_df['timestamp'].apply(
        date_parser.parse)
    # ASSERT
    expected_tweet = {
        'tweet_id': tweet['id'],
        'user_id': tweet['user_id'],
        'parent_tweet_id': none,
        'timestamp': tweet['created_at']
    }
    expected_text = re.sub(r'\s+', ' ', tweet['text'])
    # Accept a prefix match up to '…' plus the unique sample marker —
    # presumably because the API truncates long tweets with an
    # ellipsis; confirm against the fetcher.
    text_predicates = [
        lambda text: text.startswith(expected_text.split('…')[0]),
        lambda text: sample in text
    ]
    # Narrow the fetched frame field by field, keeping the previous
    # frame around so failures can report the candidate values.
    filtered_df = output_df
    for key, value in expected_tweet.items():
        previous_df = filtered_df
        filtered_df = filtered_df[filtered_df[key] == value]
        self.assertTrue(
            len(filtered_df) >= 1,
            f"Did not find any tweet with {key} = {value}, "
            f"values are: {previous_df[key]}")
    for index, predicate in enumerate(text_predicates):
        previous_df = filtered_df
        filtered_df = filtered_df[filtered_df['text'].apply(predicate)]
        self.assertTrue(
            len(filtered_df) >= 1,
            f"Did not find any tweet with text matching predicate {index}"
            f"\n\nValues are: {previous_df['text']}")