def song_etl():
    """Run the song-data ETL step.

    Looks for ``*.json`` files in ``data/song_data/`` under the current
    working directory, loads them through ``DataLoader``, carves out an
    artist subset and a song subset with ``DataFilter``, and hands each
    subset to ``DatabaseWrapper.execute_batch_query`` as a list of plain
    tuples together with the matching entry of ``query``.

    Takes no arguments and returns nothing.
    """

    def _as_rows(frame):
        # Plain tuples, without the index, in the frame's column order.
        # NOTE(review): assumes ``frame`` is a pandas DataFrame (it only
        # needs to provide ``itertuples``) -- confirm against DataFilter.
        return list(frame.itertuples(index=False, name=None))

    source_dir = os.getcwd() + '/data/song_data/'
    json_files = FileFinder(source_dir, '*.json').return_file_names()
    songs_frame = DataLoader(json_files).create_json_from_files()
    frame_filter = DataFilter(songs_frame)

    # The second list is presumably the set of columns used to decide
    # uniqueness -- verify against DataFilter.return_unique_dataframe_subset.
    artist_columns = [
        'artist_id',
        'artist_name',
        'artist_location',
        'artist_longitude',
        'artist_latitude',
    ]
    artists = frame_filter.return_unique_dataframe_subset(
        artist_columns,
        ['artist_id', 'artist_name'],
    )

    song_columns = ['song_id', 'title', 'year', 'duration', 'artist_name']
    songs = frame_filter.return_unique_dataframe_subset(
        song_columns,
        ['song_id', 'title'],
    )

    # Artists are written before songs, matching the original call order.
    db = DatabaseWrapper()
    db.execute_batch_query(query['artist_insert'], _as_rows(artists))
    db.execute_batch_query(query['song_insert'], _as_rows(songs))
def log_etl():
    """Run the log-data ETL step.

    Looks for ``*.json`` files in ``data/log_data/`` under the current
    working directory, loads them through ``DataLoader``, and derives three
    subsets with ``DataFilter``: users, timestamps and songplays. Each
    subset is converted to a list of plain tuples and passed to
    ``DatabaseWrapper.execute_batch_query`` with the matching entry of
    ``query``; timestamp rows are first run through ``unpack_timestamp``.

    Takes no arguments and returns nothing.
    """

    def _as_rows(frame):
        # Plain tuples, without the index, in the frame's column order.
        # NOTE(review): assumes ``frame`` is a pandas DataFrame (it only
        # needs to provide ``itertuples``) -- confirm against DataFilter.
        return list(frame.itertuples(index=False, name=None))

    source_dir = os.getcwd() + '/data/log_data/'
    json_files = FileFinder(source_dir, '*.json').return_file_names()
    frame_filter = DataFilter(DataLoader(json_files).create_json_from_files())

    # The second list is presumably the set of columns used to decide
    # uniqueness -- verify against DataFilter.return_unique_dataframe_subset.
    users = frame_filter.return_unique_dataframe_subset(
        ['firstName', 'lastName', 'gender', 'level'],
        ['firstName', 'lastName'],
    )

    timestamps = frame_filter.return_unique_dataframe_subset(
        ['ts', 'firstName', 'lastName'],
    )

    # NOTE(review): 'artist' is listed twice. Presumably the songplay insert
    # query binds the artist name to two placeholders -- confirm against the
    # query definition before changing this list.
    songplay_columns = [
        'ts',
        'firstName',
        'lastName',
        'level',
        'song',
        'artist',
        'artist',
        'sessionId',
    ]
    songplays = frame_filter.return_unique_dataframe_subset(songplay_columns)

    # Insert order (users, timestamps, songplays) matches the original.
    db = DatabaseWrapper()
    db.execute_batch_query(query['user_insert'], _as_rows(users))

    timestamp_rows = [
        unpack_timestamp(row)
        for row in timestamps.itertuples(name=None, index=False)
    ]
    db.execute_batch_query(query['timestamp_insert'], timestamp_rows)

    db.execute_batch_query(query['songplay_insert'], _as_rows(songplays))