예제 #1
0
def song_etl():
    """Run the song-data ETL: read song JSON files, insert artists and songs.

    Steps, in order:

    1. Build the source directory string ``<cwd>/data/song_data/`` and pass
       it to ``FileFinder`` together with the ``*.json`` pattern.
    2. Feed the file names reported by the finder to ``DataLoader`` and wrap
       what ``create_json_from_files()`` returns in a ``DataFilter``.
    3. Ask the filter for an artist subset and a song subset. The second
       list passed to ``return_unique_dataframe_subset`` is presumably the
       set of columns used for uniqueness -- confirm against ``DataFilter``.
    4. Create a ``DatabaseWrapper`` and issue one batch query per subset,
       using the ``artist_insert`` and ``song_insert`` entries of ``query``.

    Takes no arguments and returns ``None``.
    """

    def _rows(frame):
        # Plain tuples, without the index, one per row.
        return list(frame.itertuples(index=False, name=None))

    source_dir = os.getcwd() + '/data/song_data/'
    json_files = FileFinder(source_dir, '*.json').return_file_names()
    songs_filter = DataFilter(DataLoader(json_files).create_json_from_files())

    artist_columns = [
        'artist_id',
        'artist_name',
        'artist_location',
        'artist_longitude',
        'artist_latitude',
    ]
    artists = songs_filter.return_unique_dataframe_subset(
        artist_columns, ['artist_id', 'artist_name'])

    song_columns = ['song_id', 'title', 'year', 'duration', 'artist_name']
    songs = songs_filter.return_unique_dataframe_subset(
        song_columns, ['song_id', 'title'])

    # The wrapper is created only after both subsets exist, and artists are
    # written before songs.
    db = DatabaseWrapper()
    for query_key, frame in (('artist_insert', artists),
                             ('song_insert', songs)):
        db.execute_batch_query(query[query_key], _rows(frame))
예제 #2
0
def log_etl():
    """Run the log-data ETL: read log JSON files, insert users, timestamps
    and songplays.

    Steps, in order:

    1. Build the source directory string ``<cwd>/data/log_data/`` and pass
       it to ``FileFinder`` together with the ``*.json`` pattern.
    2. Feed the file names reported by the finder to ``DataLoader`` and wrap
       what ``create_json_from_files()`` returns in a ``DataFilter``.
    3. Ask the filter for three subsets: users, timestamps and songplays.
    4. Create a ``DatabaseWrapper`` and issue three batch queries, using the
       ``user_insert``, ``timestamp_insert`` and ``songplay_insert`` entries
       of ``query``. Timestamp rows are passed through ``unpack_timestamp``
       before being sent.

    Takes no arguments and returns ``None``.
    """

    def _rows(frame):
        # Plain tuples, without the index, one per row.
        return list(frame.itertuples(index=False, name=None))

    source_dir = os.getcwd() + '/data/log_data/'
    json_files = FileFinder(source_dir, '*.json').return_file_names()
    log_filter = DataFilter(DataLoader(json_files).create_json_from_files())

    users = log_filter.return_unique_dataframe_subset(
        ['firstName', 'lastName', 'gender', 'level'],
        ['firstName', 'lastName'],
    )

    timestamps = log_filter.return_unique_dataframe_subset(
        ['ts', 'firstName', 'lastName'],
    )

    # NOTE(review): 'artist' appears twice on purpose-or-by-accident; it is
    # kept as-is because the songplay insert statement may bind it to two
    # separate placeholders -- confirm against query['songplay_insert'].
    songplay_columns = [
        'ts',
        'firstName',
        'lastName',
        'level',
        'song',
        'artist',
        'artist',
        'sessionId',
    ]
    songplays = log_filter.return_unique_dataframe_subset(songplay_columns)

    # The wrapper is created only after all three subsets exist.
    db = DatabaseWrapper()

    db.execute_batch_query(query['user_insert'], _rows(users))

    db.execute_batch_query(
        query['timestamp_insert'],
        [
            unpack_timestamp(row)
            for row in timestamps.itertuples(index=False, name=None)
        ],
    )

    db.execute_batch_query(query['songplay_insert'], _rows(songplays))