def load_dataframe_from_files(file_list):
    '''Aggregate the files found by the file_finder into a single
    dataframe of song and log data.'''
    data_loader = DataLoader(file_list)
    return data_loader.create_dataframe_from_files()
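# The FileFinder and DataLoader classes used throughout this module are
# project-specific helpers and are not defined in this section. The following
# is a minimal sketch of the interface assumed here (glob-based file discovery
# plus a pandas concat over the matched files); if the project already imports
# real implementations, prefer those and drop this sketch.
import glob
import os

import pandas as pd


class FileFinder:
    '''Hypothetical sketch: find files under a root path matching a pattern.'''

    def __init__(self, root_path, pattern):
        self.root_path = root_path
        self.pattern = pattern

    def return_file_names(self):
        # Recursively walk the tree and yield every path matching the pattern.
        return glob.iglob(os.path.join(self.root_path, '**', self.pattern),
                          recursive=True)


class DataLoader:
    '''Hypothetical sketch: load a list of CSV/JSON files into one dataframe.'''

    def __init__(self, file_names):
        self.file_names = list(file_names)

    def create_dataframe_from_files(self):
        frames = []
        for name in self.file_names:
            if name.endswith('.json'):
                # The raw song/log files are assumed to be line-delimited JSON.
                frames.append(pd.read_json(name, lines=True))
            else:
                frames.append(pd.read_csv(name))
        return pd.concat(frames, ignore_index=True)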
def create_aggregate_csv():
    '''Aggregate the raw event CSV files into the denormalised CSV files
    used downstream, one file per query pattern.'''
    event_columns = [
        'sessionId', 'itemInSession', 'artist', 'firstName', 'gender',
        'lastName', 'length', 'level', 'location', 'song', 'userId'
    ]
    user_columns = [
        'userId', 'sessionId', 'artist', 'firstName', 'gender',
        'itemInSession', 'lastName', 'length', 'level', 'location', 'song'
    ]
    user_and_song_columns = ['song', 'firstName', 'lastName']

    event_data_path = os.getcwd() + '/event_data'
    aggregate_csv_path = os.getcwd() + '/event_datafile_new.csv'
    user_and_session_path = os.getcwd() + '/user_and_session.csv'
    user_and_song_path = os.getcwd() + '/user_and_song.csv'

    file_finder = FileFinder(event_data_path, '*.csv')
    all_csv_files = file_finder.return_file_names()
    data_loader = DataLoader(all_csv_files)
    csv_dataframe = data_loader.create_dataframe_from_files()

    # Keep only rows whose key columns hold usable (non-missing) values.
    event_frame = csv_dataframe[
        csv_dataframe.itemInSession.apply(has_hashable_key)
        & csv_dataframe.sessionId.apply(has_hashable_key)]
    user_frame = csv_dataframe[
        csv_dataframe.userId.apply(has_hashable_key)
        & csv_dataframe.sessionId.apply(has_hashable_key)]
    user_and_song_frame = csv_dataframe[
        csv_dataframe.song.apply(has_hashable_key)]

    event_frame[event_columns].to_csv(path_or_buf=aggregate_csv_path,
                                      index=False)
    user_frame[user_columns].to_csv(path_or_buf=user_and_session_path,
                                    index=False)
    user_and_song_frame[user_and_song_columns].to_csv(
        path_or_buf=user_and_song_path, index=False)
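# `has_hashable_key` is referenced above but not defined in this section. The
# way it is used suggests it drops rows whose key column is missing or empty,
# so the remaining values can serve as (composite) primary-key components. A
# minimal sketch, assuming that intent:
import pandas as pd


def has_hashable_key(value):
    '''Hypothetical helper: True when `value` is present and non-empty.'''
    if pd.isna(value):
        return False
    if isinstance(value, str) and value.strip() == '':
        return False
    return True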
def s3_to_gzip(data_type, columns, path):
    '''Sync the raw JSON files for `data_type` from S3, clean the named
    columns, and write gzip-compressed and plain CSV staging files
    under `path`.'''
    logger = logging.getLogger(__name__)
    temp_path = path + f'/tmp/{data_type}'
    staging_path = path + '/staging'

    logger.info(f'Syncing {data_type} to {temp_path}')
    if not os.path.exists(temp_path):
        os.makedirs(temp_path)
    subprocess.run(f'aws s3 sync s3://udacity-dend/{data_type} {temp_path}',
                   shell=True, check=True)

    file_finder = FileFinder(temp_path + '/', '*.json')
    file_names = list(file_finder.return_file_names())
    data_loader = DataLoader(file_names)
    dataframe = data_loader.create_dataframe_from_files()
    clean_dataframe_of_non_alphanumeric_characters(dataframe, columns)

    logger.info(f'Saving {data_type} staging file to {staging_path}')
    if not os.path.exists(staging_path):
        os.makedirs(staging_path)
    dataframe.to_csv(
        staging_path + f'/{data_type}.gz',
        header=False,
        index=False,
        compression='gzip'
    )
    dataframe.to_csv(
        staging_path + f'/{data_type}.csv',
        header=True,
        index=False
    )
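# Example invocation of the staging helper above. This is a sketch: the S3
# prefixes ('song_data', 'log_data') follow the udacity-dend bucket layout,
# but the column lists passed for cleaning are assumptions, not taken from
# this module. `os` and `logging` are expected to be imported at module
# level, since the functions above already rely on them.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    staging_root = os.getcwd()
    s3_to_gzip('song_data', ['title', 'artist_name'], staging_root)
    s3_to_gzip('log_data', ['song', 'artist'], staging_root)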