import logging
import os
import subprocess

# FileFinder, DataLoader, has_hashable_key and clean_dataframe_of_non_alphanumeric_characters
# are project-local helpers assumed to be importable from the surrounding package.

def load_dataframe_from_files(file_list):
    '''
    Receives the list of files found by the file_finder, wraps it in a DataLoader
    and returns a single dataframe built from the song and log data.
    '''
    data_loader = DataLoader(file_list)
    return data_loader.create_dataframe_from_files()
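
# A minimal usage sketch (an assumption, not part of the original snippet); it relies on
# FileFinder taking a directory and a glob pattern, as it does in create_aggregate_csv below.
#
#   finder = FileFinder(os.getcwd() + '/event_data', '*.csv')
#   frame = load_dataframe_from_files(list(finder.return_file_names()))
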
def create_aggregate_csv():
    '''Aggregate the raw event CSVs under ./event_data into event, user/session and user/song CSVs.'''

    event_columns = [
        'sessionId', 'itemInSession', 'artist', 'firstName', 'gender',
        'lastName', 'length', 'level', 'location', 'song', 'userId'
    ]
    user_columns = [
        'userId', 'sessionId', 'artist', 'firstName', 'gender',
        'itemInSession', 'lastName', 'length', 'level', 'location', 'song'
    ]
    user_and_song_columns = ['song', 'firstName', 'lastName']

    event_data_path = os.getcwd() + '/event_data'
    aggregate_csv_path = os.getcwd() + '/event_datafile_new.csv'
    user_and_session_path = os.getcwd() + '/user_and_session.csv'
    user_and_song_path = os.getcwd() + '/user_and_song.csv'

    file_finder = FileFinder(event_data_path, '*.csv')
    all_csv_files = file_finder.return_file_names()
    data_loader = DataLoader(all_csv_files)
    csv_dataframe = data_loader.create_dataframe_from_files()

    # Keep only rows whose key columns hold usable values; each filtered
    # frame below feeds one of the three CSV outputs.
    event_frame = csv_dataframe[
        csv_dataframe.itemInSession.apply(has_hashable_key)
        & csv_dataframe.sessionId.apply(has_hashable_key)]

    user_frame = csv_dataframe[
        csv_dataframe.userId.apply(has_hashable_key)
        & csv_dataframe.sessionId.apply(has_hashable_key)]

    user_and_song_frame = csv_dataframe[csv_dataframe.song.apply(
        has_hashable_key)]

    event_frame[event_columns].to_csv(path_or_buf=aggregate_csv_path,
                                      index=False)

    user_frame[user_columns].to_csv(path_or_buf=user_and_session_path,
                                    index=False)

    user_and_song_frame[user_and_song_columns].to_csv(
        path_or_buf=user_and_song_path, index=False)
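
# has_hashable_key is a project helper not shown in this snippet. A hedged sketch of
# the behaviour assumed by the filters above (reject NaN and empty key values):
#
#   def has_hashable_key(value):
#       # NaN is the only value that is not equal to itself.
#       return value == value and value not in (None, '')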
def s3_to_gzip(data_type, columns, path):
    '''
    Syncs the given data set from S3 into a local temp directory, loads the JSON
    files into a dataframe, cleans the listed columns and writes gzip and plain
    CSV staging files under <path>/staging.
    '''
    logger = logging.getLogger(__name__)

    temp_path = path + f'/tmp/{data_type}'
    staging_path = path + '/staging'
    logger.info(f'Syncing {data_type} to {temp_path}')

    if not os.path.exists(temp_path):
        os.makedirs(temp_path)

    # Mirror the S3 prefix locally; requires the AWS CLI to be installed and configured.
    subprocess.run(f'aws s3 sync s3://udacity-dend/{data_type} {temp_path}', shell=True, check=True)

    file_finder = FileFinder(temp_path + '/', '*.json')
    file_names = list(file_finder.return_file_names())

    data_loader = DataLoader(file_names)
    dataframe = data_loader.create_dataframe_from_files()

    clean_dataframe_of_non_alphanumeric_characters(dataframe, columns)

    logger.info(f'saving {data_type} staging file to {staging_path}')
    if not os.path.exists(staging_path):
        os.makedirs(staging_path)

    # Gzipped, headerless copy.
    dataframe.to_csv(
        staging_path + f'/{data_type}.gz',
        header=False,
        index=False,
        compression='gzip'
    )

    # Uncompressed copy with a header row.
    dataframe.to_csv(
        staging_path + f'/{data_type}.csv',
        header=True,
        index=False
    )
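

# Illustrative entry point (an assumption, not part of the original snippet). The
# 'song_data' prefix and the column list passed to s3_to_gzip are hypothetical example
# values; the actual prefixes and cleaning columns depend on the surrounding project.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    # Build the local aggregate CSVs from ./event_data.
    create_aggregate_csv()

    # Stage one S3 data set into ./staging as gzip and plain CSV files.
    s3_to_gzip('song_data', ['title', 'artist_name'], os.getcwd())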