예제 #1
0
def process_imdb():
    """
    Build and store the processed IMDb artefacts.

    Downloads the titles and ratings archives, hands both to
    ``hf.clean_imdb_data`` and saves the resulting dataframe, then
    extracts the unique genres from that dataframe and saves them too.
    Takes no arguments and returns nothing; all results are written
    through ``hf.save_file`` under ``PROCESSED_DIR``.
    """
    # Fetch titles first, then ratings, and combine them in one step.
    imdb_frame = hf.clean_imdb_data(
        hf.download_gz_file(IMDB_TITLES_URL),
        hf.download_gz_file(IMDB_RATINGS_URL),
    )
    hf.save_file(imdb_frame, PROCESSED_DIR, IMDB_FILE_NAME, CSV_EXT)

    # The genre list is derived from the frame that was just saved.
    hf.save_file(
        hf.get_unique_genres(imdb_frame),
        PROCESSED_DIR,
        GENRES_FILE_NAME,
        PKL_EXT,
    )
예제 #2
0
def process_netflix():
    """
    Process the Netflix dataset: download and parse the raw data, then
    build and store the final files.

    Steps, in order:
      1. Download the raw data into ``NF_DIRECTORY``.
      2. Parse every file named in ``LIST_NF_FILES`` and collect the
         parsed rows into a single dataframe with columns ``DF_NF_COLS``.
      3. Compute the recommendation dictionary from that dataframe and
         save it with ``PKL_EXT``.
      4. Format the movie titles file and save it with ``CSV_EXT``.
      5. Delete ``NF_DIRECTORY``.

    Takes no arguments and returns nothing; results are written through
    ``hf.save_file`` under ``PROCESSED_DIR``.
    """
    # Download the files, unzip them and get the data in a dataframe.
    hf.download_netflix_data(NF_KAGGLE_USER, NF_DIRECTORY)
    list_nf_data = []

    for file_name in LIST_NF_FILES:
        list_nf_data.extend(
            hf.parse_data(os.path.join(NF_DIRECTORY, file_name))
        )

    # Build the dataframe once, after every file has been parsed.  Doing
    # it inside the loop rebuilt the frame from the whole accumulated list
    # on each iteration and left ``df_netflix`` unbound when
    # ``LIST_NF_FILES`` was empty.
    df_netflix = pd.DataFrame(list_nf_data, columns=DF_NF_COLS)

    # Get the movie recommendation dictionary and store in data folder.
    dict_recommendations = hf.get_recommended_movies(df_netflix)
    hf.save_file(dict_recommendations, PROCESSED_DIR, DICT_NAME, PKL_EXT)

    # Clean the movie_titles file.
    # NOTE(review): presumably TITLES_PATH lives inside NF_DIRECTORY, which
    # is why this runs before the directory is removed - confirm.
    df_titles = hf.format_movie_titles(TITLES_PATH)
    hf.save_file(df_titles, PROCESSED_DIR, TITLE_FILE_NAME, CSV_EXT)

    # Delete the original Netflix dataset directory.
    shutil.rmtree(NF_DIRECTORY)
    # NOTE(review): this fragment reads like the body of a loop whose header
    # is outside the visible source; acquisition_data, acq_i, stim_names,
    # input_file, nwbfile, electrode and stim_ind are all defined elsewhere.
    # Confirm the enclosing scope before relying on these comments.

    # Type tag of the current acquisition; it is the lookup key for both
    # copy-function tables below.
    acquisition_type = acquisition_data.neurodata_type

    # Name at position acq_i, then the input file's stimulus entry stored
    # under that name.
    series_name = stim_names[acq_i]
    stimulus_data = input_file.stimulus.get(series_name)

    # Both tables are indexed by the acquisition's type, so the stimulus
    # copier is also selected by the acquisition type.
    acq_copy_func = acquisition_copy_functions[acquisition_type]
    stim_copy_func = stimulus_copy_functions[acquisition_type]

    # Build the output objects; the stimulus copier receives the
    # acquisition data as well as the stimulus data.
    output_acq = acq_copy_func(nwbfile, acquisition_data, series_name,
                               electrode)
    output_stim = stim_copy_func(nwbfile, acquisition_data, stimulus_data,
                                 series_name, electrode)

    # Attach both results to the output file.
    nwbfile.add_acquisition(output_acq)
    nwbfile.add_stimulus(output_stim)

    # NOTE(review): the name lookup above uses acq_i while the counter
    # advanced here is stim_ind - verify the two are meant to differ.
    stim_ind += 1

# Hand the assembled nwbfile to save_file with the path formed by plain
# string concatenation of base_dir and output_filename, then close i_io.
# NOTE(review): presumably i_io is the reader opened on the input file -
# confirm; it is not closed if save_file raises.
save_file(base_dir + output_filename, nwbfile)
i_io.close()

## Print data of the new file for testing purposes
# Acquisition and stimulus names are paired positionally; zip stops at the
# shorter of the two key sequences.
for acq_name, stim_name in zip(nwbfile.acquisition.keys(),
                               nwbfile.stimulus.keys()):
    print(f'{acq_name} {stim_name}')
    # Each description attribute is parsed as JSON text.
    acq_desc = json.loads(nwbfile.acquisition[acq_name].description)
    stim_desc = json.loads(nwbfile.stimulus[stim_name].description)

    # Print each acquisition value beside the stimulus value stored under
    # the same key; only the acquisition's keys are iterated.
    for key in acq_desc.keys():
        print(f'{acq_desc[key]} {stim_desc[key]}')