def import_dataset(self, dataset: MovieLensDataSet):
    """Import the movies of a MovieLens dataset, creating genres on the fly.

    Rows are read from ``dataset.movies`` through ``read_csv_dicts`` and
    processed in chunks of 500. For every chunk:

    * each row becomes an unsaved ``Movie`` (``id`` from ``movieId``, title
      split into ``title`` and ``year`` when it ends in `` (YYYY)``);
    * the chunk is bulk-inserted with ``ignore_conflicts=True``, so rows that
      conflict with existing ones are skipped rather than raising;
    * a ``Genre`` row is created for every genre name not yet known.

    Args:
        dataset: The dataset to import; only its ``movies`` attribute is read
            by the code visible here.

    NOTE(review): ``users_added``, ``ratings_added``, ``tags_added`` and the
    per-movie ``_raw_genres`` attribute are prepared but not consumed in the
    portion of the method visible to this review; they are kept untouched on
    the assumption that later code relies on them -- confirm.
    """
    # Name -> Genre cache seeded from the database so that only genuinely new
    # genre names are inserted below.
    genres: dict[str, Genre] = {
        genre.name: genre for genre in Genre.objects.all()
    }

    movies_added = 0
    genres_added = 0
    users_added = 0
    ratings_added = 0
    tags_added = 0

    # Compiled once: matches "Some Title (1995)" -> ("Some Title", "1995").
    title_with_year = re.compile(r'^(.+) \((\d{4})\)$')

    for raw_movies in chunked(read_csv_dicts(dataset.movies, unit='movie'), 500):
        movies = []
        genres_encountered = set()

        for raw_movie in raw_movies:
            raw_title = raw_movie['title'].strip()
            if match := title_with_year.match(raw_title):
                title, year = match.groups()
            else:
                # No trailing "(YYYY)": keep the whole string as the title.
                title, year = raw_title, None

            movie_id = int(raw_movie['movieId'])
            # Drop empty tokens (blank field, or stray/trailing '|') so no
            # Genre with an empty name is ever created.
            movie_genres = {
                name for name in raw_movie['genres'].split('|') if name
            }
            genres_encountered |= movie_genres

            movie = Movie(id=movie_id, title=title, year=year)
            # Stashed on the instance for later genre linking (not visible
            # in this part of the method).
            movie._raw_genres = movie_genres
            movies.append(movie)

        # bulk_create() returns the list it was given even when
        # ignore_conflicts=True skips rows, so its length would overcount on
        # a re-import. Count only ids that were absent before the insert
        # (the set also collapses duplicate ids within the chunk).
        chunk_ids = {movie.id for movie in movies}
        existing_ids = set(
            Movie.objects.filter(id__in=chunk_ids).values_list('id', flat=True)
        )
        Movie.objects.bulk_create(movies, ignore_conflicts=True)
        movies_added += len(chunk_ids - existing_ids)

        if missing_genre_names := genres_encountered - set(genres):
            missing_genres: list[Genre] = []
            # Sorted so the insertion order is reproducible between runs
            # instead of following arbitrary set order.
            for genre_name in sorted(missing_genre_names):
                genre = Genre(name=genre_name)
                genres[genre_name] = genre
                missing_genres.append(genre)
            genres_added += len(Genre.objects.bulk_create(missing_genres))