Пример #1
0
    def import_dataset(self, dataset: MovieLensDataSet):
        genres: dict[str, Genre] = {
            genre.name: genre
            for genre in Genre.objects.all()
        }

        movies_added = 0
        genres_added = 0
        users_added = 0
        ratings_added = 0
        tags_added = 0

        for raw_movies in chunked(read_csv_dicts(dataset.movies, unit='movie'), 500):
            movies = []
            genres_encountered = set()
            for raw_movie in raw_movies:
                raw_title = raw_movie['title'].strip()
                if match := re.match(r'^(.+) \((\d{4})\)$', raw_title):
                    title, year = match.groups()
                else:
                    title, year = raw_title, None

                movie_id = int(raw_movie['movieId'])
                movie_genres = set(raw_movie['genres'].split('|'))
                genres_encountered |= movie_genres

                movie = Movie(id=movie_id, title=title, year=year)
                movie._raw_genres = movie_genres

                movies.append(movie)

            movies_added += len(Movie.objects.bulk_create(movies, ignore_conflicts=True))

            if missing_genre_names := genres_encountered - set(genres):
                missing_genres: list[Genre] = []

                for genre_name in missing_genre_names:
                    genre = Genre(name=genre_name)
                    genres[genre_name] = genre
                    missing_genres.append(genre)

                genres_added += len(Genre.objects.bulk_create(missing_genres))