ratings.columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']
# data = ratings.merge(users, on=['UserID'])
# data = data.merge(movies, on=['MovieID'])

# %%
# --- movies preprocessing ---
# Assumes every title ends with " (YYYY)": the four characters inside the
# trailing parentheses become the release year, and the title with that
# 7-character suffix removed is label-encoded.
movies['publish_date'] = movies.Title.str[-5:-1].astype(int)
movies['Title'] = LabelEncoder().fit_transform(movies.Title.str[:-7])

from sklearn.preprocessing import MultiLabelBinarizer

# Multi-hot encode the '|'-separated genre list of each movie.
movie_genres = MultiLabelBinarizer().fit_transform(
    movies.Genres.map(lambda x: x.split('|')))
# Reuse the index of `movies`: the column-wise concat below aligns on index
# labels, so a fresh 0..n-1 index would misalign rows (and introduce NaNs)
# whenever `movies` does not carry a default RangeIndex.
movie_genres = pd.DataFrame(movie_genres, index=movies.index)
movie_genres.columns = [
    'Genres_%d' % i for i in range(len(movie_genres.columns))
]
movies = pd.concat([movies, movie_genres], axis=1)

# --- users preprocessing ---
users = users.drop(['Zip-code'], axis=1)
users['Gender'] = LabelEncoder().fit_transform(users.Gender)

# --- ratings preprocessing ---
# Order each user's ratings chronologically so the joined sequence below
# follows Timestamp order.
ratings = ratings.sort_values(['UserID', 'Timestamp'])

# Build the comma-separated MovieID sequence of every user and broadcast it to
# each of that user's rows. A temporary string view is used so the MovieID
# column itself is not rewritten just to be able to join it.
movie_ids_as_text = ratings['MovieID'].astype(str)
watching_seq = movie_ids_as_text.groupby(ratings['UserID']).transform(
    lambda ids: ','.join(ids))
# The script has always left MovieID as an integer column at this point.
ratings['MovieID'] = ratings['MovieID'].astype(int)
# `watching_seq` is indexed by row label (it is a transform result), so it is
# assigned directly. Mapping UserID through it would look user ids up among
# row labels and attach some other row's sequence (or NaN).
ratings['watching_seq'] = watching_seq

# NOTE(review): assumes Timestamp holds integer Unix epoch *seconds* (as in the
# MovieLens ratings file) - confirm. Without `unit`, pandas reads integers as
# nanoseconds and every date collapses into January 1970.
dt = pd.to_datetime(ratings.Timestamp, unit='s').dt
ratings['day'] = dt.day