def test_without_category_feature():
    """Run WideDeep once on a feature set that has only sequence features.

    The number and category feature lists are empty.  The test makes no
    assertions; it passes as long as calling the model on the first item
    yielded by the dataloader does not raise.
    """
    # Names handed to WideDeep as its wide / deep feature lists.
    wide_names = ['title', 'genres']
    deep_names = ['clickedMovieIds', 'clickedMovieTopGenres']

    # The wide columns use an encoder without max_len; the deep columns
    # are encoded with max_len=5.
    sequences = [
        Sequence(col, SequenceEncoder(sep='|', min_cnt=1))
        for col in wide_names
    ]
    sequences += [
        Sequence(col, SequenceEncoder(sep='|', min_cnt=1, max_len=5))
        for col in deep_names
    ]

    feature_spec = Features(
        number_features=[],
        category_features=[],
        sequence_features=sequences)

    # Only the first element returned by prepare_dataloader is needed here.
    loader, _ = prepare_dataloader(feature_spec)

    net = WideDeep(
        feature_spec, wide_names, deep_names, [],
        num_classes=2,
        embedding_size=4,
        hidden_layers=(8, 4),
        final_activation='sigmoid',
        dropout=0.3)
    first_batch = next(iter(loader))
    net(first_batch)
def test_without_number_feature():
    """Run DNN once on a feature set with no number features.

    Category and sequence features are present; the number feature list is
    empty.  The test makes no assertions and passes if the model call on the
    first item from the dataloader does not raise.
    """
    numbers = []
    categories = [
        Category(col, CategoryEncoder(min_cnt=1))
        for col in ('userId', 'movieId', 'topGenre')
    ]
    # The last two sequences are encoded with max_len=5.
    sequences = [
        Sequence(col, SequenceEncoder(sep='|', min_cnt=1))
        for col in ('title', 'genres')
    ] + [
        Sequence(col, SequenceEncoder(sep='|', min_cnt=1, max_len=5))
        for col in ('clickedMovieIds', 'clickedMovieTopGenres')
    ]

    feature_spec = Features(
        number_features=numbers,
        category_features=categories,
        sequence_features=sequences)

    # NOTE(review): the return value is iterated directly; confirm that
    # prepare_dataloader returns a bare dataloader here and not a tuple.
    loader = prepare_dataloader(feature_spec)

    net = DNN(
        feature_spec,
        num_classes=2,
        embedding_size=4,
        hidden_layers=(8, 4),
        final_activation='sigmoid',
        dropout=0.3)
    first_batch = next(iter(loader))
    net(first_batch)
def test_normal():
    """Run DeepFM once with number, category and sequence features present.

    The test makes no assertions; it passes if calling the model on the
    first item yielded by the dataloader does not raise.
    """
    numbers = [
        Number(col, StandardScaler())
        for col in ('userAge', 'rating')
    ]
    categories = [
        Category(col, CategoryEncoder(min_cnt=1))
        for col in ('userId', 'movieId', 'topGenre')
    ]
    # The last two sequences are encoded with max_len=5.
    sequences = [
        Sequence(col, SequenceEncoder(sep='|', min_cnt=1))
        for col in ('title', 'genres')
    ] + [
        Sequence(col, SequenceEncoder(sep='|', min_cnt=1, max_len=5))
        for col in ('clickedMovieIds', 'clickedMovieTopGenres')
    ]

    feature_spec = Features(
        number_features=numbers,
        category_features=categories,
        sequence_features=sequences)

    # Only the first element returned by prepare_dataloader is needed here.
    loader, _ = prepare_dataloader(feature_spec)

    net = DeepFM(
        feature_spec,
        num_classes=2,
        embedding_size=4,
        hidden_layers=(8, 4),
        final_activation='sigmoid',
        dropout=0.3)
    first_batch = next(iter(loader))
    net(first_batch)
def test_normal():
    """Run DIN once with a single attention group over two feature pairs.

    Each pair maps the 'ad' key to a category feature name and the
    'pos_hist' key to a sequence feature name.  The test makes no
    assertions; it passes if the model call does not raise.
    """
    numbers = [
        Number(col, StandardScaler())
        for col in ('userAge', 'rating')
    ]
    categories = [
        Category(col, CategoryEncoder(min_cnt=1))
        for col in ('userId', 'movieId', 'topGenre')
    ]
    # The last two sequences are encoded with max_len=5.
    sequences = [
        Sequence(col, SequenceEncoder(sep='|', min_cnt=1))
        for col in ('title', 'genres')
    ] + [
        Sequence(col, SequenceEncoder(sep='|', min_cnt=1, max_len=5))
        for col in ('clickedMovieIds', 'clickedMovieTopGenres')
    ]

    groups = [
        AttentionGroup(
            name='group1',
            pairs=[
                dict(ad='movieId', pos_hist='clickedMovieIds'),
                dict(ad='topGenre', pos_hist='clickedMovieTopGenres'),
            ],
            hidden_layers=[8, 4])
    ]

    feature_spec = Features(
        number_features=numbers,
        category_features=categories,
        sequence_features=sequences)

    # NOTE(review): the return value is iterated directly; confirm that
    # prepare_dataloader returns a bare dataloader here and not a tuple.
    loader = prepare_dataloader(feature_spec)

    net = DIN(
        feature_spec,
        attention_groups=groups,
        num_classes=2,
        embedding_size=4,
        hidden_layers=(16, 8),
        final_activation='sigmoid',
        dropout=0.3)
    first_batch = next(iter(loader))
    net(first_batch)
def test_column_flow_define():
    """Check Features.fit/transform when transformers are wrapped in ColumnFlow.

    'age' is declared with no transformer (None); 'height', 'gender' and
    'likes' each use a single-step ColumnFlow.  The transformed output is
    compared against fixed expected arrays.
    """
    feature_spec = Features(
        [
            Number('age', None),
            Number('height', ColumnFlow([StandardScaler()])),
        ],
        [
            Category('gender', ColumnFlow([CategoryEncoder(min_cnt=1)])),
        ],
        [
            Sequence('likes', ColumnFlow([SequenceEncoder(sep=' ', min_cnt=1)])),
        ])

    feature_spec.fit(__TEST_DATA)
    transformed = feature_spec.transform(__TEST_DATA)

    want_age = np.array([23, 43, 35, 41, 16, 32, 26, 76])
    want_gender = np.array([2, 2, 1, 2, 1, 1, 2, 2])
    want_height = np.array([
        0.1159659, 0.85814767, -0.99730676, -0.06957954,
        -1.73948853, -0.34789771, 0.48705679, 1.69310217
    ])

    # Structural checks first, then the per-column values.
    assert len(transformed) == 4
    assert feature_spec.number_feature_names() == ['age', 'height']
    assert feature_spec.category_feature_names() == ['gender']
    assert feature_spec.sequence_feature_names() == ['likes']

    np.testing.assert_array_equal(transformed['age'], want_age)
    np.testing.assert_array_equal(transformed['gender'], want_gender)
    # The scaled column is floating point, so compare approximately.
    np.testing.assert_array_almost_equal(transformed['height'], want_height)
def test_normal():
    """Check the first Dataset item built from fitted, transformed features.

    Fits the features on the sample frame, transforms it, wraps the result
    together with the label column in a Dataset, and asserts selected fields
    of item 0.
    """
    numbers = [
        Number(col, StandardScaler())
        for col in ('userAge', 'rating')
    ]
    categories = [
        Category(col, CategoryEncoder(min_cnt=1))
        for col in ('userId', 'movieId', 'topGenre')
    ]
    # The last two sequences are encoded with max_len=5.
    sequences = [
        Sequence(col, SequenceEncoder(sep='|', min_cnt=1))
        for col in ('title', 'genres')
    ] + [
        Sequence(col, SequenceEncoder(sep='|', min_cnt=1, max_len=5))
        for col in ('clickedMovieIds', 'clickedMovieTopGenres')
    ]

    feature_spec = Features(
        number_features=numbers,
        category_features=categories,
        sequence_features=sequences)
    feature_spec.fit(__SAMPLE_DF)
    encoded = feature_spec.transform(__SAMPLE_DF)

    samples = Dataset(feature_spec, encoded, __SAMPLE_DF.label.values)

    # Item 0 is looked up afresh for every assertion.
    assert samples[0]['userId'] == 1
    assert samples[0]['movieId'] == 1
    assert samples[0]['genres'].tolist() == [8, 9, 0, 0]
    assert samples[0]['__genres_length'] == 2
    assert samples[0]['label'] == 1
def create_test_data():
    """Build the dataloader, features and attention groups shared by tests.

    The single attention group holds two pairs, each naming an 'ad' category
    feature plus 'pos_hist' and 'neg_hist' sequence features.

    Returns:
        A tuple ``(dataloader, features, attention_groups)``, where
        ``dataloader`` is whatever ``prepare_dataloader(features)`` returns.
    """
    numbers = [
        Number(col, StandardScaler())
        for col in ('userAge', 'rating')
    ]
    categories = [
        Category(col, CategoryEncoder(min_cnt=1))
        for col in ('userId', 'movieId', 'topGenre')
    ]

    # The four history sequences are encoded with max_len=5.
    history_cols = (
        'clickedMovieIds',
        'clickedMovieTopGenres',
        'noClickedMovieIds',
        'noClickedMovieTopGenres',
    )
    sequences = [
        Sequence(col, SequenceEncoder(sep='|', min_cnt=1))
        for col in ('title', 'genres')
    ] + [
        Sequence(col, SequenceEncoder(sep='|', min_cnt=1, max_len=5))
        for col in history_cols
    ]

    groups = [
        AttentionGroup(
            name='group1',
            pairs=[
                dict(ad='movieId',
                     pos_hist='clickedMovieIds',
                     neg_hist='noClickedMovieIds'),
                dict(ad='topGenre',
                     pos_hist='clickedMovieTopGenres',
                     neg_hist='noClickedMovieTopGenres'),
            ],
            hidden_layers=[8, 4])
    ]

    feature_spec = Features(
        number_features=numbers,
        category_features=categories,
        sequence_features=sequences)

    loader = prepare_dataloader(feature_spec)

    return loader, feature_spec, groups
def create_test_data_with_sharing_emb():
    """Build test data where movie-id features use one fixed vocabulary.

    The 'movieId' category and the two clicked / not-clicked movie-id
    sequences are all encoded with the same predefined word<->index mapping,
    and an EmbeddingRef maps both sequence names to 'movieId'.

    Returns:
        A tuple ``(dataloader, features, attention_groups, embedding_ref)``,
        where ``dataloader`` is whatever ``prepare_dataloader(features)``
        returns.
    """
    numbers = [
        Number(col, StandardScaler())
        for col in ('userAge', 'rating')
    ]

    # Predefined vocabulary: a token's position in this list is its index,
    # so '__PAD__' is 0 and '__UNKNOWN__' is 18.
    movie_vocab = [
        '__PAD__', '4226', '5971', '6291', '7153', '30707', '3242', '42',
        '32', '34', '233', '291', '324', '325', '3542', '322', '33', '45',
        '__UNKNOWN__',
    ]
    word_to_idx = {token: pos for pos, token in enumerate(movie_vocab)}
    idx_to_word = dict(enumerate(movie_vocab))

    categories = [
        Category(
            'movieId',
            CategoryEncoder(word2idx=word_to_idx, idx2word=idx_to_word)),
        Category('topGenre', CategoryEncoder(min_cnt=1)),
    ]

    sequences = [
        Sequence(col, SequenceEncoder(sep='|', min_cnt=1))
        for col in ('title', 'genres')
    ] + [
        # Both history sequences use the shared vocabulary and max_len=5.
        Sequence(
            col,
            SequenceEncoder(sep='|',
                            max_len=5,
                            word2idx=word_to_idx,
                            idx2word=idx_to_word))
        for col in ('clickedMovieIds', 'noClickedMovieIds')
    ]

    groups = [
        AttentionGroup(
            name='group1',
            pairs=[
                dict(ad='movieId',
                     pos_hist='clickedMovieIds',
                     neg_hist='noClickedMovieIds'),
            ],
            hidden_layers=[8, 4])
    ]

    emb_ref = EmbeddingRef({
        'clickedMovieIds': 'movieId',
        'noClickedMovieIds': 'movieId'
    })

    feature_spec = Features(
        number_features=numbers,
        category_features=categories,
        sequence_features=sequences)

    loader = prepare_dataloader(feature_spec)

    return loader, feature_spec, groups, emb_ref
def test_shared_embedding():
    """Run DNN once with features that declare a common embedding_name.

    Two sequence encoders are fitted on the concatenation of a history
    column and its matching single-value column.  Their vocabularies are
    then reused for both the category feature and the sequence feature of
    each pair, which are tagged with the same ``embedding_name``.  The test
    makes no assertions; it passes if the model call does not raise.
    """
    numbers = []

    movie_vocab_enc = SequenceEncoder(sep='|', min_cnt=1, max_len=5)
    genre_vocab_enc = SequenceEncoder(sep='|', min_cnt=1, max_len=5)

    # axis=None makes np.concatenate flatten its inputs before joining.
    movie_values = np.concatenate(
        (_SAMPLE_DF.clickedMovieIds.values, _SAMPLE_DF.movieId.values),
        axis=None)
    movie_vocab_enc.fit(movie_values)

    genre_values = np.concatenate(
        (_SAMPLE_DF.clickedMovieTopGenres.values,
         _SAMPLE_DF.topGenre.values),
        axis=None)
    genre_vocab_enc.fit(genre_values)

    categories = [
        Category('userId', CategoryEncoder(min_cnt=1)),
        Category(
            'movieId',
            CategoryEncoder(min_cnt=1,
                            word2idx=movie_vocab_enc.word2idx,
                            idx2word=movie_vocab_enc.idx2word),
            embedding_name='movieId'),
        # The genre pair overrides the embedding size with 8.
        Category(
            'topGenre',
            CategoryEncoder(min_cnt=1,
                            word2idx=genre_vocab_enc.word2idx,
                            idx2word=genre_vocab_enc.idx2word),
            embedding_name='topGenre',
            embedding_size=8),
    ]

    sequences = [
        Sequence('title', SequenceEncoder(sep='|', min_cnt=1)),
        Sequence('genres', SequenceEncoder(sep='|', min_cnt=1)),
        Sequence(
            'clickedMovieIds',
            SequenceEncoder(sep='|',
                            min_cnt=1,
                            max_len=5,
                            word2idx=movie_vocab_enc.word2idx,
                            idx2word=movie_vocab_enc.idx2word),
            embedding_name='movieId'),
        Sequence(
            'clickedMovieTopGenres',
            SequenceEncoder(sep='|',
                            min_cnt=1,
                            max_len=5,
                            word2idx=genre_vocab_enc.word2idx,
                            idx2word=genre_vocab_enc.idx2word),
            embedding_name='topGenre',
            embedding_size=8),
    ]

    feature_spec = Features(
        number_features=numbers,
        category_features=categories,
        sequence_features=sequences)

    # Only the first element returned by prepare_dataloader is needed here.
    loader, _ = prepare_dataloader(feature_spec)

    net = DNN(
        feature_spec,
        num_classes=2,
        embedding_size=16,
        hidden_layers=(8, 4),
        final_activation='sigmoid',
        dropout=0.3)
    first_batch = next(iter(loader))
    net(first_batch)