def test_with_modalities(self):
    data = Reader().read("./tests/data.txt")
    sentiment_data = Reader().read("./tests/sentiment_data.txt",
                                   fmt="UITup", sep=",", tup_sep=":")
    bm = BaseMethod.from_splits(train_data=data[:-1], test_data=data[-1:])

    self.assertIsNone(bm.user_text)
    self.assertIsNone(bm.item_text)
    self.assertIsNone(bm.user_image)
    self.assertIsNone(bm.item_image)
    self.assertIsNone(bm.user_graph)
    self.assertIsNone(bm.item_graph)
    self.assertIsNone(bm.sentiment)

    bm.user_text = TextModality()
    bm.item_image = ImageModality()
    bm.sentiment = SentimentModality(data=sentiment_data)
    bm._build_modalities()

    # assigning a modality of the wrong type should raise ValueError
    with self.assertRaises(ValueError):
        bm.user_text = ImageModality()
    with self.assertRaises(ValueError):
        bm.item_text = ImageModality()
    with self.assertRaises(ValueError):
        bm.user_image = TextModality()
    with self.assertRaises(ValueError):
        bm.item_image = TextModality()
    with self.assertRaises(ValueError):
        bm.user_graph = TextModality()
    with self.assertRaises(ValueError):
        bm.item_graph = ImageModality()
    with self.assertRaises(ValueError):
        bm.sentiment = TextModality()
    with self.assertRaises(ValueError):
        bm.sentiment = ImageModality()
def test_with_modalities(self):
    bm = BaseMethod()

    self.assertIsNone(bm.user_text)
    self.assertIsNone(bm.item_text)
    self.assertIsNone(bm.user_image)
    self.assertIsNone(bm.item_image)
    self.assertIsNone(bm.user_graph)
    self.assertIsNone(bm.item_graph)

    bm.user_text = TextModality()
    bm.item_image = ImageModality()
    bm._build_modalities()

    # assigning a modality of the wrong type should raise ValueError
    with self.assertRaises(ValueError):
        bm.user_text = ImageModality()
    with self.assertRaises(ValueError):
        bm.item_text = ImageModality()
    with self.assertRaises(ValueError):
        bm.user_image = TextModality()
    with self.assertRaises(ValueError):
        bm.item_image = TextModality()
    with self.assertRaises(ValueError):
        bm.user_graph = TextModality()
    with self.assertRaises(ValueError):
        bm.item_graph = ImageModality()
def test_tfidf_params(self):
    corpus = ['a b c', 'b c d d', 'c b e c f']
    ids = ['u1', 'u2', 'u3']

    # default TF-IDF settings should reproduce the setUp modality
    modality = TextModality(
        corpus=corpus, ids=ids, max_vocab=6,
        tfidf_params={
            'binary': False,
            'norm': 'l2',
            'use_idf': True,
            'smooth_idf': True,
            'sublinear_tf': False,
        },
    ).build({'u1': 0, 'u2': 1, 'u3': 2})
    npt.assert_array_equal(modality.batch_tfidf([1]),
                           self.modality.batch_tfidf([1]))

    # flipping any single TF-IDF parameter should change the output
    for k, v in {
        'binary': True,
        'norm': 'l1',
        'use_idf': False,
        'smooth_idf': False,
        'sublinear_tf': True,
    }.items():
        modality = TextModality(corpus=corpus, ids=ids, max_vocab=6,
                                tfidf_params={k: v})
        modality.build({'u1': 0, 'u2': 1, 'u3': 2})
        self.assertFalse(np.array_equal(modality.batch_tfidf([1]),
                                        self.modality.batch_tfidf([1])))
"""Example for HFT with Movilen 1m dataset """ import cornac from cornac.data import Reader from cornac.datasets import movielens from cornac.eval_methods import RatioSplit from cornac.data import TextModality from cornac.data.text import BaseTokenizer plots, movie_ids = movielens.load_plot() ml_1m = movielens.load_1m(reader=Reader(item_set=movie_ids)) # build text module item_text_modality = TextModality(corpus=plots, ids=movie_ids, tokenizer=BaseTokenizer( sep='\t', stop_words='english'), max_vocab=5000, max_doc_freq=0.5) ratio_split = RatioSplit(data=ml_1m, test_size=0.2, exclude_unknowns=True, item_text=item_text_modality, verbose=True, seed=123) hft = cornac.models.HFT(k=10, max_iter=40, grad_iter=5, l2_reg=0.001, lambda_text=0.01,
"""Example for Collaborative Deep Ranking""" import cornac from cornac.data import Reader from cornac.datasets import citeulike from cornac.eval_methods import RatioSplit from cornac.data import TextModality from cornac.data.text import BaseTokenizer docs, item_ids = citeulike.load_text() data = citeulike.load_data(reader=Reader(item_set=item_ids)) # build text module item_text_modality = TextModality( corpus=docs, ids=item_ids, tokenizer=BaseTokenizer(stop_words='english'), max_vocab=8000, max_doc_freq=0.5) ratio_split = RatioSplit(data=data, test_size=0.2, exclude_unknowns=True, item_text=item_text_modality, verbose=True, seed=123, rating_threshold=0.5) cdr = cornac.models.CDR(k=50, autoencoder_structure=[200], max_iter=100, batch_size=128,
class TestTextModality(unittest.TestCase):

    def setUp(self):
        self.tokens = ['a', 'b', 'c', 'd', 'e', 'f']
        corpus = ['a b c', 'b c d d', 'c b e c f']
        ids = ['u1', 'u2', 'u3']
        # frequency ranking: c > b > d > a > e > f
        self.modality = TextModality(corpus=corpus, ids=ids, max_vocab=6)
        self.modality.build({'u1': 0, 'u2': 1, 'u3': 2})
        self.token_ids = (self.modality.vocab.tok2idx[tok] for tok in self.tokens)

    def test_init(self):
        self.assertCountEqual(self.modality.vocab.idx2tok,
                              SPECIAL_TOKENS + self.tokens)

    def test_build(self):
        TextModality().build()
        TextModality(corpus=['abc']).build()
        TextModality(corpus=['abc']).build({'b': 0})
        TextModality(corpus=['abc'], ids=['a']).build({'b': 0})

    def test_sequences(self):
        (a, b, c, d, e, f) = self.token_ids
        self.assertListEqual(self.modality.sequences,
                             [[a, b, c], [b, c, d, d], [c, b, e, c, f]])

    def test_batch_seq(self):
        (a, b, c, d, e, f) = self.token_ids

        batch_seqs = self.modality.batch_seq([2, 1])
        self.assertEqual((2, 5), batch_seqs.shape)
        npt.assert_array_equal(batch_seqs,
                               np.asarray([[c, b, e, c, f], [b, c, d, d, 0]]))

        batch_seqs = self.modality.batch_seq([0, 2], max_length=4)
        self.assertEqual((2, 4), batch_seqs.shape)
        npt.assert_array_equal(batch_seqs,
                               np.asarray([[a, b, c, 0], [c, b, e, c]]))

        # batch_seq is unavailable once sequences are gone
        self.modality.sequences = None
        with self.assertRaises(ValueError):
            self.modality.batch_seq([0])

    def test_count_matrix(self):
        (a, b, c, d, e, f) = self.token_ids
        shift = len(SPECIAL_TOKENS)

        expected_counts = np.zeros_like(self.modality.count_matrix.A)
        expected_counts[0, a - shift] = 1
        expected_counts[0, b - shift] = 1
        expected_counts[0, c - shift] = 1
        expected_counts[1, b - shift] = 1
        expected_counts[1, c - shift] = 1
        expected_counts[1, d - shift] = 2
        expected_counts[2, b - shift] = 1
        expected_counts[2, c - shift] = 2
        expected_counts[2, e - shift] = 1
        expected_counts[2, f - shift] = 1
        npt.assert_array_equal(self.modality.count_matrix.A, expected_counts)

    def test_batch_bow(self):
        (a, b, c, d, e, f) = self.token_ids
        shift = len(SPECIAL_TOKENS)

        batch_bows = self.modality.batch_bow([2, 1])
        self.assertEqual((2, self.modality.max_vocab), batch_bows.shape)
        expected_bows = np.zeros_like(batch_bows)
        expected_bows[0, b - shift] = 1
        expected_bows[0, c - shift] = 2
        expected_bows[0, e - shift] = 1
        expected_bows[0, f - shift] = 1
        expected_bows[1, b - shift] = 1
        expected_bows[1, c - shift] = 1
        expected_bows[1, d - shift] = 2
        npt.assert_array_equal(batch_bows, expected_bows)

        batch_bows = self.modality.batch_bow([0, 2], binary=True, keep_sparse=True)
        self.assertEqual((2, 6), batch_bows.shape)
        expected_bows = np.zeros_like(batch_bows.A)
        expected_bows[0, np.asarray([a, b, c]) - shift] = 1
        expected_bows[1, np.asarray([b, c, e, f]) - shift] = 1
        npt.assert_array_equal(batch_bows.A, expected_bows)

        # batch_bow is unavailable once the count matrix is gone
        self.modality.count_matrix = None
        with self.assertRaises(ValueError):
            self.modality.batch_bow([0])

    def test_batch_bow_fallback(self):
        modality = TextModality(features=np.asarray([[3, 2, 1], [4, 5, 6]]),
                                ids=['a', 'b'])
        modality.build()
        npt.assert_array_equal(np.asarray([[3, 2, 1]]),
                               modality.batch_bow(batch_ids=[0]))
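# A minimal standalone sketch (not part of the test module above) showing the
# TextModality calls that the tests exercise; it reuses the same corpus, ids,
# and id_map as setUp(), and only the methods already called by the tests.
from cornac.data import TextModality

corpus = ['a b c', 'b c d d', 'c b e c f']
modality = TextModality(corpus=corpus, ids=['u1', 'u2', 'u3'], max_vocab=6)
modality.build({'u1': 0, 'u2': 1, 'u3': 2})

print(modality.batch_seq([0, 2], max_length=4))  # padded token-id sequences
print(modality.batch_bow([1]))                   # bag-of-words count vectors
print(modality.batch_tfidf([1]))                 # TF-IDF weighted vectors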
import cornac
from cornac.data import Reader
from cornac.datasets import movielens
from cornac.eval_methods import RatioSplit
from cornac.data import TextModality
from cornac.data.text import BaseTokenizer

# ConvMF extends matrix factorization to leverage item textual information
# The necessary data can be loaded as follows
plots, movie_ids = movielens.load_plot()
ml_1m = movielens.load_feedback(variant="1M", reader=Reader(item_set=movie_ids))

# Instantiate a TextModality to conveniently work with text auxiliary information
# For more details, please refer to the tutorial on how to work with auxiliary data
item_text_modality = TextModality(
    corpus=plots,
    ids=movie_ids,
    tokenizer=BaseTokenizer(sep="\t", stop_words="english"),
    max_vocab=8000,
    max_doc_freq=0.5,
)

# Define an evaluation method to split feedback into train and test sets
ratio_split = RatioSplit(
    data=ml_1m,
    test_size=0.2,
    exclude_unknowns=True,
    item_text=item_text_modality,
    verbose=True,
    seed=123,
)

# Instantiate ConvMF model
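# A minimal sketch of how the example might continue; the ConvMF
# hyperparameters and the metric below are illustrative assumptions, not
# values taken from the original script.
convmf = cornac.models.ConvMF(k=50, n_epochs=5, verbose=True, seed=123)

# Put everything together into an experiment and run it
cornac.Experiment(
    eval_method=ratio_split,
    models=[convmf],
    metrics=[cornac.metrics.RMSE()],
).run()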