def read_data(path, fmt="UIR", reader=None):
    """Read rating data from a file.

    Fixes two defects in the original: `fmt` was validated but never
    passed to the reader (hardcoded "UIR"), and `reader` was always a
    freshly created Reader because of a dead `reader = None` assignment.

    Parameters
    ----------
    path: str
        Path to the data file.

    fmt: str, optional, default: 'UIR'
        Data format, one of ['UIR', 'UIRT'].

    reader: `obj:cornac.data.Reader`, optional, default: None
        Reader object used to read the data. A default Reader is
        created if None.

    Returns
    -------
    data: array-like
        Data in the form of a list of tuples depending on the given format.
    """
    fmt = validate_format(fmt, ["UIR", "UIRT"])
    reader = Reader() if reader is None else reader
    return reader.read(path, fmt)
def test_with_modalities(self):
    """Modalities attached to a BaseMethod must be type-checked.

    The original `try/except ValueError: assert True` blocks passed
    silently even when no exception was raised; `assertRaises` makes
    the expectation enforced.
    """
    data = Reader().read("./tests/data.txt")
    sentiment_data = Reader().read(
        "./tests/sentiment_data.txt", fmt="UITup", sep=",", tup_sep=":"
    )
    bm = BaseMethod.from_splits(train_data=data[:-1], test_data=data[-1:])

    # every modality slot starts out unset
    for attr in ("user_text", "item_text", "user_image", "item_image",
                 "user_graph", "item_graph", "sentiment"):
        self.assertIsNone(getattr(bm, attr))

    bm.user_text = TextModality()
    bm.item_image = ImageModality()
    bm.sentiment = SentimentModality(data=sentiment_data)
    bm._build_modalities()

    # assigning a modality of the wrong type must raise ValueError
    wrong_assignments = [
        ("user_text", ImageModality()),
        ("item_text", ImageModality()),
        ("user_image", TextModality()),
        ("item_image", TextModality()),
        ("user_graph", TextModality()),
        ("item_graph", ImageModality()),
        ("sentiment", TextModality()),
        ("sentiment", ImageModality()),
    ]
    for attr, modality in wrong_assignments:
        with self.assertRaises(ValueError):
            setattr(bm, attr, modality)
def load_feedback(variant="closed_loop", reader=None):
    """Load the user-item ratings of one of the Yahoo Music datasets

    Parameters
    ----------
    variant: str, optional, default: 'closed_loop'
        Specifies which Yahoo Music dataset to load, one of
        ['closed_loop', 'open_loop'].

    reader: `obj:cornac.data.Reader`, optional, default: None
        Reader object used to read the data.

    Returns
    -------
    data: array-like
        Data in the form of a list of tuples depending on the given data format.
    """
    dataset = YAHOO_DATASETS.get(variant.upper(), None)
    if dataset is None:
        raise ValueError("variant must be one of {}.".format(YAHOO_DATASETS.keys()))

    # download (and unzip if needed) into the local cache, then read
    fpath = cache(url=dataset.url, unzip=dataset.unzip, relative_path=dataset.path)
    if reader is None:
        reader = Reader()
    return reader.read(fpath, 'UIR', sep=dataset.sep, skip_lines=dataset.skip)
class TestReader(unittest.TestCase):
    """Tests for cornac.data.Reader reading and filtering behaviour."""

    def setUp(self):
        self.data_file = './tests/data.txt'
        self.reader = Reader()

    def test_raise(self):
        # an unsupported format must raise ValueError; the original
        # try/except passed silently even when nothing was raised
        with self.assertRaises(ValueError):
            self.reader.read(self.data_file, fmt='bla bla')

    def test_read_ui(self):
        triplets = self.reader.read(self.data_file, fmt='UI')
        self.assertEqual(len(triplets), 30)
        self.assertEqual(triplets[0][1], '93')
        self.assertEqual(triplets[1][2], 1.0)
        # with id_inline, item ids come from the line itself
        triplets = self.reader.read(self.data_file, fmt='UI', id_inline=True)
        self.assertEqual(len(triplets), 40)

    def test_read_uir(self):
        triplet_data = self.reader.read(self.data_file)
        self.assertEqual(len(triplet_data), 10)
        self.assertEqual(triplet_data[4][2], 3)
        self.assertEqual(triplet_data[6][1], '478')
        self.assertEqual(triplet_data[8][0], '543')

    def test_filter(self):
        reader = Reader(bin_threshold=4.0)
        self.assertEqual(len(reader.read(self.data_file)), 8)
        reader = Reader(min_user_freq=2)
        self.assertEqual(len(reader.read(self.data_file)), 0)
        reader = Reader(min_item_freq=2)
        self.assertEqual(len(reader.read(self.data_file)), 0)
        reader = Reader(user_set=['76'], item_set=['93'])
        self.assertEqual(len(reader.read(self.data_file)), 1)
        reader = Reader(user_set=['76', '768'])
        self.assertEqual(len(reader.read(self.data_file)), 2)
        reader = Reader(item_set=['93', '257', '795'])
        self.assertEqual(len(reader.read(self.data_file)), 3)

    def test_read_text(self):
        self.assertEqual(len(read_text(self.data_file, sep=None)), 10)
        self.assertEqual(read_text(self.data_file, sep='\t')[1][0], '76')
def test_build(self):
    """SentimentModality.build wires up id maps and sentiment statistics.

    The trailing `try/except ValueError: assert True` passed silently
    when no exception was raised; replaced with `assertRaises`.
    """
    data = Reader().read('./tests/sentiment_data.txt', fmt='UITup', sep=',', tup_sep=':')
    md = SentimentModality(data=data)

    # build raw-id -> index maps in first-seen order
    uid_map = OrderedDict()
    iid_map = OrderedDict()
    for raw_uid, raw_iid, _ in data:
        uid_map.setdefault(raw_uid, len(uid_map))
        iid_map.setdefault(raw_iid, len(iid_map))

    # interaction indicator matrix over the mapped ids
    matrix = dok_matrix((len(uid_map), len(iid_map)), dtype=np.float32)
    for raw_uid, raw_iid, _ in data:
        matrix[uid_map[raw_uid], iid_map[raw_iid]] = 1

    md.build(uid_map=uid_map, iid_map=iid_map, dok_matrix=matrix)

    self.assertEqual(md.num_aspects, 3)
    self.assertEqual(md.num_opinions, 2)
    self.assertEqual(len(md.sentiment), 4)
    self.assertEqual(len(md.user_sentiment), 3)
    self.assertEqual(len(md.item_sentiment), 3)
    self.assertEqual(len(md.aspect_id_map), 3)
    self.assertEqual(len(md.opinion_id_map), 2)

    # building without data must raise ValueError
    with self.assertRaises(ValueError):
        SentimentModality().build()
def test_get_node_degree(self):
    """Degrees reported by get_node_degree match the fixture graph."""
    edges = Reader().read('./tests/graph_data.txt', sep=' ')
    gmd = GraphModality(data=edges)

    # map raw source ids to indices in first-seen order
    id_map = OrderedDict()
    for src, _, _ in edges:
        id_map.setdefault(src, len(id_map))
    gmd.build(id_map=id_map)

    # full graph: per-node degree pairs (first/second entry as returned)
    degree = gmd.get_node_degree()
    for node, expected_pair in {0: (4, 1), 1: (2, 1), 5: (0, 1)}.items():
        self.assertEqual(degree.get(node)[0], expected_pair[0])
        self.assertEqual(degree.get(node)[1], expected_pair[1])

    # restricted to the sub-graph induced by nodes {0, 1}
    degree = gmd.get_node_degree([0, 1], [0, 1])
    self.assertEqual(degree.get(0)[0], 1)
    self.assertEqual(degree.get(0)[1], 0)
    self.assertEqual(degree.get(1)[0], 0)
    self.assertEqual(degree.get(1)[1], 1)
def test_testset_none(self):
    """Evaluating without a test set must raise ValueError.

    The original try/except passed silently even when nothing was
    raised; `assertRaises` enforces the expectation.
    """
    bm = BaseMethod(None, verbose=True)
    bm.train_set = Dataset.from_uir(data=Reader().read("./tests/data.txt"))
    with self.assertRaises(ValueError):
        bm.evaluate(None, {}, False)
def test_from_splits(self):
    """from_splits validates its inputs and counts users/items.

    Replaces silent-pass `try/except ValueError: assert True` blocks
    with `assertRaises`.
    """
    data = Reader().read("./tests/data.txt")

    # missing train/test data must raise ValueError
    with self.assertRaises(ValueError):
        BaseMethod.from_splits(train_data=None, test_data=None)
    with self.assertRaises(ValueError):
        BaseMethod.from_splits(train_data=data, test_data=None)
    # empty test data with exclude_unknowns must raise ValueError
    with self.assertRaises(ValueError):
        BaseMethod.from_splits(train_data=data, test_data=[], exclude_unknowns=True)

    bm = BaseMethod.from_splits(train_data=data[:-1], test_data=data[-1:])
    self.assertEqual(bm.total_users, 10)
    self.assertEqual(bm.total_items, 10)

    # validation data is accepted as well
    bm = BaseMethod.from_splits(
        train_data=data[:-1],
        test_data=data[-1:],
        val_data=[(data[0][0], data[1][1], 5.0)],
        verbose=True,
    )
    self.assertEqual(bm.total_users, 10)
    self.assertEqual(bm.total_items, 10)
def test_testset():
    """Test TestSet"""
    data_file = './tests/data.txt'
    u_col, i_col, r_col, sep = 0, 1, 2, '\t'
    triplet_data = Reader.read_uir_triplets(
        data_file, u_col, i_col, r_col, sep, skip_lines=0)

    test_set = TestSet.from_uir_triplets(
        triplet_data, pre_uid_map={}, pre_iid_map={}, pre_ui_set=set())
    assert test_set.get_uid('768') == 1
    assert test_set.get_iid('195') == 7
    assert all(a == b for a, b in zip(test_set.get_users(), range(10)))
    assert all(a == b for a, b in zip(test_set.get_ratings(2), [(2, 4)]))

    # a (user, item) pair already seen before is excluded from the test set
    test_set = TestSet.from_uir_triplets(
        triplet_data, pre_uid_map={}, pre_iid_map={},
        pre_ui_set=set([('76', '93')]), verbose=True)
    assert len(test_set.get_users()) == 9
def test_data_fmt(self):
    """Chrono splitting with UIR (timestamp-less) data must fail.

    Two fixes over the original: `assertRaises` replaces the
    silent-pass try/except, and the `Reader().read` call is moved
    out of the guarded region so a read failure cannot masquerade
    as the expected ValueError.
    """
    with self.assertRaises(ValueError):
        StratifiedSplit(self.data, fmt="UIR", chrono=True, verbose=True)

    data = Reader().read("./tests/data.txt", fmt="UIR", sep="\t")
    with self.assertRaises(ValueError):
        StratifiedSplit(data, chrono=True, verbose=True)
def test_txt_to_triplets():
    """Test txt_to_triplets function"""
    data_file = './tests/data.txt'
    u_col = 0
    i_col = 1
    r_col = 2
    sep = '\t'
    triplet_data = Reader.read_uir_triplets(data_file, u_col, i_col, r_col,
                                            sep, skip_lines=0)
    assert len(triplet_data) == 10
    assert triplet_data[4][2] == 3
    assert triplet_data[6][1] == '478'
    assert triplet_data[8][0] == '543'

    # an out-of-range column index must raise IndexError; the original
    # try/except passed silently even when nothing was raised
    try:
        Reader.read_uir_triplets(data_file, 10)
    except IndexError:
        pass
    else:
        raise AssertionError('expected IndexError')
def test_from_provided():
    """from_provided rejects missing data and counts users/items.

    The silent-pass try/except blocks are replaced with an explicit
    failure when the expected ValueError is not raised.
    """
    data_file = './tests/data.txt'
    data = Reader.read_uir_triplets(data_file)

    try:
        BaseMethod.from_provided(train_data=None, test_data=None)
    except ValueError:
        pass
    else:
        raise AssertionError('expected ValueError')

    try:
        BaseMethod.from_provided(train_data=data, test_data=None)
    except ValueError:
        pass
    else:
        raise AssertionError('expected ValueError')

    bm = BaseMethod.from_provided(train_data=data, test_data=data)
    assert bm.total_users == 10
    assert bm.total_items == 10
def test_get_train_triplet(self):
    """get_train_triplet filters edges by the requested row/column ids."""
    edges = Reader().read('./tests/graph_data.txt', sep=' ')
    gmd = GraphModule(data=edges)

    id_map = OrderedDict()
    for src, _, _ in edges:
        id_map.setdefault(src, len(id_map))
    gmd.build(id_map=id_map)

    # (row ids, col ids, expected number of surviving triplets)
    cases = [([0, 1, 2], [0, 1, 2], 3),
             ([0, 2], [0, 1], 1)]
    for rows, cols, expected in cases:
        rid, cid, val = gmd.get_train_triplet(rows, cols)
        self.assertEqual(len(rid), expected)
        self.assertEqual(len(cid), expected)
        self.assertEqual(len(val), expected)
def test_init(self):
    """TestSet.from_uir maps raw ids and exposes per-user ratings."""
    triplets = Reader().read('./tests/data.txt')

    test_set = TestSet.from_uir(triplets,
                                global_uid_map={},
                                global_iid_map={},
                                global_ui_set=set())
    self.assertEqual(test_set.get_uid('768'), 1)
    self.assertEqual(test_set.get_iid('195'), 7)
    self.assertSequenceEqual(test_set.users, range(10))
    self.assertListEqual(test_set.get_ratings(2), [(2, 4)])

    # a pair already present in the global set is excluded
    test_set = TestSet.from_uir(triplets,
                                global_uid_map=OrderedDict(),
                                global_iid_map=OrderedDict(),
                                global_ui_set={('76', '93')},
                                verbose=True)
    self.assertEqual(len(test_set.users), 9)
def test_matrix_trainset_uir_iter():
    """uir_iter yields users, items, and ratings in index order."""
    triplets = Reader.read_uir_triplets('./tests/data.txt')
    train_set = MatrixTrainSet.from_uir_triplets(triplets,
                                                 pre_uid_map={},
                                                 pre_iid_map={},
                                                 pre_ui_set=set(),
                                                 verbose=True)

    users = [u for u, _, _ in train_set.uir_iter()]
    assert all(a == b for a, b in zip(users, range(10)))

    items = [i for _, i, _ in train_set.uir_iter()]
    assert all(a == b for a, b in zip(items, range(10)))

    ratings = [r for _, _, r in train_set.uir_iter()]
    assert all(a == b for a, b in zip(ratings, [4, 4, 4, 4, 3, 4, 4, 5, 3, 4]))
def test_build(self):
    """GraphModule.build populates id/value arrays and the matrix.

    The trailing `try/except ValueError: assert True` passed silently
    when no exception was raised; replaced with `assertRaises`.
    """
    data = Reader().read('./tests/graph_data.txt', sep=' ')
    gmd = GraphModule(data=data)

    # map raw source ids to indices in first-seen order
    global_iid_map = OrderedDict()
    for raw_iid, raw_jid, val in data:
        global_iid_map.setdefault(raw_iid, len(global_iid_map))
    gmd.build(id_map=global_iid_map)

    self.assertEqual(len(gmd.map_rid), 7)
    self.assertEqual(len(gmd.map_cid), 7)
    self.assertEqual(len(gmd.val), 7)
    self.assertEqual(gmd.matrix.shape, (7, 7))

    # building without data must raise ValueError
    with self.assertRaises(ValueError):
        GraphModule().build()
def test_from_splits(self):
    """from_splits rejects missing data and builds with/without validation.

    Replaces silent-pass `try/except ValueError: assert True` blocks
    with `assertRaises`.
    """
    data = Reader().read('./tests/data.txt')

    with self.assertRaises(ValueError):
        BaseMethod.from_splits(train_data=None, test_data=None)
    with self.assertRaises(ValueError):
        BaseMethod.from_splits(train_data=data, test_data=None)

    bm = BaseMethod.from_splits(train_data=data, test_data=data)
    self.assertEqual(bm.total_users, 10)
    self.assertEqual(bm.total_items, 10)

    bm = BaseMethod.from_splits(train_data=data, test_data=data,
                                val_data=data, verbose=True)
    self.assertEqual(bm.total_users, 10)
    self.assertEqual(bm.total_items, 10)
def test_matrix_trainset_uij_iter():
    """uij_iter yields users with positive items and sampled negatives."""
    triplets = Reader.read_uir_triplets('./tests/data.txt')
    train_set = MatrixTrainSet.from_uir_triplets(triplets,
                                                 pre_uid_map={},
                                                 pre_iid_map={},
                                                 pre_ui_set=set(),
                                                 verbose=True)

    users = [u for u, _, _ in train_set.uij_iter()]
    assert all(a == b for a, b in zip(users, range(10)))

    pos_items = [i for _, i, _ in train_set.uij_iter()]
    assert all(a == b for a, b in zip(pos_items, range(10)))

    # negatives must differ from the positive item at each position
    neg_items = [j for _, _, j in train_set.uij_iter()]
    assert all(a != b for a, b in zip(neg_items, range(10)))
def test_evaluate(self):
    """An end-to-end evaluation runs and its result renders as text."""
    ratings = Reader().read('./tests/data.txt')
    bm = BaseMethod.from_splits(train_data=ratings, test_data=ratings)
    model = MF(k=1, max_iter=0)
    result = bm.evaluate(model, metrics=[MAE()], user_based=False)
    str(result)  # smoke-check: string rendering must not raise
def setUp(self):
    """Load the shared UIR triplet fixture before each test."""
    reader = Reader()
    self.triplet_data = reader.read('./tests/data.txt')
# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ import cornac from cornac.eval_methods import RatioSplit from cornac.datasets import amazon_clothing from cornac.data import Reader # Load the Amazon Clothing dataset, and binarise ratings using cornac.data.Reader feedback = amazon_clothing.load_feedback(reader=Reader(bin_threshold=1.0)) # Define an evaluation method to split feedback into train and test sets ratio_split = RatioSplit(data=feedback, test_size=0.2, rating_threshold=1.0, seed=123, exclude_unknowns=True, verbose=True) # Instantiate the recommender models to be compared gmf = cornac.models.GMF(num_factors=8, num_epochs=10, learner='adam', batch_size=256, lr=0.001,
def setUp(self):
    """Load the fixture data in both UIR and UIRT formats."""
    self.triplet_data = Reader().read('./tests/data.txt')
    self.uirt_data = Reader().read('./tests/data.txt',
                                   fmt='UIRT')
# distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ """Example for HFT with Movilen 1m dataset """ import cornac from cornac.data import Reader from cornac.datasets import movielens from cornac.eval_methods import RatioSplit from cornac.data import TextModality from cornac.data.text import BaseTokenizer plots, movie_ids = movielens.load_plot() ml_1m = movielens.load_1m(reader=Reader(item_set=movie_ids)) # build text module item_text_modality = TextModality(corpus=plots, ids=movie_ids, tokenizer=BaseTokenizer( sep='\t', stop_words='english'), max_vocab=5000, max_doc_freq=0.5) ratio_split = RatioSplit(data=ml_1m, test_size=0.2, exclude_unknowns=True, item_text=item_text_modality, verbose=True, seed=123)
def setUp(self):
    """Build a RatioSplit evaluation method over the fixture data."""
    feedback = Reader().read("./tests/data.txt")
    self.eval_method = RatioSplit(feedback,
                                  test_size=0.2,
                                  val_size=0.2,
                                  exclude_unknowns=False)
# See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ """Example for Collaborative Deep Ranking (CDR)""" import cornac from cornac.data import Reader from cornac.datasets import citeulike from cornac.eval_methods import RatioSplit from cornac.data import TextModality from cornac.data.text import BaseTokenizer # CDR composes an autoencoder with a ranking collaborative model to represent item texts and user-item interactions # The necessary data can be loaded as follows docs, item_ids = citeulike.load_text() feedback = citeulike.load_feedback(reader=Reader(item_set=item_ids)) # Instantiate a TextModality, it makes it convenient to work with text auxiliary information # For more details, please refer to the tutorial on how to work with auxiliary data item_text_modality = TextModality( corpus=docs, ids=item_ids, tokenizer=BaseTokenizer(stop_words="english"), max_vocab=8000, max_doc_freq=0.5, ) # Define an evaluation method to split feedback into train and test sets ratio_split = RatioSplit( data=feedback, test_size=0.2,
def setUp(self):
    """Prepare a 5-fold CrossValidation over the fixture data."""
    self.data = Reader().read('./tests/data.txt')
    self.n_folds = 5
    self.cv = CrossValidation(data=self.data,
                              n_folds=self.n_folds,
                              exclude_unknowns=False)
def test_filter(self):
    """Reader filtering options prune ratings as configured."""
    # binarisation at threshold 4.0 keeps 8 ratings, all mapped to 1
    binarised = Reader(bin_threshold=4.0).read(self.data_file)
    self.assertEqual(len(binarised), 8)
    self.assertListEqual([rating for _, _, rating in binarised],
                         [1] * len(binarised))

    # frequency filters: nothing in the fixture occurs twice
    self.assertEqual(len(Reader(min_user_freq=2).read(self.data_file)), 0)
    self.assertEqual(len(Reader(min_item_freq=2).read(self.data_file)), 0)

    # id-set filters keep only the requested users/items
    self.assertEqual(
        len(Reader(user_set=['76'], item_set=['93']).read(self.data_file)), 1)
    self.assertEqual(
        len(Reader(user_set=['76', '768']).read(self.data_file)), 2)
    self.assertEqual(
        len(Reader(item_set=['93', '257', '795']).read(self.data_file)), 3)
# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ """Example for Bayesian Personalized Ranking with Netflix dataset""" import cornac from cornac.data import Reader from cornac.datasets import netflix from cornac.eval_methods import RatioSplit ratio_split = RatioSplit( data=netflix.load_data_small(reader=Reader(bin_threshold=1.0)), test_size=0.1, rating_threshold=1.0, exclude_unknowns=True, verbose=True) most_pop = cornac.models.MostPop() bpr = cornac.models.BPR(k=10, max_iter=100, learning_rate=0.001, lambda_reg=0.01, verbose=True) auc = cornac.metrics.AUC() rec_20 = cornac.metrics.Recall(k=20)
# -*- coding: utf-8 -*- """ Example for Collaborative Deep Ranking @author: Tran Thanh Binh """ import cornac from cornac.data import Reader from cornac.datasets import citeulike from cornac.eval_methods import RatioSplit from cornac.data import TextModule from cornac.data.text import BaseTokenizer docs, item_ids = citeulike.load_text() data = citeulike.load_data(reader=Reader(item_set=item_ids)) # build text module item_text_module = TextModule(corpus=docs, ids=item_ids, tokenizer=BaseTokenizer('\t'), max_vocab=8000, max_doc_freq=0.5, stop_words='english') ratio_split = RatioSplit(data=data, test_size=0.2, exclude_unknowns=True, item_text=item_text_module, verbose=True, seed=123,
# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ """Example for Social Bayesian Personalized Ranking with Epinions dataset""" import cornac from cornac.data import Reader, GraphModule from cornac.datasets import epinions from cornac.eval_methods import RatioSplit ratio_split = RatioSplit(data=epinions.load_data(Reader(bin_threshold=4.0)), test_size=0.1, rating_threshold=0.5, exclude_unknowns=True, verbose=True, user_graph=GraphModule(data=epinions.load_trust())) sbpr = cornac.models.SBPR(k=10, max_iter=50, learning_rate=0.001, lambda_u=0.015, lambda_v=0.025, lambda_b=0.01, verbose=True) rec_10 = cornac.metrics.Recall(k=10)