Exemplo n.º 1
0
def read_data(path, fmt="UIR", reader=None):
    """Read rating data from a file.

    Parameters
    ----------
    path: str
        Path to the data file.

    fmt: str, optional, default: 'UIR'
        Data format, one of ['UIR', 'UIRT'].

    reader: `obj:cornac.data.Reader`, optional, default: None
        Reader object used to parse the file. A default Reader is
        created when None.

    Returns
    -------
    data: array-like
        Data in the form of a list of tuples matching the given format.
    """
    # Validate the requested format instead of hard-coding 'UIR', and
    # actually forward it to the reader (the old code validated a constant
    # and then ignored the result).
    fmt = validate_format(fmt, ["UIR", "UIRT"])
    reader = Reader() if reader is None else reader
    return reader.read(path, fmt=fmt)
Exemplo n.º 2
0
    def test_with_modalities(self):
        """Attach modalities to a BaseMethod and reject mismatched types."""
        data = Reader().read("./tests/data.txt")
        sentiment_data = Reader().read("./tests/sentiment_data.txt",
                                       fmt="UITup",
                                       sep=",",
                                       tup_sep=":")
        bm = BaseMethod.from_splits(train_data=data[:-1], test_data=data[-1:])

        # All modality slots start out empty.
        for attr in ("user_text", "item_text", "user_image", "item_image",
                     "user_graph", "item_graph", "sentiment"):
            self.assertIsNone(getattr(bm, attr))

        bm.user_text = TextModality()
        bm.item_image = ImageModality()
        bm.sentiment = SentimentModality(data=sentiment_data)
        bm._build_modalities()

        # Assigning a modality of the wrong type must raise ValueError.
        # assertRaises fails the test if no exception occurs, unlike the
        # previous bare try/except blocks which passed silently.
        mismatched = [
            ("user_text", ImageModality),
            ("item_text", ImageModality),
            ("user_image", TextModality),
            ("item_image", TextModality),
            ("user_graph", TextModality),
            ("item_graph", ImageModality),
            ("sentiment", TextModality),
            ("sentiment", ImageModality),
        ]
        for attr, wrong_cls in mismatched:
            with self.assertRaises(ValueError):
                setattr(bm, attr, wrong_cls())
Exemplo n.º 3
0
def load_feedback(variant="closed_loop", reader=None):
    """Load the user-item ratings of one of the Yahoo Music datasets

    Parameters
    ----------
    variant: str, optional, default: 'closed_loop'
        Specifies which Yahoo Music dataset to load, one of ['closed_loop', 'open_loop'].

    reader: `obj:cornac.data.Reader`, optional, default: None
        Reader object used to read the data.

    Returns
    -------
    data: array-like
        Data in the form of a list of tuples depending on the given data format.

    Raises
    ------
    ValueError
        If ``variant`` does not name a known dataset.
    """

    yah = YAHOO_DATASETS.get(variant.upper(), None)
    if yah is None:
        # list() so the message shows a plain list of names instead of the
        # dict_keys(...) repr.
        raise ValueError("variant must be one of {}.".format(
            list(YAHOO_DATASETS.keys())))

    # Download (and cache) the archive, then parse it as UIR triplets using
    # the per-dataset separator and header-skip settings.
    fpath = cache(url=yah.url, unzip=yah.unzip, relative_path=yah.path)
    reader = Reader() if reader is None else reader
    return reader.read(fpath, 'UIR', sep=yah.sep, skip_lines=yah.skip)
Exemplo n.º 4
0
class TestReader(unittest.TestCase):
    """Unit tests for Reader parsing, format handling, and filtering."""

    def setUp(self):
        self.data_file = './tests/data.txt'
        self.reader = Reader()

    def test_raise(self):
        # An unknown format must be rejected. assertRaises fails the test
        # when no exception occurs, unlike the previous bare try/except
        # which passed silently in that case.
        with self.assertRaises(ValueError):
            self.reader.read(self.data_file, fmt='bla bla')

    def test_read_ui(self):
        triplets = self.reader.read(self.data_file, fmt='UI')
        self.assertEqual(len(triplets), 30)
        self.assertEqual(triplets[0][1], '93')
        self.assertEqual(triplets[1][2], 1.0)

        # id_inline=True yields more tuples from the same file.
        triplets = self.reader.read(self.data_file, fmt='UI', id_inline=True)
        self.assertEqual(len(triplets), 40)

    def test_read_uir(self):
        triplet_data = self.reader.read(self.data_file)

        self.assertEqual(len(triplet_data), 10)
        self.assertEqual(triplet_data[4][2], 3)
        self.assertEqual(triplet_data[6][1], '478')
        self.assertEqual(triplet_data[8][0], '543')

    def test_filter(self):
        reader = Reader(bin_threshold=4.0)
        self.assertEqual(len(reader.read(self.data_file)), 8)

        reader = Reader(min_user_freq=2)
        self.assertEqual(len(reader.read(self.data_file)), 0)

        reader = Reader(min_item_freq=2)
        self.assertEqual(len(reader.read(self.data_file)), 0)

        reader = Reader(user_set=['76'], item_set=['93'])
        self.assertEqual(len(reader.read(self.data_file)), 1)

        reader = Reader(user_set=['76', '768'])
        self.assertEqual(len(reader.read(self.data_file)), 2)

        reader = Reader(item_set=['93', '257', '795'])
        self.assertEqual(len(reader.read(self.data_file)), 3)

    def test_read_text(self):
        self.assertEqual(len(read_text(self.data_file, sep=None)), 10)
        self.assertEqual(read_text(self.data_file, sep='\t')[1][0], '76')
Exemplo n.º 5
0
    def test_build(self):
        """Build a SentimentModality and verify its aspect/opinion maps."""
        data = Reader().read('./tests/sentiment_data.txt', fmt='UITup', sep=',', tup_sep=':')
        md = SentimentModality(data=data)

        # Assign dense consecutive indices to raw user/item ids.
        uid_map = OrderedDict()
        iid_map = OrderedDict()
        for raw_uid, raw_iid, _ in data:
            uid_map.setdefault(raw_uid, len(uid_map))
            iid_map.setdefault(raw_iid, len(iid_map))

        # Interaction indicator matrix (1 where a user rated an item).
        matrix = dok_matrix((len(uid_map), len(iid_map)), dtype=np.float32)
        for raw_uid, raw_iid, _ in data:
            matrix[uid_map[raw_uid], iid_map[raw_iid]] = 1

        md.build(uid_map=uid_map, iid_map=iid_map, dok_matrix=matrix)

        self.assertEqual(md.num_aspects, 3)
        self.assertEqual(md.num_opinions, 2)
        self.assertEqual(len(md.sentiment), 4)
        self.assertEqual(len(md.user_sentiment), 3)
        self.assertEqual(len(md.item_sentiment), 3)
        self.assertEqual(len(md.aspect_id_map), 3)
        self.assertEqual(len(md.opinion_id_map), 2)

        # Building without data/id maps must raise. assertRaises fails the
        # test when no exception occurs, unlike the previous bare
        # try/except which passed silently.
        with self.assertRaises(ValueError):
            SentimentModality().build()
Exemplo n.º 6
0
    def test_get_node_degree(self):
        """Node degrees over the full graph and over a restricted node set."""
        edges = Reader().read('./tests/graph_data.txt', sep=' ')
        graph = GraphModality(data=edges)

        # Index source nodes in first-seen order.
        id_map = OrderedDict()
        for src, _, _ in edges:
            id_map.setdefault(src, len(id_map))

        graph.build(id_map=id_map)

        # Full-graph degrees: node -> (first component, second component).
        degree = graph.get_node_degree()
        for node, (d0, d1) in {0: (4, 1), 1: (2, 1), 5: (0, 1)}.items():
            self.assertEqual(degree.get(node)[0], d0)
            self.assertEqual(degree.get(node)[1], d1)

        # Degrees restricted to nodes [0, 1] on both axes.
        degree = graph.get_node_degree([0, 1], [0, 1])
        for node, (d0, d1) in {0: (1, 0), 1: (0, 1)}.items():
            self.assertEqual(degree.get(node)[0], d0)
            self.assertEqual(degree.get(node)[1], d1)
Exemplo n.º 7
0
 def test_testset_none(self):
     """evaluate() must reject a missing test set."""
     bm = BaseMethod(None, verbose=True)
     bm.train_set = Dataset.from_uir(data=Reader().read("./tests/data.txt"))
     # assertRaises fails the test when no ValueError occurs, unlike the
     # previous bare try/except which passed silently.
     with self.assertRaises(ValueError):
         bm.evaluate(None, {}, False)
Exemplo n.º 8
0
    def test_from_splits(self):
        """from_splits rejects invalid splits and counts users/items."""
        data = Reader().read("./tests/data.txt")

        # Invalid split configurations must raise ValueError. assertRaises
        # fails the test when no exception occurs, unlike the previous bare
        # try/except blocks which passed silently.
        with self.assertRaises(ValueError):
            BaseMethod.from_splits(train_data=None, test_data=None)
        with self.assertRaises(ValueError):
            BaseMethod.from_splits(train_data=data, test_data=None)
        with self.assertRaises(ValueError):
            BaseMethod.from_splits(train_data=data,
                                   test_data=[],
                                   exclude_unknowns=True)

        bm = BaseMethod.from_splits(train_data=data[:-1], test_data=data[-1:])
        self.assertEqual(bm.total_users, 10)
        self.assertEqual(bm.total_items, 10)

        bm = BaseMethod.from_splits(
            train_data=data[:-1],
            test_data=data[-1:],
            val_data=[(data[0][0], data[1][1], 5.0)],
            verbose=True,
        )
        self.assertEqual(bm.total_users, 10)
        self.assertEqual(bm.total_items, 10)
Exemplo n.º 9
0
def test_testset():
    """Test TestSet"""
    # Column layout of the fixture file: user, item, rating, tab-separated.
    triplets = Reader.read_uir_triplets('./tests/data.txt', 0, 1, 2, '\t',
                                        skip_lines=0)

    test_set = TestSet.from_uir_triplets(triplets,
                                         pre_uid_map={},
                                         pre_iid_map={},
                                         pre_ui_set=set())

    assert test_set.get_uid('768') == 1
    assert test_set.get_iid('195') == 7
    assert all(a == b for a, b in zip(test_set.get_users(), range(10)))
    assert all(a == b for a, b in zip(test_set.get_ratings(2), [(2, 4)]))

    # A (user, item) pair already present in pre_ui_set is excluded.
    test_set = TestSet.from_uir_triplets(triplets,
                                         pre_uid_map={},
                                         pre_iid_map={},
                                         pre_ui_set={('76', '93')},
                                         verbose=True)
    assert len(test_set.get_users()) == 9
Exemplo n.º 10
0
    def test_data_fmt(self):
        """Chronological splitting of plain UIR data must be rejected."""
        # assertRaises fails the test when no ValueError occurs, unlike the
        # previous bare try/except blocks which passed silently.
        with self.assertRaises(ValueError):
            StratifiedSplit(self.data, fmt="UIR", chrono=True, verbose=True)

        # Read outside the assertion so only StratifiedSplit's ValueError —
        # not a reader failure — can satisfy the check.
        data = Reader().read("./tests/data.txt", fmt="UIR", sep="\t")
        with self.assertRaises(ValueError):
            StratifiedSplit(data, chrono=True, verbose=True)
Exemplo n.º 11
0
def test_txt_to_triplets():
    """Test txt_to_triplets function"""

    data_file = './tests/data.txt'
    u_col = 0
    i_col = 1
    r_col = 2
    sep = '\t'

    triplet_data = Reader.read_uir_triplets(data_file, u_col, i_col, r_col,
                                            sep, skip_lines=0)

    assert len(triplet_data) == 10
    assert triplet_data[4][2] == 3
    assert triplet_data[6][1] == '478'
    assert triplet_data[8][0] == '543'

    # An out-of-range column index must raise IndexError. The 'else' clause
    # turns a missing exception into a failure — the previous bare
    # try/except passed silently when nothing was raised.
    try:
        Reader.read_uir_triplets(data_file, 10)
    except IndexError:
        pass
    else:
        raise AssertionError("expected IndexError for out-of-range column")
Exemplo n.º 12
0
def test_from_provided():
    """from_provided rejects missing splits and counts users/items."""
    data_file = './tests/data.txt'
    data = Reader.read_uir_triplets(data_file)

    # Each failure case must raise ValueError. The 'else' clause turns a
    # missing exception into a failure — the previous bare try/except
    # passed silently when nothing was raised.
    try:
        BaseMethod.from_provided(train_data=None, test_data=None)
    except ValueError:
        pass
    else:
        raise AssertionError("expected ValueError when train_data is None")

    try:
        BaseMethod.from_provided(train_data=data, test_data=None)
    except ValueError:
        pass
    else:
        raise AssertionError("expected ValueError when test_data is None")

    bm = BaseMethod.from_provided(train_data=data, test_data=data)

    assert bm.total_users == 10
    assert bm.total_items == 10
Exemplo n.º 13
0
    def test_get_train_triplet(self):
        """get_train_triplet keeps only edges inside the given id sets."""
        edges = Reader().read('./tests/graph_data.txt', sep=' ')
        graph = GraphModule(data=edges)

        # Index source nodes in first-seen order.
        id_map = OrderedDict()
        for src, _, _ in edges:
            id_map.setdefault(src, len(id_map))

        graph.build(id_map=id_map)

        # (row ids, col ids) -> expected number of surviving triplets.
        for rows, cols, expected in [([0, 1, 2], [0, 1, 2], 3),
                                     ([0, 2], [0, 1], 1)]:
            rid, cid, val = graph.get_train_triplet(rows, cols)
            self.assertEqual(len(rid), expected)
            self.assertEqual(len(cid), expected)
            self.assertEqual(len(val), expected)
Exemplo n.º 14
0
    def test_init(self):
        """TestSet.from_uir indexing and exclusion of pre-seen pairs."""
        triplets = Reader().read('./tests/data.txt')

        test_set = TestSet.from_uir(triplets,
                                    global_uid_map={},
                                    global_iid_map={},
                                    global_ui_set=set())
        self.assertEqual(test_set.get_uid('768'), 1)
        self.assertEqual(test_set.get_iid('195'), 7)
        self.assertSequenceEqual(test_set.users, range(10))
        self.assertListEqual(test_set.get_ratings(2), [(2, 4)])

        # A (user, item) pair already in the global set is excluded.
        test_set = TestSet.from_uir(triplets,
                                    global_uid_map=OrderedDict(),
                                    global_iid_map=OrderedDict(),
                                    global_ui_set={('76', '93')},
                                    verbose=True)
        self.assertEqual(len(test_set.users), 9)
Exemplo n.º 15
0
def test_matrix_trainset_uir_iter():
    """Batches yielded by uir_iter() follow the input triplet order."""
    triplets = Reader.read_uir_triplets('./tests/data.txt')

    train_set = MatrixTrainSet.from_uir_triplets(triplets,
                                                 pre_uid_map={},
                                                 pre_iid_map={},
                                                 pre_ui_set=set(),
                                                 verbose=True)

    observed_users = [u for u, _, _ in train_set.uir_iter()]
    assert all(a == b for a, b in zip(observed_users, range(10)))

    observed_items = [i for _, i, _ in train_set.uir_iter()]
    assert all(a == b for a, b in zip(observed_items, range(10)))

    observed_ratings = [r for _, _, r in train_set.uir_iter()]
    expected_ratings = [4, 4, 4, 4, 3, 4, 4, 5, 3, 4]
    assert all(a == b for a, b in zip(observed_ratings, expected_ratings))
Exemplo n.º 16
0
    def test_build(self):
        """Build a GraphModule and verify its adjacency structures."""
        data = Reader().read('./tests/graph_data.txt', sep=' ')
        gmd = GraphModule(data=data)

        # Index source nodes in first-seen order.
        global_iid_map = OrderedDict()
        for raw_iid, raw_jid, val in data:
            global_iid_map.setdefault(raw_iid, len(global_iid_map))

        gmd.build(id_map=global_iid_map)

        self.assertEqual(len(gmd.map_rid), 7)
        self.assertEqual(len(gmd.map_cid), 7)
        self.assertEqual(len(gmd.val), 7)
        self.assertEqual(gmd.matrix.shape, (7, 7))

        # Building without data/id_map must raise. assertRaises fails the
        # test when no exception occurs, unlike the previous bare
        # try/except which passed silently.
        with self.assertRaises(ValueError):
            GraphModule().build()
Exemplo n.º 17
0
    def test_from_splits(self):
        """from_splits rejects missing splits and counts users/items."""
        data = Reader().read('./tests/data.txt')

        # Missing splits must raise ValueError. assertRaises fails the test
        # when no exception occurs, unlike the previous bare try/except
        # blocks which passed silently.
        with self.assertRaises(ValueError):
            BaseMethod.from_splits(train_data=None, test_data=None)
        with self.assertRaises(ValueError):
            BaseMethod.from_splits(train_data=data, test_data=None)

        bm = BaseMethod.from_splits(train_data=data, test_data=data)
        self.assertEqual(bm.total_users, 10)
        self.assertEqual(bm.total_items, 10)

        bm = BaseMethod.from_splits(train_data=data, test_data=data,
                                    val_data=data, verbose=True)
        self.assertEqual(bm.total_users, 10)
        self.assertEqual(bm.total_items, 10)
Exemplo n.º 18
0
def test_matrix_trainset_uij_iter():
    """uij_iter() yields (user, positive item, negative item) batches."""
    triplets = Reader.read_uir_triplets('./tests/data.txt')

    train_set = MatrixTrainSet.from_uir_triplets(triplets,
                                                 pre_uid_map={},
                                                 pre_iid_map={},
                                                 pre_ui_set=set(),
                                                 verbose=True)

    observed_users = [u for u, _, _ in train_set.uij_iter()]
    assert all(a == b for a, b in zip(observed_users, range(10)))

    observed_pos = [i for _, i, _ in train_set.uij_iter()]
    assert all(a == b for a, b in zip(observed_pos, range(10)))

    # Sampled negatives must differ from the corresponding positives.
    observed_neg = [j for _, _, j in train_set.uij_iter()]
    assert all(a != b for a, b in zip(observed_neg, range(10)))
Exemplo n.º 19
0
 def test_evaluate(self):
     """Smoke test: MF can be evaluated and the result rendered as text."""
     data = Reader().read('./tests/data.txt')
     eval_method = BaseMethod.from_splits(train_data=data, test_data=data)
     mf_model = MF(k=1, max_iter=0)
     outcome = eval_method.evaluate(mf_model, metrics=[MAE()], user_based=False)
     str(outcome)  # the result object must be printable
Exemplo n.º 20
0
 def setUp(self):
     # Load the shared UIR triplet fixture consumed by the tests below.
     self.triplet_data = Reader().read('./tests/data.txt')
Exemplo n.º 21
0
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

import cornac
from cornac.eval_methods import RatioSplit
from cornac.datasets import amazon_clothing
from cornac.data import Reader

# Load the Amazon Clothing dataset and binarize ratings using cornac.data.Reader
feedback = amazon_clothing.load_feedback(reader=Reader(bin_threshold=1.0))

# Define an evaluation method to split feedback into train and test sets
ratio_split = RatioSplit(data=feedback,
                         test_size=0.2,
                         rating_threshold=1.0,
                         seed=123,
                         exclude_unknowns=True,
                         verbose=True)

# Instantiate the recommender models to be compared
gmf = cornac.models.GMF(num_factors=8,
                        num_epochs=10,
                        learner='adam',
                        batch_size=256,
                        lr=0.001,
Exemplo n.º 22
0
 def setUp(self):
     # Fixtures shared by the tests: plain UIR triplets and the
     # time-augmented UIRT tuples parsed from the same file.
     self.triplet_data = Reader().read('./tests/data.txt')
     self.uirt_data = Reader().read('./tests/data.txt', fmt='UIRT')
Exemplo n.º 23
0
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Example for HFT with Movilen 1m dataset """

import cornac
from cornac.data import Reader
from cornac.datasets import movielens
from cornac.eval_methods import RatioSplit
from cornac.data import TextModality
from cornac.data.text import BaseTokenizer

# Load movie plot texts, then the MovieLens 1M ratings restricted to movies
# that have a plot (item_set=movie_ids).
plots, movie_ids = movielens.load_plot()
ml_1m = movielens.load_1m(reader=Reader(item_set=movie_ids))

# Build the item text modality over the plots; the vocabulary is capped at
# 5000 terms and terms appearing in more than half the documents are dropped.
item_text_modality = TextModality(corpus=plots,
                                  ids=movie_ids,
                                  tokenizer=BaseTokenizer(
                                      sep='\t', stop_words='english'),
                                  max_vocab=5000,
                                  max_doc_freq=0.5)

# 80/20 train/test split with the text modality attached.
# NOTE(review): exclude_unknowns=True presumably drops unseen users/items
# from the test set — confirm against the RatioSplit documentation.
ratio_split = RatioSplit(data=ml_1m,
                         test_size=0.2,
                         exclude_unknowns=True,
                         item_text=item_text_modality,
                         verbose=True,
                         seed=123)
Exemplo n.º 24
0
 def setUp(self):
     # Shared RatioSplit fixture with 20% test and 20% validation portions.
     # NOTE(review): exclude_unknowns=False presumably keeps unseen
     # users/items in the splits — confirm against the RatioSplit docs.
     data = Reader().read("./tests/data.txt")
     self.eval_method = RatioSplit(data,
                                   test_size=0.2,
                                   val_size=0.2,
                                   exclude_unknowns=False)
Exemplo n.º 25
0
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Example for Collaborative Deep Ranking (CDR)"""

import cornac
from cornac.data import Reader
from cornac.datasets import citeulike
from cornac.eval_methods import RatioSplit
from cornac.data import TextModality
from cornac.data.text import BaseTokenizer

# CDR composes an autoencoder with a ranking collaborative model to represent item texts and user-item interactions
# The necessary data can be loaded as follows
docs, item_ids = citeulike.load_text()
# Restrict the feedback to items that have an associated document.
feedback = citeulike.load_feedback(reader=Reader(item_set=item_ids))

# Instantiate a TextModality, it makes it convenient to work with text auxiliary information
# For more details, please refer to the tutorial on how to work with auxiliary data
# (vocabulary capped at 8000 terms; terms in more than half the documents
# are discarded via max_doc_freq).
item_text_modality = TextModality(
    corpus=docs,
    ids=item_ids,
    tokenizer=BaseTokenizer(stop_words="english"),
    max_vocab=8000,
    max_doc_freq=0.5,
)

# Define an evaluation method to split feedback into train and test sets
ratio_split = RatioSplit(
    data=feedback,
    test_size=0.2,
Exemplo n.º 26
0
 def setUp(self):
     # Shared 5-fold cross-validation fixture over the test ratings.
     self.data = Reader().read('./tests/data.txt')
     self.n_folds = 5
     self.cv = CrossValidation(data=self.data,
                               n_folds=self.n_folds,
                               exclude_unknowns=False)
Exemplo n.º 27
0
    def test_filter(self):
        """Reader filters: rating binarization, frequency and id-set limits."""
        # bin_threshold keeps 8 records and maps every rating to 1.
        binarized = Reader(bin_threshold=4.0).read(self.data_file)
        self.assertEqual(len(binarized), 8)
        self.assertListEqual([record[2] for record in binarized],
                             [1] * len(binarized))

        # Remaining filters: constructor kwargs -> expected record count.
        cases = [
            (dict(min_user_freq=2), 0),
            (dict(min_item_freq=2), 0),
            (dict(user_set=['76'], item_set=['93']), 1),
            (dict(user_set=['76', '768']), 2),
            (dict(item_set=['93', '257', '795']), 3),
        ]
        for kwargs, expected in cases:
            self.assertEqual(len(Reader(**kwargs).read(self.data_file)),
                             expected)
Exemplo n.º 28
0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Example for Bayesian Personalized Ranking with Netflix dataset"""

import cornac
from cornac.data import Reader
from cornac.datasets import netflix
from cornac.eval_methods import RatioSplit

# Small Netflix subset with ratings binarized at threshold 1.0;
# 90/10 train/test split.
ratio_split = RatioSplit(
    data=netflix.load_data_small(reader=Reader(bin_threshold=1.0)),
    test_size=0.1,
    rating_threshold=1.0,
    exclude_unknowns=True,
    verbose=True)

# Models under comparison: a popularity baseline and BPR.
most_pop = cornac.models.MostPop()
bpr = cornac.models.BPR(k=10,
                        max_iter=100,
                        learning_rate=0.001,
                        lambda_reg=0.01,
                        verbose=True)

# Ranking metrics used for the comparison.
auc = cornac.metrics.AUC()
rec_20 = cornac.metrics.Recall(k=20)
Exemplo n.º 29
0
# -*- coding: utf-8 -*-
"""
Example for Collaborative Deep Ranking

@author: Tran Thanh Binh
"""

import cornac
from cornac.data import Reader
from cornac.datasets import citeulike
from cornac.eval_methods import RatioSplit
from cornac.data import TextModule
from cornac.data.text import BaseTokenizer

# Load CiteULike article texts, then the feedback restricted to items that
# have an associated document (item_set=item_ids).
docs, item_ids = citeulike.load_text()
data = citeulike.load_data(reader=Reader(item_set=item_ids))

# build text module (vocabulary capped at 8000 terms; terms in more than
# half the documents are discarded via max_doc_freq)
item_text_module = TextModule(corpus=docs,
                              ids=item_ids,
                              tokenizer=BaseTokenizer('\t'),
                              max_vocab=8000,
                              max_doc_freq=0.5,
                              stop_words='english')

ratio_split = RatioSplit(data=data,
                         test_size=0.2,
                         exclude_unknowns=True,
                         item_text=item_text_module,
                         verbose=True,
                         seed=123,
Exemplo n.º 30
0
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Example for Social Bayesian Personalized Ranking with Epinions dataset"""

import cornac
from cornac.data import Reader, GraphModule
from cornac.datasets import epinions
from cornac.eval_methods import RatioSplit

# Epinions ratings binarized at threshold 4.0; the user trust network is
# attached as a user graph modality for SBPR. 90/10 train/test split.
ratio_split = RatioSplit(data=epinions.load_data(Reader(bin_threshold=4.0)),
                         test_size=0.1,
                         rating_threshold=0.5,
                         exclude_unknowns=True,
                         verbose=True,
                         user_graph=GraphModule(data=epinions.load_trust()))

# Social BPR model.
# NOTE(review): lambda_u/lambda_v/lambda_b presumed to be regularization
# weights based on their names — confirm against the SBPR documentation.
sbpr = cornac.models.SBPR(k=10,
                          max_iter=50,
                          learning_rate=0.001,
                          lambda_u=0.015,
                          lambda_v=0.025,
                          lambda_b=0.01,
                          verbose=True)
rec_10 = cornac.metrics.Recall(k=10)