示例#1
0
class TestBaseTokenizer(unittest.TestCase):
    """Unit tests for ``BaseTokenizer``: defaults, tokenization, pre-rules and stop words."""

    def setUp(self):
        """Create a tokenizer with default arguments for each test."""
        self.tok = BaseTokenizer()

    def test_init(self):
        """The default separator is a single space."""
        self.assertEqual(self.tok.sep, ' ')

    def test_tokenize(self):
        """A single string is split into its tokens."""
        tokens = self.tok.tokenize('a b c')
        self.assertListEqual(tokens, ['a', 'b', 'c'])

    def test_batch_tokenize(self):
        """A list of strings yields one token list per input string."""
        token_list = self.tok.batch_tokenize(['a b c', 'd e f'])
        self.assertListEqual(token_list, [['a', 'b', 'c'], ['d', 'e', 'f']])

    def test_default_rules(self):
        """With DEFAULT_PRE_RULES, noisy input is reduced to the expected tokens."""
        tok = BaseTokenizer(pre_rules=DEFAULT_PRE_RULES)
        token_list = tok.tokenize('<t>a</t> B |{ C ]?&$  d123 E')
        self.assertListEqual(token_list, ['a', 'b', 'c', 'd', 'e'])

    def test_stopwords(self):
        """Stop words can be given by name or as a list; unknown names are rejected."""
        text = 'this is a nice house'

        tok = BaseTokenizer(stop_words='english')
        self.assertListEqual(tok.tokenize(text), ['nice', 'house'])

        tok = BaseTokenizer(stop_words=['is', 'a'])
        self.assertListEqual(tok.tokenize(text), ['this', 'nice', 'house'])

        # assertRaises fails the test when no ValueError is raised, so an
        # unsupported stop-word name cannot slip through unnoticed.
        with self.assertRaises(ValueError):
            BaseTokenizer(stop_words='vietnamese')
示例#2
0
    def test_stopwords(self):
        """Stop words can be given by name or as a list; unknown names are rejected."""
        text = 'this is a nice house'

        tok = BaseTokenizer(stop_words='english')
        self.assertListEqual(tok.tokenize(text), ['nice', 'house'])

        tok = BaseTokenizer(stop_words=['is', 'a'])
        self.assertListEqual(tok.tokenize(text), ['this', 'nice', 'house'])

        # assertRaises fails the test when no ValueError is raised, so an
        # unsupported stop-word name cannot slip through unnoticed.
        with self.assertRaises(ValueError):
            BaseTokenizer(stop_words='vietnamese')
示例#3
0
"""

import cornac
from cornac.data import Reader
from cornac.datasets import citeulike
from cornac.eval_methods import RatioSplit
from cornac.data import TextModule
from cornac.data.text import BaseTokenizer

# Load the CiteULike documents with their item ids, then the interaction data.
# The Reader receives the same ids as its item_set (presumably so that only
# items that have text are kept -- confirm against Reader).
docs, item_ids = citeulike.load_text()
data = citeulike.load_data(reader=Reader(item_set=item_ids))

# Build the item text module from the corpus. The tokenizer receives '\t' as
# its first positional argument (presumably the separator -- confirm).
item_text_module = TextModule(corpus=docs,
                              ids=item_ids,
                              tokenizer=BaseTokenizer('\t'),
                              max_vocab=8000,
                              max_doc_freq=0.5,
                              stop_words='english')

# Split the data with test_size=0.2 and a fixed seed, attaching the text
# module as the item text.
ratio_split = RatioSplit(data=data,
                         test_size=0.2,
                         exclude_unknowns=True,
                         item_text=item_text_module,
                         verbose=True,
                         seed=123,
                         rating_threshold=0.5)

cdr = cornac.models.CDR(k=50,
                        autoencoder_structure=[200],
                        max_iter=100,
示例#4
0
"""Example for HFT with the MovieLens 1M dataset"""

import cornac
from cornac.data import Reader
from cornac.datasets import movielens
from cornac.eval_methods import RatioSplit
from cornac.data import TextModality
from cornac.data.text import BaseTokenizer

# Load the movie plots with their ids, then the MovieLens 1M data. The Reader
# receives the same ids as its item_set (presumably so that only movies that
# have a plot are kept -- confirm against Reader).
plots, movie_ids = movielens.load_plot()
ml_1m = movielens.load_1m(reader=Reader(item_set=movie_ids))

# Build the item text modality; the tokenizer uses a tab separator and the
# 'english' stop-word setting.
item_text_modality = TextModality(corpus=plots,
                                  ids=movie_ids,
                                  tokenizer=BaseTokenizer(
                                      sep='\t', stop_words='english'),
                                  max_vocab=5000,
                                  max_doc_freq=0.5)

# Split the data with test_size=0.2 and a fixed seed, attaching the text
# modality as the item text.
ratio_split = RatioSplit(data=ml_1m,
                         test_size=0.2,
                         exclude_unknowns=True,
                         item_text=item_text_modality,
                         verbose=True,
                         seed=123)

hft = cornac.models.HFT(k=10,
                        max_iter=40,
                        grad_iter=5,
                        l2_reg=0.001,
                        lambda_text=0.01,
示例#5
0
import cornac
from cornac.data import Reader
from cornac.datasets import citeulike
from cornac.eval_methods import RatioSplit
from cornac.data import TextModality
from cornac.data.text import BaseTokenizer

# Load the CiteULike documents with their item ids, then the interaction data.
# The Reader receives the same ids as its item_set (presumably so that only
# items that have text are kept -- confirm against Reader).
docs, item_ids = citeulike.load_text()
data = citeulike.load_data(reader=Reader(item_set=item_ids))

# Build the item text modality; the tokenizer is created with the 'english'
# stop-word setting and otherwise default arguments.
item_text_modality = TextModality(
    corpus=docs,
    ids=item_ids,
    tokenizer=BaseTokenizer(stop_words='english'),
    max_vocab=8000,
    max_doc_freq=0.5)

# Split the data with test_size=0.2 and a fixed seed, attaching the text
# modality as the item text.
ratio_split = RatioSplit(data=data,
                         test_size=0.2,
                         exclude_unknowns=True,
                         item_text=item_text_modality,
                         verbose=True,
                         seed=123,
                         rating_threshold=0.5)

cdr = cornac.models.CDR(k=50,
                        autoencoder_structure=[200],
                        max_iter=100,
                        batch_size=128,
示例#6
0
from cornac.datasets import citeulike
from cornac.eval_methods import RatioSplit
from cornac.data import TextModality
from cornac.data.text import BaseTokenizer

# CDR combines an autoencoder with a ranking collaborative model to represent
# item texts and user-item interactions.
# The necessary data can be loaded as follows:
docs, item_ids = citeulike.load_text()
feedback = citeulike.load_feedback(reader=Reader(item_set=item_ids))

# Instantiate a TextModality, which makes it convenient to work with auxiliary text information.
# For more details, please refer to the tutorial on how to work with auxiliary data.
item_text_modality = TextModality(
    corpus=docs,
    ids=item_ids,
    tokenizer=BaseTokenizer(stop_words="english"),
    max_vocab=8000,
    max_doc_freq=0.5,
)

# Define an evaluation method that splits the feedback into train and test sets
ratio_split = RatioSplit(
    data=feedback,
    test_size=0.2,
    exclude_unknowns=True,
    item_text=item_text_modality,
    verbose=True,
    seed=123,
    rating_threshold=0.5,
)
示例#7
0
from cornac.eval_methods import RatioSplit
from cornac.data import TextModality
from cornac.data.text import BaseTokenizer

# HFT jointly models the user-item preferences and item texts (e.g., product reviews) with shared item factors.
# Below we fit HFT to the MovieLens 1M dataset. We need both the ratings and the movie plot information.
plots, movie_ids = movielens.load_plot()
ml_1m = movielens.load_feedback(variant="1M",
                                reader=Reader(item_set=movie_ids))

# Instantiate a TextModality, which makes it convenient to work with auxiliary text information.
# For more details, please refer to the tutorial on how to work with auxiliary data.
item_text_modality = TextModality(
    corpus=plots,
    ids=movie_ids,
    tokenizer=BaseTokenizer(sep="\t", stop_words="english"),
    max_vocab=5000,
    max_doc_freq=0.5,
)

# Define an evaluation method that splits the feedback into train and test sets
ratio_split = RatioSplit(
    data=ml_1m,
    test_size=0.2,
    exclude_unknowns=True,
    item_text=item_text_modality,
    verbose=True,
    seed=123,
)

# Instantiate HFT model
示例#8
0
@author: Tran Thanh Binh
"""

import cornac
from cornac.data import Reader
from cornac.datasets import citeulike
from cornac.eval_methods import RatioSplit
from cornac.data import TextModule
from cornac.data.text import BaseTokenizer


# Load the CiteULike documents with their item ids, then the interaction data.
# The Reader receives the same ids as its item_set (presumably so that only
# items that have text are kept -- confirm against Reader).
docs, item_ids = citeulike.load_text()
data = citeulike.load_data(reader=Reader(item_set=item_ids))

# Build the item text module; the tokenizer splits on a single space.
item_text_module = TextModule(corpus=docs, ids=item_ids,
                              tokenizer=BaseTokenizer(sep=' '),
                              max_vocab=8000, max_doc_freq=0.5,
                              stop_words='english')

# Split the data with test_size=0.2 and a fixed seed, attaching the text
# module as the item text.
ratio_split = RatioSplit(data=data, test_size=0.2, exclude_unknowns=True,
                         item_text=item_text_module, verbose=True, seed=123, rating_threshold=0.5)
# Configure the CDL model and a Recall metric with k=300.
cdl = cornac.models.CDL(k=50, autoencoder_structure=[200], max_iter=30,
                        lambda_u=0.1, lambda_v=1, lambda_w=0.1, lambda_n=1000)
rec_300 = cornac.metrics.Recall(k=300)

# Assemble the experiment from the split, the model and the metric, then run it.
exp = cornac.Experiment(eval_method=ratio_split,
                        models=[cdl],
                        metrics=[rec_300])
exp.run()
示例#9
0
 def test_default_rules(self):
     """With DEFAULT_PRE_RULES, noisy input is reduced to the expected tokens."""
     noisy_text = '<t>a</t> B |{ C ]?&$  d123 E'
     expected_tokens = ['a', 'b', 'c', 'd', 'e']
     tokenizer = BaseTokenizer(pre_rules=DEFAULT_PRE_RULES)
     self.assertListEqual(tokenizer.tokenize(noisy_text), expected_tokens)
示例#10
0
 def setUp(self):
     """Create a BaseTokenizer with default arguments and store it as ``self.tok``."""
     self.tok = BaseTokenizer()