def _hold_out(library, count):
    """Shuffle the library's keys and pull the first *count* documents out.

    Returns the list of values returned by ``library.rmv_document`` for the
    selected keys, in shuffled order.
    """
    keys = library.get_keys()
    shuffle(keys)
    return [library.rmv_document(key) for key in keys[:count]]


def _label_library(rsys, library, rating):
    """Register the content of every document in *library* with *rating*."""
    for key in library.get_keys():
        rsys.set_rate(library.get_document(key).content(), rating)


def _print_sample_ratings(rsys, library, count):
    """Print the rating of *count* randomly chosen documents of *library*."""
    keys = library.get_keys()
    shuffle(keys)
    for key in keys[:count]:
        # Single-argument parenthesised print behaves the same under
        # Python 2 (print statement) and Python 3 (print function).
        print(rsys.rate(library.get_document(key).content()))


def train_recsys():
    """Train a RecommenderSystem on the like/dislike libraries and return it.

    Steps, in order:

    1. Load three sample documents from ``res/sample``.
    2. Load the ``res/like`` and ``res/dislike`` libraries and hold out five
       randomly chosen documents from each as cross-validation data.
    3. Build a vocabulary list per library, clean it, and concatenate the
       two generated masks into one mask for the recommender.
    4. Label every remaining liked document with 1.0 and every remaining
       disliked document with 0.0, then train.
    5. Print ratings for five random training documents per library, for
       the three sample documents, and for the held-out documents.

    Returns:
        The trained ``RecommenderSystem`` instance.
    """
    from nbot.document import Library, load_document

    holdout_size = 5  # documents per library kept back for cross-validation
    sample_size = 5   # training documents per library rated for inspection
    separator = '---------------------------------------'

    doc0 = load_document('res/sample/blubb.html')
    doc1 = load_document('res/sample/page.html')
    doc2 = load_document('res/sample/dislikepage.html')

    lib_like = Library()
    lib_like.load('res/like', False)
    lib_dislike = Library()
    lib_dislike.load('res/dislike', False)

    # Hold-out happens before the vocabulary is generated, so the
    # cross-validation documents are taken out of the libraries first.
    # NOTE(review): assumes rmv_document removes the document from the
    # library and returns it -- confirm against Library.
    like_cv = _hold_out(lib_like, holdout_size)
    dislike_cv = _hold_out(lib_dislike, holdout_size)

    vlist_like = lib_like.gen_vocablist()
    vlist_dislike = lib_dislike.gen_vocablist()
    # NOTE(review): 10 is presumably a minimum-occurrence threshold --
    # confirm against VocabList.clean.
    vlist_like.clean(10)
    vlist_dislike.clean(10)

    # The recommender's mask is the like mask followed by the dislike mask.
    mask = []
    mask.extend(vlist_like.gen_mask())
    mask.extend(vlist_dislike.gen_mask())

    rsys = RecommenderSystem(mask, len(mask))
    _label_library(rsys, lib_like, 1.)
    _label_library(rsys, lib_dislike, 0.)
    # NOTE(review): arguments are presumably iteration count and learning
    # rate -- confirm against RecommenderSystem.train.
    rsys.train(10000000, 0.1)

    # Ratings on documents the system was trained with.
    _print_sample_ratings(rsys, lib_like, sample_size)
    _print_sample_ratings(rsys, lib_dislike, sample_size)

    print(separator)
    for sample in (doc0, doc1, doc2):
        print(rsys.rate(sample.content()))

    print(separator)
    print('CV data')
    print('(1) LIKE')
    for doc in like_cv:
        print(rsys.rate(doc.content()))
    print('(2) DISLIKE')
    for doc in dislike_cv:
        print(rsys.rate(doc.content()))
    # This seems to work, however, more training/cv data will be necessary!
    print(separator)
    return rsys
if __name__ == '__main__':
    # some tests
    # load_document is called right below, so it is imported here together
    # with the other nbot.document names (it was missing from this import).
    from nbot.document import Document, Library, VocabList, load_document

    # Sample pages used for spot checks.
    doc0 = load_document('res/sample/blubb.html')
    doc1 = load_document('res/sample/page.html')
    doc2 = load_document('res/sample/dislikepage.html')

    lib_like = Library()
    lib_like.load('res/like', False)
    lib_dislike = Library()
    lib_dislike.load('res/dislike', False)

    # Pull five randomly chosen documents out of each library and keep them
    # as cross-validation data.
    like_cv = []
    keys = lib_like.get_keys()
    shuffle(keys)
    for key in keys[:5]:
        like_cv.append(lib_like.rmv_document(key))

    dislike_cv = []
    keys = lib_dislike.get_keys()
    shuffle(keys)
    for key in keys[:5]:
        dislike_cv.append(lib_dislike.rmv_document(key))

    # Vocabulary lists are generated after the hold-out documents were
    # taken out of the libraries.
    vlist_like = lib_like.gen_vocablist()
    vlist_dislike = lib_dislike.gen_vocablist()
    # NOTE(review): 10 is presumably a minimum-occurrence threshold --
    # confirm against VocabList.clean.
    vlist_like.clean(10)
    vlist_dislike.clean(10)