def test_db():
    """Build one vocabulary list from every ``.mk4`` database in FEED_DIR.

    Each directory entry whose name ends in ``.mk4`` is opened with
    ``metakit.storage`` and read through ``read_database``.  When the
    result is non-empty, a ``Library`` is filled with one document per
    feed, its vocabulary list is generated, cleaned with the argument 5
    and merged into a global list.  The merged list is printed at the end;
    nothing is returned.
    """
    merged_vocab = VocabList()
    log('searching directory: %s' % FEED_DIR)

    for entry in os.listdir(FEED_DIR):
        # Only Metakit files are of interest; skip everything else early.
        if not entry.endswith('.mk4'):
            continue
        log('found database: %s' % entry)

        # open database
        # NOTE(review): the second argument 0 presumably selects read-only
        # mode -- confirm against the Metakit binding.
        storage = metakit.storage(os.path.join(FEED_DIR, entry), 0)
        feeds = read_database(storage)

        if len(feeds) > 0:
            # feed content in database
            log('create library')
            library = Library()
            for feed in feeds:
                library.add_document(read_data(feed))

            vocab = library.gen_vocablist()
            vocab.clean(5)
            merged_vocab.merge(vocab)

        # close database (done by dropping the only reference to it)
        storage = None

    print(merged_vocab)
def train_recsys():
    """Train a RecommenderSystem on the like/dislike libraries and report.

    Steps, in order:

    1. Load three sample documents from ``res/sample``.
    2. Load the libraries stored under ``res/like`` and ``res/dislike``.
    3. Remove up to five randomly chosen documents from each library and
       keep them aside as cross-validation (CV) data.
    4. Generate a vocabulary list per library, clean each with the
       argument 10, generate a mask from each and concatenate the two
       masks (like first, dislike second).
    5. Create ``RecommenderSystem(mask, len(mask))``, register every
       remaining liked document with rate 1.0 and every remaining disliked
       document with rate 0.0, then call ``train(10000000, 0.1)``.
    6. Print the ratings of up to five random training documents per
       library, of the three sample documents and of the CV documents.

    Returns the trained RecommenderSystem instance.
    """
    from nbot.document import Document, Library, VocabList, load_document

    divider = '---------------------------------------'
    holdout_size = 5

    samples = [
        load_document(path)
        for path in (
            'res/sample/blubb.html',
            'res/sample/page.html',
            'res/sample/dislikepage.html',
        )
    ]

    liked = Library()
    liked.load('res/like', False)
    disliked = Library()
    disliked.load('res/dislike', False)

    # Set aside a random hold-out set from each library; the removed
    # documents are not part of the vocabulary or of the training data.
    candidates = liked.get_keys()
    shuffle(candidates)
    like_cv = [liked.rmv_document(k) for k in candidates[:holdout_size]]

    candidates = disliked.get_keys()
    shuffle(candidates)
    dislike_cv = [disliked.rmv_document(k) for k in candidates[:holdout_size]]

    # Vocabulary of each library -> mask; the final mask is the like mask
    # followed by the dislike mask.
    liked_vocab = liked.gen_vocablist()
    disliked_vocab = disliked.gen_vocablist()
    liked_vocab.clean(10)
    disliked_vocab.clean(10)
    liked_mask = liked_vocab.gen_mask()
    disliked_mask = disliked_vocab.gen_mask()

    mask = list(liked_mask)
    mask.extend(disliked_mask)

    rsys = RecommenderSystem(mask, len(mask))

    # Target rate is 1.0 for liked documents and 0.0 for disliked ones.
    for library, target in ((liked, 1.), (disliked, 0.)):
        for key in library.get_keys():
            rsys.set_rate(library.get_document(key).content(), target)

    rsys.train(10000000, 0.1)

    # Spot-check on documents that were part of the training data.
    for library in (liked, disliked):
        picks = library.get_keys()
        shuffle(picks)
        for key in picks[:5]:
            print(rsys.rate(library.get_document(key).content()))

    print(divider)
    for sample in samples:
        print(rsys.rate(sample.content()))

    print(divider)
    print('CV data')
    print('(1) LIKE')
    for held_out in like_cv:
        print(rsys.rate(held_out.content()))
    print('(2) DISLIKE')
    for held_out in dislike_cv:
        print(rsys.rate(held_out.content()))
    # This seems to work, however, more training/cv data will be necessary!
    print(divider)

    return rsys
lib_dislike = Library() lib_dislike.load('res/dislike', False) like_cv = [] keys = lib_like.get_keys() shuffle(keys) for key in keys[:5]: like_cv.append(lib_like.rmv_document(key)) dislike_cv = [] keys = lib_dislike.get_keys() shuffle(keys) for key in keys[:5]: dislike_cv.append(lib_dislike.rmv_document(key)) vlist_like = lib_like.gen_vocablist() vlist_dislike = lib_dislike.gen_vocablist() vlist_like.clean(10) vlist_dislike.clean(10) like_mask = vlist_like.gen_mask() dislike_mask = vlist_dislike.gen_mask() printlist(like_mask) print '-------------------------------------------' printlist(dislike_mask) mask = [] mask.extend(like_mask) mask.extend(dislike_mask)