Пример #1
0
 def _convert_hash(self, dataset):
     # Flatten a two-level mapping into (value, outer_key, inner_key)
     # tuples and accumulate them all into a single Data instance.
     result = Data()
     for outer in dataset:
         inner = dataset[outer]
         result.set([(inner[k], outer, k) for k in inner], extend=True)
     return result
Пример #2
0
def test_data_extend():
    # Data.set(extend=True) must append to, not replace, stored tuples.
    first_batch = [(1, 2, 3), (4, 5, 6)]
    second_batch = [(7, 8, 9), (10, 11, 12)]
    d = Data()
    d.set(first_batch)
    assert_equal(len(d), 2)

    d.set(second_batch, extend=True)
    assert_equal(len(d), 4)
Пример #3
0
def test_data_extend():
    """Setting with extend=True grows the container instead of replacing it."""
    initial = [(1, 2, 3), (4, 5, 6)]
    extra = [(7, 8, 9), (10, 11, 12)]

    container = Data()
    container.set(initial)
    assert_equal(len(container), 2)

    container.set(extra, extend=True)
    assert_equal(len(container), 4)
Пример #4
0
    def update(self, USER_ID, baseline, path, pred_items):
        print "Loading tweet occurrences pickle..."
        baseline.get_data()._load_pickle(path=path + "tweet_occurrences.p")
        tweet_occurrences = baseline.get_data().get()

        print "Loading count_dict pickle..."
        count_dict = cPickle.load(open(path + "count_dict.p"))

        print "Loading occurrences pickle..."
        occurrences = cPickle.load(open(path + "occurrences.p"))

        total_count = count_dict[USER_ID]
        upd_total_count = int(total_count) + len(pred_items)
        count_dict[USER_ID] = int(upd_total_count)

        print "Dumping count_dict pickle..."
        cPickle.dump(count_dict, open(path + "count_dict.p", "wb"), 2)

        print "Updating counts for known artists..."
        for index, (count, item_id, user_id) in enumerate(tweet_occurrences):
            if str(user_id).encode('utf-8') == USER_ID:
                item_id = str(item_id).encode('utf-8')
                count = occurrences[(item_id, USER_ID)]
                upd_count = float(count) / float(upd_total_count)

                occurrences[(item_id, USER_ID)] = float(upd_count)
                baseline._matrix.set_value(item_id, USER_ID, float(upd_count))
                tweet_occurrences[index] = (float(upd_count), item_id, user_id)

        print "Updating counts for recommended artists..."
        for item_id, relevance in pred_items:
            count = (1.0 / float(upd_total_count))
            baseline._matrix.set_value(item_id, USER_ID, float(count))
            occurrences[(item_id, USER_ID)] = float(count)
            tweet_occurrences.append((float(count), item_id, USER_ID))

        print "Dumping tweet occurrences pickle..."
        data_tweet_occurrences = Data()
        data_tweet_occurrences.set(tweet_occurrences)

        baseline.set_data(data_tweet_occurrences)
        baseline.save_data(filename=path + "tweet_occurrences.p", pickle=True)

        print "Dumping occurrence pickle..."
        cPickle.dump(occurrences, open(path + "occurrences.p", "wb"), protocol=2)

        print "Dumping sparse matrix pickle..."
        cPickle.dump(baseline._matrix.get(), open(path + "sparse_matrix.p", "w"), protocol=2)
Пример #5
0
def get_movie(movie_id):
	"""Render the detail page for one movie.

	Fetches the movie row plus its directors/actors/writers/genres, the
	logged-in user's rating (if any), and a list of similar movies computed
	by factorizing the full ratings table with SVD.
	"""
	movie = {}
	rating = 0
	with sqlite3.connect('data/data100.db') as con:
		cur = con.cursor()
		cur.execute("SELECT * FROM movies WHERE movie_id = ?", (movie_id,))
		movie_result = cur.fetchone()
		cur.execute("SELECT director FROM movie_directors WHERE movie_id = ?", (movie_id,))
		directors = cur.fetchall()
		cur.execute("SELECT actor FROM movie_actors WHERE movie_id = ?", (movie_id,))
		actors = cur.fetchall()
		cur.execute("SELECT writer FROM movie_writers WHERE movie_id = ?", (movie_id,))
		writers = cur.fetchall()
		cur.execute("SELECT genre FROM movie_genres WHERE movie_id = ?", (movie_id,))
		genres = cur.fetchall()
		if 'session_user' in request.cookies:
			cur.execute("SELECT * FROM ratings WHERE user_id = ? AND movie_id = ?", (request.get_cookie('session_user', secret='recsys')[0], movie_id,))
			rating = cur.fetchone()
		# Factorize the full ratings table to find similar movies.
		cur.execute("SELECT * FROM ratings")
		rating_results = cur.fetchall()
		d = Data()
		d.set(rating_results)
		svd = SVD()
		svd.set_data(d)
		similar_list = [str(s[0]) for s in svd.similar(int(movie_id))]
		# BUG FIX: the id list was interpolated straight into the SQL text;
		# bind them as "?" parameters instead, and skip the query entirely
		# when there are no similar ids ("IN ()" is a SQL syntax error).
		similar_movies = []
		if similar_list:
			placeholders = ', '.join('?' * len(similar_list))
			cur.execute("SELECT * FROM movies WHERE movie_id IN (%s)" % placeholders, similar_list)
			similar_movies = cur.fetchall()
		movie = {
			'mid': movie_result[0],
			'title': movie_result[1],
			'description': movie_result[2],
			'image': movie_result[3],
			'year': movie_result[4],
			'directors': [d[0] for d in directors],
			'writers': [w[0] for w in writers],
			'actors': [a[0] for a in actors],
			'genres': [g[0] for g in genres],
			'rating': rating,
			'similar_movies': similar_movies,
		}
	session_user = request.get_cookie('session_user', secret='recsys') if 'session_user' in request.cookies else None
	return template('static/movie.html', movie=movie, session_user=session_user)
Пример #6
0
def get_feeds():
	"""Render the feeds page.

	If the logged-in user has rated anything, recommend movies via SVD over
	the full ratings table; otherwise fall back to listing every movie keyed
	by its average rating.
	"""
	movielist = {}
	with sqlite3.connect('data/data100.db') as con:
		cur = con.cursor()
		cur.execute("SELECT * FROM ratings WHERE user_id = ?", (request.get_cookie('session_user', secret='recsys')[0],))
		if cur.fetchone():
			# User has rating history: personalize with SVD recommendations.
			cur.execute("SELECT ratings, movie_id, user_id FROM ratings")
			rating_results = cur.fetchall()
			d = Data()
			d.set(rating_results)
			svd = SVD()
			svd.set_data(d)
			recommendations = [str(s[0]) for s in svd.recommend(request.get_cookie('session_user', secret='recsys')[0], is_row=False)]
			# BUG FIX: the id list was interpolated into the SQL string;
			# bind "?" parameters instead, and skip the query when empty
			# ("IN ()" is a SQL syntax error).
			similar_movies = []
			if recommendations:
				placeholders = ', '.join('?' * len(recommendations))
				cur.execute("SELECT * FROM movies WHERE movie_id IN (%s)" % placeholders, recommendations)
				similar_movies = cur.fetchall()
			for m in similar_movies:
				movielist[m] = {
					'mid': m[0],
					'title': m[1],
					'description': m[2],
					'image': m[3],
					'year': m[4]
				}
		else:
			# Cold start: list all movies keyed by their average rating.
			# NOTE(review): movies sharing the same average overwrite each
			# other in this dict — looks like a latent bug, but the template
			# contract is not visible here, so behavior is kept as-is.
			cur.execute("SELECT * FROM movies")
			movies = cur.fetchall()
			for m in movies:
				cur.execute("SELECT AVG(ratings) FROM ratings WHERE movie_id = ?", (m[0],))
				avg = cur.fetchone()[0]
				movielist[avg] = {
					'mid': m[0],
					'title': m[1],
					'description': m[2],
					'image': m[3],
					'year': m[4]
				}
	session_user = request.get_cookie('session_user', secret='recsys') if 'session_user' in request.cookies else None
	return template('static/feeds.html', movielist=movielist, session_user=session_user)
Пример #7
0
from recsys.algorithm.factorize import SVD
from recsys.datamodel.data import Data

svd = SVD()
data = Data()
data.load(path='../data/userchlfav',#
          force=True, sep=','
          , format={'col':0, 'row':1, 'ids': int} #, 'value':2
          , pickle=False)

print len(data._data)

for rate in data._data:
    rate[0]

data.set([rate for rate in data._data if rate[1]<1000])

print len(data._data)

svd.set_data(data)

k = 100
svd.compute(k=k,
            min_values=10,
            pre_normalize=None,
            mean_center=True,
            post_normalize=True)


#ITEMID1 = 1    # Toy Story (1995)
#ITEMID2 = 2355 # A bug's life (1998)
Пример #8
0
from recsys.algorithm.factorize import SVD
from recsys.datamodel.data import Data

data = [(4.0, 'user1', 'item1'), (2.0, 'user1', 'item3'),
        (1.0, 'user2', 'item1'), (5.0, 'user2', 'item4')]

d = Data()
d.set(data)
svd = SVD()
svd.set_data(d)
m = svd.get_matrix()
svd.compute(k=2)
print svd.similar('user1')
print svd.predict('user1', 'item1')
Пример #9
0
from recsys.algorithm.factorize import SVD
from recsys.datamodel.data import Data

data = [(4.0, 'user1', 'item1'),
 (2.0, 'user1', 'item3'),
 (1.0, 'user2', 'item1'),
 (5.0, 'user2', 'item4')]

d = Data()
d.set(data)
svd = SVD()
svd.set_data(d)
m = svd.get_matrix()
svd.compute(k=2)
print svd.similar('user1')
print svd.predict('user1', 'item1')