Example #1
    # fragment of a training script: modeldir, datadir, dataname, train_matrix,
    # n_emb, layers, dropouts, mf_pretrain, mlp_pretrain and freeze are assumed
    # to be defined earlier (e.g. parsed from the command line)
    resultsdfpath = os.path.join(modeldir, 'results_df.p')

    dataset = np.load(os.path.join(datadir, dataname))
    train_ratings = load_npz(os.path.join(datadir, train_matrix)).todok()
    test_ratings, negatives = dataset['test_negative'], dataset['negatives']
    n_users, n_items = dataset['n_users'].item(), dataset['n_items'].item()

    test_loader = DataLoader(dataset=test_ratings, batch_size=1000, shuffle=False)

    model = NeuMF(n_users, n_items, n_emb, layers, dropouts)
    if os.path.isfile(mf_pretrain) and os.path.isfile(mlp_pretrain):
        gmf_model = GMF(n_users, n_items, n_emb)
        gmf_model.load_state_dict(torch.load(mf_pretrain))
        mlp_model = MLP(n_users, n_items, layers, dropouts)
        mlp_model.load_state_dict(torch.load(mlp_pretrain))
        model = load_pretrain_model(model, gmf_model, mlp_model)
        print("Load pretrained GMF {} and MLP {} models done. ".format(mf_pretrain, mlp_pretrain))

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model = model.cuda()

    if freeze:
        # freeze every layer except the final output layer
        for name, layer in model.named_parameters():
            if "out" not in name:
                layer.requires_grad = False

    # alternatively, leave requires_grad untouched and pass only the trainable
    # parameters to the optimizer, as sketched below
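    # A minimal sketch of that alternative (torch.optim.Adam is just one
    # possible optimizer; the learning rate `lr` is assumed to be defined
    # elsewhere in the script):
    train_parameters = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.Adam(train_parameters, lr=lr)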
Example #2
import gzip
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from sklearn.neighbors import NearestNeighbors
# GMF is the model class from Example #1; import it from wherever it is
# defined in your project

def parse(path):
	"""Yield one record per line from a gzipped file of dict records."""
	g = gzip.open(path, 'rb')
	for l in g:
		# each line is a python dict literal rather than strict JSON,
		# hence eval rather than json.loads
		yield eval(l)

def getDF(path):
	"""Load the records yielded by parse() into a DataFrame, one row each."""
	i = 0
	df = {}
	for d in parse(path):
		df[i] = d
		i += 1
	return pd.DataFrame.from_dict(df, orient='index')

DATA_PATH = Path(".")
MODEL_DIR = "models"

asin2id_map = pickle.load(open(DATA_PATH/'item_mappings.p', 'rb'))
id2asin_map = {idx: asin for asin, idx in asin2id_map.items()}

df_movies_meta_data = getDF(DATA_PATH/'meta_Movies_and_TV.json.gz')
keep_cols = ['asin', 'title']
df_movies_meta_data = df_movies_meta_data[keep_cols]
df_movies_meta_data = df_movies_meta_data[~df_movies_meta_data.title.isna()]
asin2title_map = dict(df_movies_meta_data.values)
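# The two maps compose as id -> asin -> title: an item's internal id maps to
# its ASIN, and the ASIN maps to a title when the metadata contains one, e.g.
# asin2title_map[id2asin_map[some_id]] (some_id is a hypothetical example).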

print("number of items with missing title in the core dataset: {}".format(
	np.setdiff1d(list(id2asin_map.values()), list(asin2title_map.keys())).shape[0]))
print("number of items with non missing titles in the core dataset: {}".format(
	len(id2asin_map) \
	- np.setdiff1d(list(id2asin_map.values()), list(asin2title_map.keys())).shape[0]))

id2title_map = {}
for idx, asin in id2asin_map.items():
	try:
		id2title_map[idx] = asin2title_map[asin]
	except KeyError:
		# items without metadata simply get no title
		continue

df_results = pd.read_pickle(DATA_PATH/MODEL_DIR/'results_df.p')
best_gmf = (df_results[df_results.modelname.str.contains('GMF')]
	.sort_values('best_hr', ascending=False)
	.reset_index(drop=True)
	).modelname[0]
# model filenames are assumed to embed the embedding size right after the
# token 'emb' (e.g. '..._emb_32_...'); recover it from the best model's name
n_emb_i = best_gmf.split("_").index('emb') + 1
n_emb = int(best_gmf.split("_")[n_emb_i])

dataset = np.load(DATA_PATH/'neuralcf_split.npz')
n_users, n_items = dataset['n_users'].item(), dataset['n_items'].item()

gmf_model = GMF(n_users, n_items, n_emb)
# map_location lets checkpoints saved on GPU load on CPU-only machines
gmf_model.load_state_dict(torch.load(DATA_PATH/MODEL_DIR/best_gmf, map_location='cpu'))
item_embeddings = gmf_model.embeddings_item.weight.data.numpy()

# brute-force cosine-similarity search over the learned item embeddings
knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_model.fit(item_embeddings)

def get_movie_titles(input_id, n=20):
	"""Return the titles of the n nearest neighbours of input_id in the item
	embedding space. The first title is the "query" movie itself; the
	remaining n-1 are its most similar movies, where "similar" means close in
	the representation the model uses to make predictions."""
	dist, nnidx = knn_model.kneighbors(
		item_embeddings[input_id].reshape(1, -1),
		n_neighbors=n)
	titles = []
	for idx in nnidx[0]:
		try:
			titles.append(id2title_map[idx])
		except KeyError:
			# skip neighbours whose titles are missing from the metadata
			continue
	return titles

similar_movies = get_movie_titles(1234)
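# Quick usage check: print the query title (first element) followed by its
# nearest neighbours. Movie id 1234 above is an arbitrary example.
for title in similar_movies:
	print(title)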