Example #1
def computeHistograms():
    """Calculate color histograms for all covers."""
    imgPath = cfg.databasePath() + '/Covers'
    config = cfg.readConfig()
    hBins = config.getint('Images', 'h_bins')
    sBins = config.getint('Images', 's_bins')
    with open(cfg.databasePath() + '/Sets.json', 'r') as setsFile:
        sets = json.load(setsFile)
    coverIds = [int(x.split('.')[0]) for x in os.listdir(imgPath)]
    for curSet in sets:
        idList = [x for x in sets[curSet] if x in coverIds]
        data = np.zeros((len(idList), hBins * sBins + 1))
        data[:, 0] = idList
        for i in range(len(idList)):
            coverFilename = imgPath + '/{}.jpg'.format(idList[i])
            img = cv2.imread(coverFilename, cv2.IMREAD_COLOR)
            if img is None or img.shape[2] != 3:
                print('Skipping ' + str(idList[i]))
                continue
            img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
            for j in range(sBins):
                # Saturation spans [0, 255]; use half-open bins so boundary
                # pixels land in exactly one bin instead of being dropped.
                minS = j * 256 / sBins
                maxS = (j + 1) * 256 / sBins
                relPix = img[np.logical_and(img[:, :, 1] >= minS,
                                            img[:, :, 1] < maxS)]
                # Hue spans [0, 179] in OpenCV, so fix the histogram range
                # rather than letting it depend on each image's data.
                data[i, 1+hBins*j:1+hBins*(j+1)], _ = np.histogram(
                    relPix[:, 0], bins=hBins, range=(0, 180))
        data[:, 1:] = data[:, 1:] / np.sum(data[:, 1:], axis=1)[:, np.newaxis]
        data = data[~np.isnan(data).any(axis=1)]
        np.savetxt(cfg.databasePath() + '/{}_hist.csv'.format(curSet), data)
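A quick standalone sanity check of the channel ranges the histogram relies on (an aside, not part of the pipeline):

import cv2
import numpy as np

# For 8-bit images OpenCV stores HSV with H in [0, 179] and S in [0, 255],
# which is why the hue histogram above uses range=(0, 180) and the
# saturation bins span [0, 256).
bgr = np.random.randint(0, 256, (64, 64, 3), dtype=np.uint8)
hsv = cv2.cvtColor(bgr, cv2.COLOR_BGR2HSV)
print(hsv[..., 0].max(), hsv[..., 1].max())  # stays within 179 and 255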
Example #2
def buildBagOfWords():
    """Transform image features to a BoW vector."""
    featPath = cfg.databasePath() + '/Features'
    config = cfg.readConfig()
    k = config.getint('Images', 'n_clusters')
    centers = np.loadtxt(cfg.databasePath() + '/Centers.csv')
    searcher = NearestNeighbors(n_neighbors=1, n_jobs=-1)
    searcher.fit(centers)
    with open(cfg.databasePath() + '/Sets.json', 'r') as setsFile:
        sets = json.load(setsFile)
    for curSet in sets:
        if os.path.exists(cfg.databasePath() + '/{}_img.csv'.format(curSet)):
            print('Skipping {} set.'.format(curSet))
            continue
        idList = [
            x for x in sets[curSet]
            if os.path.exists(featPath + '/{}.npy'.format(x))
        ]
        data = np.zeros((len(idList), k + 1))
        data[:, 0] = idList
        for i in range(len(idList)):
            features = np.load(featPath + '/{}.npy'.format(idList[i]))
            assignments = searcher.kneighbors(features, return_distance=False)
            for item in assignments:
                data[i, item + 1] += 1
        data[:, 1:] = data[:, 1:] / np.sum(data[:, 1:], axis=1)[:, np.newaxis]
        np.savetxt(cfg.databasePath() + '/{}_img.csv'.format(curSet), data)
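The per-feature counting loop above can also be expressed with np.bincount; a minimal equivalent sketch on toy data:

import numpy as np

k = 4
# Toy stand-in for searcher.kneighbors(...) output of shape (n_features, 1).
assignments = np.array([[0], [2], [2], [3]])
row = np.bincount(assignments.ravel(), minlength=k)
print(row)  # [1 0 2 1], the same counts the loop accumulates into data[i, 1:]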
Example #3
def downloadSwitch():
	"""Download games for the Nintendo Switch platform."""
	if not os.path.exists(cfg.databasePath() + '/Games'):
		os.makedirs(cfg.databasePath() + '/Games')
	config = cfg.readConfig()
	fields = config['Database']['fields'].split(',')
	api = igdb(config['Database']['api_key'])
	res = api.games({
		'fields': fields,
		'filters': {
			'[release_dates.platform][any]': 130
		},
		'scroll': 1,
		'limit': 50
	})
	for game in res.body:
		filename = cfg.databasePath() + '/Games/{}.json'.format(game['id'])
		with open(filename, 'w') as outFile:
			json.dump(game, outFile, indent='\t')
	# X-Count is the total number of results; the first page of 50 is
	# already in res, so fetch the remaining pages (ceiling division).
	nPages = (int(res.headers['X-Count']) + 49) // 50 - 1
	for _ in range(nPages):
		scrolled = api.scroll(res)
		if isinstance(scrolled.body, list):
			for game in scrolled.body:
				filename = cfg.databasePath() + '/Games/{}.json'.format(game['id'])
				with open(filename, 'w') as outFile:
					json.dump(game, outFile, indent='\t')
Example #4
def splitDatabase():
    """Split database into train, validation and test sets."""
    config = cfg.readConfig()
    dataPath = cfg.databasePath() + '/Games'
    idList = []
    for file in os.listdir(dataPath):
        if os.path.isfile(dataPath + '/' + file):
            idList.append(int(file.split('.')[0]))
    trainSize = int(config.getfloat('Database', 'train_size') * len(idList))
    validSize = int(config.getfloat('Database', 'valid_size') * len(idList))
    sets = {}
    sets['train'] = random.sample(idList, trainSize)
    # Set-based membership keeps the filtering O(n) instead of O(n^2).
    trainIds = set(sets['train'])
    idList = [x for x in idList if x not in trainIds]
    sets['valid'] = random.sample(idList, validSize)
    validIds = set(sets['valid'])
    sets['test'] = [x for x in idList if x not in validIds]
    with open(cfg.databasePath() + '/Sets.json', 'w') as outFile:
        json.dump(sets, outFile)
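The sample-then-filter pattern guarantees the three sets are disjoint and together cover every id; a toy run:

import random

ids = list(range(10))
train = random.sample(ids, 6)
rest = [x for x in ids if x not in train]
valid = random.sample(rest, 2)
test = [x for x in rest if x not in valid]
print(sorted(train + valid + test) == ids)  # True: disjoint and complete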
Example #5
def clusterFeatures():
    """Perform clustering on the extracted features."""
    featPath = cfg.databasePath() + '/Features'
    with open(cfg.databasePath() + '/Sets.json', 'r') as setsFile:
        idList = json.load(setsFile)['train']
    # np.vstack needs a real sequence (generators are deprecated there),
    # so materialize the list before stacking.
    arrays = [np.load(featPath + '/{}.npy'.format(gameId), allow_pickle=False)
              for gameId in idList
              if os.path.exists(featPath + '/{}.npy'.format(gameId))]
    features = np.vstack(arrays)
    print('Loaded feature matrix of shape {}'.format(features.shape))
    config = cfg.readConfig()
    k = config.getint('Images', 'n_clusters')
    model = MiniBatchKMeans(n_clusters=k,
                            batch_size=50000,
                            verbose=True,
                            compute_labels=False)
    model.fit(features)
    np.savetxt(cfg.databasePath() + '/Centers.csv', model.cluster_centers_)
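MiniBatchKMeans is used here because the stacked descriptor matrix can be too large for standard k-means; batches of 50000 keep each update cheap at some cost in cluster quality. A quick shape check on the saved centers (reusing the cfg helper from these examples):

import numpy as np

centers = np.loadtxt(cfg.databasePath() + '/Centers.csv')
print(centers.shape)  # expected (n_clusters, 128) for SIFT descriptors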
Example #6
def extractFeatures():
    """Extract SIFT features from all the images in the database."""
    imgPath = cfg.databasePath() + '/Covers'
    featPath = cfg.databasePath() + '/Features'
    # np.save below does not create directories, so make sure it exists.
    if not os.path.exists(featPath):
        os.makedirs(featPath)
    sift = cv2.xfeatures2d.SIFT_create()
    for file in os.listdir(imgPath):
        baseName = file.split('.')[0]
        if os.path.exists(featPath + '/' + baseName + '.npy'):
            continue
        img = cv2.imread(imgPath + '/' + file, cv2.IMREAD_GRAYSCALE)
        if img is None:
            print(file + ' skipped')
            continue
        _, desc = sift.detectAndCompute(img, None)
        if desc is None or len(desc.shape) < 2:
            print(file + ' skipped')
            continue
        # Normalize to unit L1 norm, cap entries at 0.2, then renormalize;
        # the cap mirrors the descriptor truncation in Lowe's SIFT paper
        # (which uses the L2 norm where this code uses the sum).
        desc = desc / np.sum(desc, axis=1)[:, np.newaxis]
        desc = np.clip(desc, None, 0.2)
        desc = desc / np.sum(desc, axis=1)[:, np.newaxis]
        np.save(featPath + '/' + baseName, desc, allow_pickle=False)
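A toy trace of the normalize, clip, renormalize step:

import numpy as np

d = np.array([[0.5, 0.3, 0.2]])
d = d / np.sum(d, axis=1)[:, np.newaxis]   # already unit L1 norm here
d = np.clip(d, None, 0.2)                  # -> [[0.2, 0.2, 0.2]]
d = d / np.sum(d, axis=1)[:, np.newaxis]   # -> [[1/3, 1/3, 1/3]]
print(d)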
Example #7
def downloadCovers():
	"""Download covers for the games present in the database."""
	coverPath = cfg.databasePath() + '/Covers/'
	dataPath = cfg.databasePath() + '/Games/'
	if not os.path.exists(coverPath):
		os.makedirs(coverPath)
	for file in os.listdir(dataPath):
		with open(dataPath + file, 'r') as inFile:
			data = json.load(inFile)
		try:
			url = 'https:' + data['cover']['url'].replace('thumb', 'cover_big')
			# Name the local file after the game id, keeping the remote
			# file's extension (covers are not always .jpg).
			localFile = file.split('.')[0] + '.' + url.split('.')[-1]
			if os.path.isfile(coverPath + localFile):
				print('Skipping {}'.format(data['name']))
			else:
				response = requests.get(url)
				if response.ok:
					with open(coverPath + localFile, 'wb') as outFile:
						outFile.write(response.content)
				else:
					print('Skipping {}: download failed.'.format(data['name']))
		except KeyError:
			print('Skipping {}: no cover.'.format(data['name']))
Example #8
def encodeData():
	"""Encode game metadata."""
	config = cfg.readConfig()
	esrbCode = config.get('Preprocessing', 'esrb').split(',')
	modesCode = config.get('Preprocessing', 'game_modes').split(',')
	genresCode = config.get('Preprocessing', 'genres').split(',')
	themesCode = config.get('Preprocessing', 'themes').split(',')
	modesCode = [int(x) for x in modesCode]
	genresCode = [int(x) for x in genresCode]
	themesCode = [int(x) for x in themesCode]
	dim2 = sum([len(esrbCode), len(modesCode), len(genresCode), len(themesCode)])
	with open(cfg.databasePath() + '/Sets.json', 'r') as setsFile:
		sets = json.load(setsFile)
	for curSet in sets:
		idList = sets[curSet]
		data = np.zeros((len(idList), dim2 + 1), dtype=int)
		data[:, 0] = idList
		y = np.zeros((len(idList), 2))
		y[:, 0] = idList
		for i in range(len(idList)):
			p = 1
			gamePath = cfg.databasePath() + '/Games/{}.json'.format(idList[i])
			with open(gamePath, 'r') as gameFile:
				gameData = json.load(gameFile)
			esrb = gameData['esrb']['rating'] if 'esrb' in gameData else 1
			# One-hot encode the rating; j avoids shadowing the row index i.
			data[i, p:p+len(esrbCode)] = \
				[1 if esrb - 1 == j else 0 for j in range(len(esrbCode))]
			p += len(esrbCode)
			modes = gameData['game_modes'] if 'game_modes' in gameData else []
			data[i, p:p+len(modesCode)] = encodeMultiLabel(modes, modesCode)
			p += len(modesCode)
			genres = gameData['genres'] if 'genres' in gameData else []
			data[i, p:p+len(genresCode)] = encodeMultiLabel(genres, genresCode)
			p += len(genresCode)
			themes = gameData['themes'] if 'themes' in gameData else []
			data[i, p:p+len(themesCode)] = encodeMultiLabel(themes, themesCode)
			y[i, 1] = gameData['aggregated_rating']
		np.savetxt(cfg.databasePath() + '/{}_data.csv'.format(curSet), data, fmt='%d')
		np.savetxt(cfg.databasePath() + '/{}_y.csv'.format(curSet), y, fmt='%.2f')
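encodeMultiLabel is called above but not defined in these examples; judging from its call sites, a minimal sketch of what it presumably does (a hypothetical reconstruction, not the original helper):

def encodeMultiLabel(labels, codes):
	"""Multi-hot encoding: 1 wherever a known code appears in labels."""
	return [1 if code in labels else 0 for code in codes]

print(encodeMultiLabel([1, 31], [1, 2, 31, 32]))  # [1, 0, 1, 0]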
Example #9
def vectorizeImages():
    """Transform game covers into a vector suitable for regression."""
    print('Extracting features')
    extractFeatures()
    print('Clustering features')
    if os.path.exists(cfg.databasePath() + '/Centers.csv'):
        print('Cluster centers found. Skipping.')
    else:
        clusterFeatures()
    print('Computing histograms')
    computeHistograms()
    print('Vectorizing')
    buildBagOfWords()
Example #10
def main():
	"""Script entry point."""
	if len(sys.argv) == 2:
		n = int(sys.argv[1])
	else:
		n = 10
	xTrain = db.load('rank_train', 'data')
	yTrain = db.load('rank_train', 'y')
	xTest = db.load('rank_test', 'data')
	for part in ['text', 'hist', 'img']:
		nTrain = db.load('rank_train', part)
		nTest = db.load('rank_test', part)
		# Keep only ids present in both matrices and sort both by id so the
		# rows line up before stacking; plain isin filtering would rely on
		# the two files sharing row order.
		iTrain = np.intersect1d(xTrain[:, 0], nTrain[:, 0])
		xTrain = xTrain[np.isin(xTrain[:, 0], iTrain)]
		nTrain = nTrain[np.isin(nTrain[:, 0], iTrain)]
		xTrain = xTrain[np.argsort(xTrain[:, 0])]
		nTrain = nTrain[np.argsort(nTrain[:, 0])]
		xTrain = np.hstack((xTrain, nTrain[:, 1:]))
		iTest = np.intersect1d(xTest[:, 0], nTest[:, 0])
		xTest = xTest[np.isin(xTest[:, 0], iTest)]
		nTest = nTest[np.isin(nTest[:, 0], iTest)]
		xTest = xTest[np.argsort(xTest[:, 0])]
		nTest = nTest[np.argsort(nTest[:, 0])]
		xTest = np.hstack((xTest, nTest[:, 1:]))
	# Realign the targets with the filtered, id-sorted design matrix.
	yTrain = yTrain[np.isin(yTrain[:, 0], xTrain[:, 0])]
	yTrain = yTrain[np.argsort(yTrain[:, 0])]
	gamesPath = cfg.databasePath() + '/Games/{}.json'
	names = []
	for i in range(xTest.shape[0]):
		with open(gamesPath.format(int(xTest[i, 0])), 'r') as gameFile:
			names.append(json.load(gameFile)['name'])
	model = SGDRegressor(max_iter=100)
	model.fit(xTrain[:, 1:], yTrain[:, 1])
	pTrain = model.predict(xTrain[:, 1:])
	eTrain = np.sqrt(mean_squared_error(yTrain[:, 1], pTrain))
	print('Training error: {}'.format(eTrain))
	pTest = model.predict(xTest[:, 1:])
	sortIdx = np.argsort(pTest)
	for i in range(1, n + 1):
		idx = sortIdx[-i]
		print('{:.2f}\t{}'.format(pTest[idx], names[idx]))
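SGDRegressor is sensitive to feature scale, and the word counts and normalized histogram columns here live on very different ranges; a common variant (an assumption, not part of the original script) standardizes inside a pipeline:

from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Hypothetical variant: scaling before SGD usually stabilizes convergence.
model = make_pipeline(StandardScaler(), SGDRegressor(max_iter=100))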
Example #11
def load(setName, part):
    """Load one part (data, y, text, hist or img) of the specified set."""
    filename = cfg.databasePath() + '/{}_{}.csv'.format(setName, part)
    return np.loadtxt(filename)
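Typical usage, assuming the CSV files written by the steps above exist:

hist = load('train', 'hist')
print(hist.shape)  # column 0 holds the game ids, the rest the features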
Example #12
def vectorizeSummaries():
	"""Create word count matrix from game summaries."""
	with open(cfg.configPath() + '/Vocabulary.txt', 'r') as inFile:
		vocab = [x.rstrip() for x in inFile]
	config = cfg.readConfig()
	params = config['Text']
	vectorizer = CountVectorizer(**params)
	vectorizer.set_params(vocabulary=vocab)
	with open(cfg.databasePath() + '/Sets.json', 'r') as setsFile:
		sets = json.load(setsFile)
	# The five per-set blocks were identical up to the set name, so handle
	# them in one loop; 'train' comes first so fit() runs before any set is
	# transformed (with a fixed vocabulary it only finalizes the mapping).
	for setName in ['train', 'valid', 'test', 'rank_train', 'rank_test']:
		ids = []
		summaries = []
		for gameId in sets[setName]:
			gamePath = cfg.databasePath() + '/Games/{}.json'.format(gameId)
			with open(gamePath, 'r') as inFile:
				gameData = json.load(inFile)
			if 'summary' in gameData:
				ids.append(gameId)
				summaries.append(gameData['summary'])
		if setName == 'train':
			vectorizer.fit(summaries)
		data = np.zeros((len(ids), 1 + len(vectorizer.get_feature_names())))
		data[:, 0] = ids
		data[:, 1:] = vectorizer.transform(summaries).todense()
		np.savetxt(cfg.databasePath() + '/{}_text.csv'.format(setName), data, fmt='%d')
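A compatibility note: get_feature_names() was removed in scikit-learn 1.2 in favor of get_feature_names_out(); a version-tolerant helper (hypothetical name) would be:

def featureNames(vectorizer):
	"""Return the vocabulary terms across scikit-learn versions."""
	if hasattr(vectorizer, 'get_feature_names_out'):
		return list(vectorizer.get_feature_names_out())
	return vectorizer.get_feature_names()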