Exemplo n.º 1
0
def predict_testdata(cat_info, dataroot, top_n=5):
	"""Predict the most probable SKUs for every row of a test CSV file.

	Args:
		cat_info: mapping from category value to a dict holding
			'sku_info' (passed to get_test_featuresets) and 'cls' (a fitted
			classifier exposing ``classes_`` and ``predict_proba``).
		dataroot: path (or anything ``pd.read_csv`` accepts) of the test
			data; it must contain a 'category' column.
		top_n: maximum number of SKUs to return per row (default 5).

	Returns:
		A list with one entry per test row; each entry is a list of class
		labels ordered from most to least probable.

	Raises:
		KeyError: if a row's category has no 'sku_info' entry in cat_info.
	"""
	testdat = pd.read_csv(dataroot)
	skulist = []

	for i in range(len(testdat)):
		# Slice instead of indexing so the single row stays a DataFrame.
		dat = testdat.iloc[i:i + 1]
		cat = dat['category'].iloc[0]
		catdat = preprocess(dat)

		try:
			catdic = cat_info[cat]['sku_info']
		except KeyError:
			print("Category %s is unseen!" % str(cat))
			# Re-raise the original error so the missing key and the
			# traceback are preserved for the caller.
			raise

		testfsets = get_test_featuresets(catdat, catdic)
		cls = cat_info[cat]['cls']
		yclasses = cls.classes_
		yall = cls.predict_proba(testfsets)
		# Negate so argsort's ascending order yields highest probability first.
		ysort = np.argsort(-yall)

		if ysort.ndim > 1:
			# Slice bounds are clamped to the axis length, so this is safe
			# even when there are fewer than top_n classes.
			ybest = ysort[:, :top_n]
		else:
			# Non-2-D output cannot take a column slice; keep every index.
			ybest = ysort

		yout = yclasses[ybest]
		skulist.append(yout.flatten().tolist())

	return skulist
Exemplo n.º 2
0
def main():
	"""Train one naive Bayes classifier per category and print predictions.

	Reads the training CSV grouped by category, fits a MultinomialNB model
	for each group, runs predict_testdata on the test CSV, prints the
	resulting SKU lists and finally prints the elapsed time.
	"""
	t_begin = timeit.default_timer()

	# ---- load the training data, grouped by category ----
	print("read train data")
	train_path = "../data/train.csv"
	grouped = groupByCat(train_path)

	# ---- per category: preprocess, build features, train the classifier ----
	cat_info = {}
	for cat in grouped.keys():
		prepared = preprocess(grouped[cat])
		sku_info, fset, skus = getFeatureSet(prepared)

		model = naive_bayes.MultinomialNB(alpha=0.1)
		model.fit(fset, skus)

		# Everything predict_testdata needs for this category.
		cat_info[cat] = {'sku_info': sku_info, 'cls': model}

	# ---- predict on the test data ----
	print("read test data")
	test_path = "../data/test_part.csv"
	predictions = predict_testdata(cat_info, test_path)
	print(predictions)

	# ---- report elapsed time as measured by timeit.default_timer ----
	t_end = timeit.default_timer()
	print("time is %s" % (t_end - t_begin,))