def svmTrain(datafile, featureNum, fold=10):
    """Train a linear SVM on the dataset at *datafile* and return test accuracy.

    At most *featureNum* columns of the count matrix are used (capped at the
    number of columns actually present); the regularization strength C is
    grid-searched by cross_validation over *fold* folds.
    """
    train, test = loaddata(datafile)
    row, col = train['counts'].shape
    # Cannot select more features than the data provides.
    if col < featureNum:
        featureNum = col
    X_train = train['counts'][:, 0:featureNum]
    y_train = train['labels'][0, :]
    X_test = test['counts'][:, 0:featureNum]
    y_test = test['labels'][0, :]
    # Only C is tuned for the linear kernel.
    tuned_parameters = [{'C': [1, 10, 100, 1000, 10000]}]
    model = LinearSVC(dual=True, tol=1e-3)
    categories = train['category']
    feature_names = np.array([k.strip() for k in train['feature_names']])
    data = [
        X_train, y_train, X_test, y_test, categories, feature_names,
        featureNum, model, tuned_parameters, fold
    ]
    clf, accuracy = cross_validation(*data)
    return accuracy
def NBTrain(datafile, featureNum, fold=10):
    """Train a Multinomial Naive Bayes classifier and return test accuracy.

    At most *featureNum* columns of the count matrix are used; the smoothing
    parameter alpha is grid-searched by cross_validation over *fold* folds.
    Also prints the highest-weighted features for each category.
    """
    train, test = loaddata(datafile)
    row, col = train['counts'].shape
    # Cannot select more features than the data provides.
    if col < featureNum:
        featureNum = col
    X_train = train['counts'][:, 0:featureNum]
    y_train = train['labels'][0, :]
    X_test = test['counts'][:, 0:featureNum]
    y_test = test['labels'][0, :]
    tuned_parameters = [{'alpha': [0.01, 0.05, 1, 2, 5]}]
    model = MultinomialNB(fit_prior=True)
    categories = train['category']
    feature_names = np.array([k.strip() for k in train['feature_names']])
    data = [
        X_train, y_train, X_test, y_test, categories, feature_names,
        featureNum, model, tuned_parameters, fold
    ]
    clf, accuracy = cross_validation(*data)
    # Print the top-weighted features per category.
    # NOTE(review): MultinomialNB.coef_ was removed in recent scikit-learn
    # versions; feature_log_prob_ may be required — confirm against the
    # installed version.
    for c in range(len(categories)):
        index = np.argsort(clf.best_estimator_.coef_[c])
        if len(index) >= featureNum:
            topfeatures = index[-1 * featureNum:]
        else:
            topfeatures = index
        print('%s:%s' % (categories[c], ' '.join(feature_names[topfeatures])))
    return accuracy
def svmTrain(datafile, featureNum, fold=10):
	"""Train a linear SVM on the dataset at *datafile* and return test accuracy.

	At most *featureNum* columns of the count matrix are used (capped at the
	number of columns actually present); the regularization strength C is
	grid-searched by cross_validation over *fold* folds.
	"""
	train, test = loaddata(datafile)
	row, col = train['counts'].shape
	# Cannot select more features than the data provides.
	if col < featureNum:
		featureNum = col
	X_train = train['counts'][:, 0:featureNum]
	y_train = train['labels'][0, :]
	X_test = test['counts'][:, 0:featureNum]
	y_test = test['labels'][0, :]
	# Only C is tuned for the linear kernel.
	tuned_parameters = [{'C': [1, 10, 100, 1000, 10000]}]
	model = LinearSVC(dual=True, tol=1e-3)
	categories = train['category']
	feature_names = np.array([k.strip() for k in train['feature_names']])
	data = [X_train, y_train, X_test, y_test, categories, feature_names,
			featureNum, model, tuned_parameters, fold]
	clf, accuracy = cross_validation(*data)
	return accuracy
def preprocess(prefix, totalfeatures):
	"""Run chi-square feature selection and persist the reduced datasets.

	Writes '<prefix>_chi_train.mat' and '<prefix>_chi_test.mat' and returns
	the new prefix '<prefix>_chi'.
	"""
	train, test = loaddata(prefix)
	selected = featureselection((train, test), totalfeatures)
	X_train, X_test, feature_names = selected

	train['counts'] = X_train
	train['feature_names'] = feature_names
	io.savemat(prefix + '_chi_train.mat', train)

	test['counts'] = X_test
	test['feature_names'] = feature_names
	print(feature_names)
	io.savemat(prefix + '_chi_test.mat', test)
	return prefix + '_chi'
def preprocess(prefix, totalfeatures):
    """Apply chi-square feature selection and save the reduced train/test sets.

    The reduced matrices and selected feature names replace the originals in
    the loaded dicts, which are written to '<prefix>_chi_train.mat' and
    '<prefix>_chi_test.mat'. Returns the new prefix '<prefix>_chi'.
    """
    train, test = loaddata(prefix)
    X_train, X_test, feature_names = featureselection((train, test),
                                                      totalfeatures)
    out = prefix + '_chi'

    train['counts'] = X_train
    train['feature_names'] = feature_names
    io.savemat(out + '_train.mat', train)

    test['counts'] = X_test
    test['feature_names'] = feature_names
    print(feature_names)
    io.savemat(out + '_test.mat', test)
    return out
def knnTrain(datafile, featureNum, fold=10):
	"""Train a k-nearest-neighbors classifier and return test accuracy.

	At most *featureNum* columns of the count matrix are used; the neighbor
	count is grid-searched by cross_validation over *fold* folds.
	"""
	train, test = loaddata(datafile)
	row, col = train['counts'].shape
	# Cannot select more features than the data provides.
	if col < featureNum:
		featureNum = col
	X_train = train['counts'][:, 0:featureNum]
	y_train = train['labels'][0, :]
	X_test = test['counts'][:, 0:featureNum]
	y_test = test['labels'][0, :]
	tuned_parameters = [{'n_neighbors': [2, 3, 4, 6, 10, 15, 18, 20, 30, 40, 50]}]
	model = knn(n_neighbors=1)
	categories = train['category']
	feature_names = np.array([k.strip() for k in train['feature_names']])
	data = [X_train, y_train, X_test, y_test, categories, feature_names,
			featureNum, model, tuned_parameters, fold]
	clf, accuracy = cross_validation(*data)
	return accuracy
def knnTrain(datafile, featureNum, fold=10):
    """Train a k-nearest-neighbors classifier and return test accuracy.

    At most *featureNum* columns of the count matrix are used; the neighbor
    count is grid-searched by cross_validation over *fold* folds.
    """
    train, test = loaddata(datafile)
    row, col = train['counts'].shape
    # Cannot select more features than the data provides.
    if col < featureNum:
        featureNum = col
    X_train = train['counts'][:, 0:featureNum]
    y_train = train['labels'][0, :]
    X_test = test['counts'][:, 0:featureNum]
    y_test = test['labels'][0, :]
    tuned_parameters = [{
        'n_neighbors': [2, 3, 4, 6, 10, 15, 18, 20, 30, 40, 50]
    }]
    model = knn(n_neighbors=1)
    categories = train['category']
    feature_names = np.array([k.strip() for k in train['feature_names']])
    data = [
        X_train, y_train, X_test, y_test, categories, feature_names,
        featureNum, model, tuned_parameters, fold
    ]
    clf, accuracy = cross_validation(*data)
    return accuracy
Пример #8
0
def pca_pipeline(prefix, n_components, model):
    """Reduce the data to *n_components* principal components, save the
    transformed train/test sets, and train *model* on them.

    Returns the accuracy reported by *model*.
    """
    train, test = loaddata(prefix)
    # Fix: the original called pca_analysis twice and discarded the first
    # (expensive) result; a single call suffices, assuming pca_analysis is
    # side-effect free.
    X_train, X_test, pcs = pca_analysis(train['counts'], test['counts'],
                                        n_components)
    train['counts'] = X_train
    test['counts'] = X_test

    # Each principal component becomes its own pseudo-category 'pc_<i>'.
    categories = ['pc_' + str(i) for i in range(n_components)]
    train['category'] = categories
    test['category'] = categories

    outprefix = prefix + '_pca'
    io.savemat(outprefix + '_train.mat', train)
    io.savemat(outprefix + '_test.mat', test)
    accuracy = model(outprefix, n_components)
    return accuracy
def pca_pipeline(prefix, n_components, model):
	"""Reduce the data to *n_components* principal components, save the
	transformed train/test sets, and train *model* on them.

	Returns the accuracy reported by *model*.
	"""
	train, test = loaddata(prefix)
	# Fix: the original called pca_analysis twice and discarded the first
	# (expensive) result; a single call suffices, assuming pca_analysis is
	# side-effect free.
	X_train, X_test, pcs = pca_analysis(train['counts'], test['counts'],
										n_components)
	train['counts'] = X_train
	test['counts'] = X_test

	# Each principal component becomes its own pseudo-category 'pc_<i>'.
	categories = ['pc_' + str(i) for i in range(n_components)]
	train['category'] = categories
	test['category'] = categories

	outprefix = prefix + '_pca'
	io.savemat(outprefix + '_train.mat', train)
	io.savemat(outprefix + '_test.mat', test)
	accuracy = model(outprefix, n_components)
	return accuracy
def NBTrain(datafile, featureNum, fold=10):
	"""Train a Multinomial Naive Bayes classifier and return test accuracy.

	At most *featureNum* columns of the count matrix are used; the smoothing
	parameter alpha is grid-searched by cross_validation over *fold* folds.
	Also prints the highest-weighted features for each category.
	"""
	train, test = loaddata(datafile)
	row, col = train['counts'].shape
	# Cannot select more features than the data provides.
	if col < featureNum:
		featureNum = col
	X_train = train['counts'][:, 0:featureNum]
	y_train = train['labels'][0, :]
	X_test = test['counts'][:, 0:featureNum]
	y_test = test['labels'][0, :]
	tuned_parameters = [{'alpha': [0.01, 0.05, 1, 2, 5]}]
	model = MultinomialNB(fit_prior=True)
	categories = train['category']
	feature_names = np.array([k.strip() for k in train['feature_names']])
	data = [X_train, y_train, X_test, y_test, categories, feature_names,
			featureNum, model, tuned_parameters, fold]
	clf, accuracy = cross_validation(*data)
	# Print the top-weighted features per category.
	# NOTE(review): MultinomialNB.coef_ was removed in recent scikit-learn
	# versions; feature_log_prob_ may be required — confirm against the
	# installed version.
	for c in range(len(categories)):
		index = np.argsort(clf.best_estimator_.coef_[c])
		if len(index) >= featureNum:
			topfeatures = index[-1 * featureNum:]
		else:
			topfeatures = index
		print('%s:%s' % (categories[c], ' '.join(feature_names[topfeatures])))
	return accuracy
Пример #11
0
#		print(choice.shape,remaining.shape,split.shape)
		# Accumulate this iteration's rows into the running labeled/unlabeled
		# splits. `first` is presumably initialised to False before the loop
		# (the def line is outside this view — confirm): the else-branch
		# initialises the outputs on the first pass and flips the flag, after
		# which every later pass stacks onto them. vstack here looks like
		# scipy.sparse.vstack (np.concatenate is used for the 1-D label
		# vectors) — TODO confirm against the file's imports.
		if first:
			#labeled = np.concatenate((labeled,data[choice,:]),axis = 0)
			labeled = vstack((labeled,data[choice,:]))
			unlabeled =vstack((unlabeled,data[remaining,:]))
			y_labeled = np.concatenate((y_labeled,label[choice]))
			y_unlabeled = np.concatenate((y_unlabeled,label[remaining]))
		else:
			labeled = data[choice,:]
			unlabeled = data[remaining,:]
			y_labeled = label[choice]
			y_unlabeled = label[remaining]
			first=True
	# Returns ((labeled_data, labeled_labels), (unlabeled_data, unlabeled_labels)).
	return ((labeled,y_labeled),(unlabeled,y_unlabeled))
		
if __name__ == '__main__':
	# Fix the RNG so the labeled/unlabeled split below is reproducible.
	np.random.seed(511)
	snb = SemiNB()
	prefix = '../features/bagofword'
	# loaddata returns a (train, test) pair of dicts with 'counts' and
	# 'labels' entries.
	data = loaddata(prefix)
	print(data[0]['counts'].shape,data[1]['counts'].shape)
	# Hold out half of the training data as "unlabeled" for semi-supervised NB.
	labeled,unlabeled = splitDataByClass(data[0]['counts'],data[0]['labels'][0,:],0.5)
	td,delta =dataTransformation(labeled[0],labeled[1])
	print(td.shape,delta.shape)
	snb.train(td,delta)
	# NOTE(review): the test labels are passed as the full 2-D array here,
	# unlike the [0,:] row used for training — confirm this is intentional.
	test_td,test_delta = dataTransformation(data[1]['counts'],data[1]['labels'])
	print(test_td.shape)
	result = snb.predict_all(np.transpose(test_td)	)
	print(result)