Example #1
from copy import deepcopy
from sys import argv, platform
from time import clock, time  # time.clock was removed in Python 3.8

import numpy as np
from sklearn.cross_validation import StratifiedKFold  # older scikit-learn module path (pre-0.18 API)
from sklearn.datasets import load_svmlight_file
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.lda import LDA  # older scikit-learn; later renamed LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC


def main():
    f = open('PCA results.txt', 'w')

    channels = ('NDTV', 'TIMESNOW', 'CNNIBN', 'CNN', 'BBC')

    # Classifiers to evaluate; 'dense_X' marks estimators that need a dense
    # matrix (not consulted here, since the PCA output is already dense).
    learn_methods = (
        {'class': KNeighborsClassifier, 'name': 'kNN',
         'params': {'n_neighbors': 5}, 'dense_X': False},
        {'class': LDA, 'name': 'LDA', 'params': {}, 'dense_X': True},
        {'class': SVC, 'name': 'SVM', 'params': {}, 'dense_X': False},
        {'class': RandomForestClassifier, 'name': 'Random forest',
         'params': {'n_estimators': 50}, 'dense_X': False},
        {'class': GradientBoostingClassifier, 'name': 'Gradient tree boosting',
         'params': {'n_estimators': 100}, 'dense_X': True},
    )

    # Target dimensionalities for PCA: 10, 20, ..., 200, plus 227.
    pca_n_features = [i * 10 for i in range(1, 21)]
    pca_n_features.append(227)

    # time.clock has better resolution on Windows; time.time elsewhere.
    timer = clock if platform == 'win32' else time

    for channel in channels:
        XBig, y = load_svmlight_file('../../Dataset/{}.txt'.format(channel))
        XBig = VarianceThreshold().fit_transform(XBig)  # drop constant features
        print('Loaded {} dataset...'.format(channel))

        reduction_rates = []
        ps_times = []
        scores = {'kNN': [], 'LDA': [], 'SVM': [], 'Random forest': [],
                  'Gradient tree boosting': []}
        train_times = deepcopy(scores)

        Xs_reduced = []

        # Project the dataset onto each target dimensionality, timing each fit.
        print('Starting PCA...')
        for n_features in pca_n_features:
            start_time = timer()
            pca = PCA(n_components=n_features)
            Xs_reduced.append(pca.fit_transform(XBig.toarray()))
            end_time = timer()
            ps_times.append(end_time - start_time)
        print('PCA finished')

        for X in Xs_reduced:
            print('Model dimension = {}'.format(X.shape[1]))
            f.write('Model dimension = {}\n'.format(X.shape[1]))

            # 10-fold stratified cross-validation (older API: takes y and n_folds).
            for train, test in StratifiedKFold(y, 10):
                X_train, X_test, y_train, y_test = (X[train], X[test],
                                                    y[train], y[test])

                # Disabled alternative: FCNN prototype selection on the training fold.
                # X_train = X_train.toarray()
                # X_test = X_test.toarray()
                # print('Selecting prototypes...')
                # start_time = timer()
                # ps = PrototypeSelector(X_train, y_train.astype(np.int))
                # X_train_red, y_train_red = ps.fcnn_reduce(int(argv[1]))
                # end_time = timer()
                # ps_times.append(end_time - start_time)
                # reduction_rates.append(X_train_red.shape[0] / X_train.shape[0])
                # print('{}% of {} instances selected in {} s.'.format(
                #     100 * reduction_rates[-1], X_train.shape[0], ps_times[-1]))

                for method in learn_methods:
                    method_class = method['class']
                    method_params = method['params']
                    print('Testing with {}...'.format(method['name']))

                    clf = method_class(**method_params)
                    start_time = timer()
                    clf.fit(X_train, y_train)
                    end_time = timer()
                    train_times[method['name']].append(end_time - start_time)

                    scores[method['name']].append(clf.score(X_test, y_test))

            # Report mean and variance of accuracy and training time per classifier.
            for method in learn_methods:
                mean_score = np.mean(scores[method['name']])
                score_variance = np.var(scores[method['name']])
                mean_train_time = np.mean(train_times[method['name']])
                train_time_variance = np.var(train_times[method['name']])

                print('{}, {}: Q = {}±{}, Ttr = {}±{}'.format(
                    channel, method['name'], mean_score, score_variance,
                    mean_train_time, train_time_variance))

                f.write('{}, {}: Q = {}±{}, Ttr = {}±{}\n'.format(
                    channel, method['name'], mean_score, score_variance,
                    mean_train_time, train_time_variance))

    f.close()


if __name__ == '__main__':
    main()
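The example above relies on module paths and timing functions that have since been removed from scikit-learn and the standard library (sklearn.cross_validation, sklearn.lda.LDA, time.clock). A minimal sketch of the same cross-validated evaluation against the current scikit-learn API, using placeholder data in place of the PCA-reduced matrices, could look like this:

# Sketch only: modern equivalents of the deprecated APIs used above,
# with random placeholder data standing in for one PCA-reduced dataset.
import numpy as np
from time import perf_counter                                   # replaces time.clock / time.time
from sklearn.model_selection import StratifiedKFold             # replaces sklearn.cross_validation
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis  # replaces sklearn.lda.LDA

X = np.random.rand(100, 20)                # placeholder feature matrix
y = np.random.randint(0, 2, size=100)      # placeholder labels

clf = LinearDiscriminantAnalysis()
scores, train_times = [], []
for train, test in StratifiedKFold(n_splits=10).split(X, y):    # split() now takes X and y explicitly
    start_time = perf_counter()
    clf.fit(X[train], y[train])
    train_times.append(perf_counter() - start_time)
    scores.append(clf.score(X[test], y[test]))
print('Q = {}±{}, Ttr = {}±{}'.format(np.mean(scores), np.var(scores),
                                      np.mean(train_times), np.var(train_times)))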