Exemplo n.º 1
0
def run_classifier(X_train, y_train, X_test, y_test, clf_name, num_trees):
    """Train the named classifier and score it on the test split.

    Args:
        X_train: Training feature matrix, passed to the estimator's ``fit``.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: True test labels.
        clf_name: One of ``"BernoulliNB"``, ``"GaussianNB"``, ``"RF"``,
            ``"SVM"`` or ``"KNN"``.
        num_trees: Number of estimators; only used when ``clf_name == "RF"``.

    Returns:
        A tuple ``(accuracy, roc_auc, precision, recall, f1)``.

    Raises:
        ValueError: If ``clf_name`` is not one of the supported names.
    """
    n_neighbors = 5  # same value the original noted as the library default

    # Factories are lazy so only the requested estimator is constructed.
    builders = {
        "BernoulliNB": lambda: BernoulliNB(),
        "GaussianNB": lambda: GaussianNB(),
        "RF": lambda: RandomForestClassifier(n_estimators=num_trees),
        "SVM": lambda: SVC(cache_size=2000, probability=False),
        "KNN": lambda: neighbors.KNeighborsClassifier(
            n_neighbors=n_neighbors, n_jobs=-1),
    }
    if clf_name not in builders:
        # Previously an unknown name fell through to ''.predict(...) and
        # surfaced as a confusing AttributeError.
        raise ValueError(
            "Unrecognized classifier {!r}; expected one of {}".format(
                clf_name, sorted(builders)))

    clf = builders[clf_name]().fit(X_train, y_train)
    predicted = clf.predict(X_test)

    if clf_name == "SVM":
        # The SVC is built with probability=False, so rank with the raw
        # decision_function scores instead of class probabilities.
        scores = clf.decision_function(X_test)
    else:
        # Column 1 holds the score of the second (positive) class.
        scores = clf.predict_proba(X_test)[:, 1]

    accu = accuracy_score(y_test, predicted)
    roc = roc_auc_score(y_test, scores)
    pos_presicion = precision_score(y_test, predicted)
    pos_recall = recall_score(y_test, predicted)
    pos_f1 = f1_score(y_test, predicted)
    print("Correctly Classified: {}".format(accu))
    print(classification_report(y_test, predicted, digits=4))

    return accu, roc, pos_presicion, pos_recall, pos_f1
Exemplo n.º 2
0
def main():
	"""Train a binary tweet classifier, validate it, and test it.

	Command-line options select the training CSV, the business JSON-lines
	file, the classifier type ('nb', 'log' or 'svm'), an optional number of
	top features to print, and an optional test CSV.

	The training CSV is read with the label in column 1 and the text in
	column 2; the test CSV additionally carries a business id in column 0.
	When a test set is given and the classifier is not 'svm', the function
	plots a ROC curve and writes 'positive_bias.csv' and 'negative_bias.csv'
	with the name and address of the businesses behind the selected rows.

	Raises:
		Exception: If the classifier option is not one of the known names.
	"""
	##### DO NOT MODIFY THESE OPTIONS ##########################
	parser = argparse.ArgumentParser()
	parser.add_argument('-training', required=True, help='Path to training data')
	parser.add_argument('-business_file', required=True, help='Path to business data')
	parser.add_argument('-c', '--classifier', default='nb', help='nb | log | svm')
	parser.add_argument('-top', type=int, help='Number of top features to show')
	parser.add_argument('-test', help='Path to test data')
	opts = parser.parse_args()
	############################################################

	##### BUILD TRAINING SET ###################################
	tokenizer = Tokenizer()
	vectorizer = CountVectorizer(binary=True, lowercase=True, decode_error='replace', tokenizer=tokenizer)

	# Load training text and integer labels; the file is closed as soon as
	# the rows have been consumed.
	tweets = []
	lable = []
	with open(opts.training) as csv_file:
		for line in csv.reader(csv_file):
			tweets.append(line[2])
			lable.append(int(line[1]))

	# Binary bag-of-words features and labels as a numpy array.
	vocabulary = vectorizer.fit_transform(tweets)
	lable = np.array(lable)
	############################################################

	##### TRAIN THE MODEL ######################################
	if opts.classifier == 'nb':
		classifier = BernoulliNB(binarize=None)
	elif opts.classifier == 'log':
		classifier = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None)
	elif opts.classifier == 'svm':
		# NOTE(review): newer scikit-learn releases may expect
		# loss='squared_hinge' here - confirm against the installed version.
		classifier = LinearSVC(penalty='l2', loss='l2', dual=True, tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None)
	else:
		raise Exception('Unrecognized classifier!')
	classifier.fit(vocabulary, lable)
	############################################################

	###### VALIDATE THE MODEL ##################################
	# Mean accuracy on the training data itself.
	print("Training accuracy: %f" % classifier.score(vocabulary, lable))
	# 10-fold cross validation: mean and standard deviation of the accuracy.
	scores = cross_validation.cross_val_score(classifier, vocabulary, lable, scoring='accuracy', cv=10)
	print("Cross-Validation Accuracy: %f (+/- %f)" % (scores.mean(), scores.std()))
	############################################################

	##### EXAMINE THE MODEL ####################################
	if opts.top is not None:
		# Single-argument print(...) behaves the same on Python 2 and 3.
		print('Most informative features')
		util.print_most_informative_features(opts.classifier, vectorizer, classifier, opts.top)
	############################################################

	##### TEST THE MODEL #######################################
	if opts.test is None:
		# No test set: classify one sample tweet.
		# Tim Kraska 10:43 AM - 5 Feb 13
		test_tweet = 'Water dripping from 3rd to 1st floor while the firealarm makes it hard to hear anything. BTW this is the 2nd leakage.  Love our new house'

		terms = vectorizer.transform([test_tweet])

		# Predicted label, then the per-label confidence.
		print(classifier.predict(terms))
		if opts.classifier != 'svm':
			print(classifier.predict_proba(terms))
		else:
			# The 'svm' branch reports decision_function scores instead.
			print(classifier.decision_function(terms))
	else:
		# Load the test set: business id, true label and text per row.
		test_tweets = []
		true_lable = []
		business = []
		with open(opts.test) as csv_file:
			for line in csv.reader(csv_file):
				business.append(line[0])
				test_tweets.append(line[2])
				true_lable.append(int(line[1]))
		terms = vectorizer.transform(test_tweets)
		true_lable = np.array(true_lable)
		predict_lable = classifier.predict(terms)

		# Fraction of matching labels. np.mean always yields a float, whereas
		# the previous int / int division truncated to 0 or 1 on Python 2.
		accuracy = np.mean(true_lable == predict_lable)
		print("Test accuracy: %f" % accuracy)

		if opts.classifier != 'svm':
			test_predicted_proba = classifier.predict_proba(terms)
			util.plot_roc_curve(true_lable, test_predicted_proba)

			# Rows are [row index, P(class 0), P(class 1)], split by true label.
			positive_prob = []
			negative_prob = []
			for i, truth in enumerate(true_lable):
				row = [i, test_predicted_proba[i][0], test_predicted_proba[i][1]]
				if truth == 1:
					positive_prob.append(row)
				else:
					negative_prob.append(row)

			# Both lists are ordered by the class-0 probability: the 100
			# true positives with the highest value and the 100 true
			# negatives with the lowest value are kept.
			positive_bias = sorted(positive_prob, key=itemgetter(1), reverse=True)[0:100]
			negative_bias = sorted(negative_prob, key=itemgetter(1))[0:100]

			# Map business id -> [name, full address] from the JSON-lines file.
			bdic = {}
			with open(opts.business_file, 'r') as bfile:
				for line in bfile:
					record = json.loads(line)
					bdic[record['business_id']] = [record['name'], record['full_address']]

			# Context managers make sure the rows are flushed and the files
			# are closed even if a business id is missing from bdic.
			with open('positive_bias.csv', 'w') as positive:
				writer_positive = csv.writer(positive)
				for item in positive_bias:
					name, address = bdic[business[item[0]]]
					writer_positive.writerow((name, address))
			with open('negative_bias.csv', 'w') as negative:
				writer_negative = csv.writer(negative)
				for item in negative_bias:
					name, address = bdic[business[item[0]]]
					writer_negative.writerow((name, address))
		'''
        clf = GradientBoostingClassifier(n_estimators=5, random_state=0)
        fs_train = fs_train.toarray()
        fs_test = fs_test.toarray()

    if config.SELF_TRAINING:
        fl = fs_train.shape[0]
        ll = labels_train.shape[0]
        fsarr = fs_train.toarray()
        cur_fs = fsarr[:fl / 10]
        cur_labels = labels_train[:ll / 10]

        clf.fit(cur_fs, cur_labels)
        print clf.classes_
        for i in range(1, 10):
            new_fs = fsarr[(i * fl) / 10:((i + 1) * fl) / 10]
            confidence_scores = clf.decision_function(new_fs)
            most_confident_samples = confidence_scores.max(
                axis=1).argsort()[-1 * (confidence_scores.shape[0] / 10):]
            most_confident_labels = confidence_scores[
                most_confident_samples].argmax(axis=1)
            cur_fs = np.append(cur_fs, new_fs[most_confident_samples], axis=0)
            cur_labels = np.append(cur_labels,
                                   clf.classes_[most_confident_labels])
            clf.fit(cur_fs, cur_labels)
        pred = clf.predict(fs_test)

    else:
        clf.fit(fs_train, labels_train)
        pred = clf.predict(fs_test)

    if grid_search:
Exemplo n.º 4
0
        fs_train = fs_train.toarray()
        fs_test = fs_test.toarray()


    if config.SELF_TRAINING:
        fl = fs_train.shape[0]
        ll = labels_train.shape[0]
        fsarr = fs_train.toarray()
        cur_fs = fsarr[:fl / 10]
        cur_labels = labels_train[:ll / 10]

        clf.fit(cur_fs, cur_labels)
        print clf.classes_
        for i in range(1, 10):
            new_fs = fsarr[(i * fl) / 10:((i + 1) * fl) / 10]
            confidence_scores = clf.decision_function(new_fs)
            most_confident_samples = confidence_scores.max(axis=1).argsort()[
                -1 * (confidence_scores.shape[0] / 10):]
            most_confident_labels = confidence_scores[most_confident_samples].argmax(axis=1)
            cur_fs = np.append(cur_fs, new_fs[most_confident_samples], axis=0)
            cur_labels = np.append(cur_labels, clf.classes_[most_confident_labels])
            clf.fit(cur_fs, cur_labels)
        pred = clf.predict(fs_test)


    else:
        clf.fit(fs_train, labels_train)
        pred = clf.predict(fs_test)

    if grid_search:
        print clf.best_estimator_
Exemplo n.º 5
0
def main():
	"""Train a text classifier on labelled blog posts, validate and test it.

	Command-line options select the training CSV, an optional test CSV, the
	classifier type ('nb', 'log', 'svm', 'rf' or 'knn'), an optional number
	of top features to print, and the number of trees for the random forest
	(10 when not given).

	Each training row is read as label in column 0 and text in column 1;
	each test row as label in column 0 and text in the last column.

	Raises:
		Exception: If the classifier option is not one of the known names.
	"""
	parser = argparse.ArgumentParser()
	parser.add_argument('-training', required=True, help='Path to training data')
	parser.add_argument('-test', help='Path to test data')
	parser.add_argument('-c', '--classifier', default='nb', help='nb | log | svm')
	parser.add_argument('-top', type=int, help='Number of top features to show')
	parser.add_argument('-trees',type=int,help="Number of trees (if random forest for classifier)")
	opts = parser.parse_args()

	##### BUILD TRAINING SET ###################################
	vectorizer = CountVectorizer(binary=True, lowercase=True, decode_error='replace')

	# Plain text mode: the former 'rU' mode is rejected by Python 3.11+.
	with open(opts.training) as f:
		train_data = list(csv.reader(f))

	# Labels are converted to integers so the label data stays numeric.
	train_text = [blog[1] for blog in train_data]
	train_labels = numpy.array([int(blog[0]) for blog in train_data], dtype=int)

	print("ready to vectorize training data")
	train_features = vectorizer.fit_transform(train_text)
	print("done vectorizing")
	############################################################


	##### TRAIN THE MODEL ######################################
	if opts.classifier == 'nb':
		classifier = BernoulliNB(binarize=None)
		print("Naive Bayes")
	elif opts.classifier == 'log':
		classifier = LogisticRegression(C=.088)
		print("Log")
	elif opts.classifier == 'svm':
		classifier = LinearSVC()
		print("Support Vector Machine")
	elif opts.classifier == 'rf':
		trees = opts.trees if opts.trees else 10
		classifier = RandomForestClassifier(n_estimators=trees)
		# The random forest is fed dense arrays throughout this function.
		train_features = train_features.toarray()
	elif opts.classifier == 'knn':
		classifier = KNeighborsClassifier(n_neighbors=10)
	else:
		raise Exception('Unrecognized classifier!')
	classifier.fit(train_features,train_labels)
	############################################################


	###### VALIDATE THE MODEL ##################################
	# Training mean accuracy, then 10-fold cross-validated accuracy.
	print(classifier.score(train_features,train_labels))
	scores = cross_validation.cross_val_score(classifier,train_features,train_labels,cv=10,scoring='accuracy')
	print("Cross Validation Scores Calculated")
	print(scores)
	print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))
	############################################################


	##### EXAMINE THE MODEL ####################################
	if opts.top is not None:
		print("Got "+str(opts.top)+" tops")

		# Top n most informative features for positive and negative classes.
		util.print_most_informative_features(opts.classifier, vectorizer, classifier, opts.top)
	############################################################


	##### TEST THE MODEL #######################################
	if opts.test is None:
		# No test set: classify one built-in sample post.
		test_blog = "uses yahoo boss support search experience general web search perform query application set term candidates using key terms term its within result set its global measure similar 1ST_PERSON former colleagues 1ST_PERSON enterprise try yourself URL rough edges produces considering example 1ST_PERSON application explore learn 1ST_PERSON started 1ST_PERSON term 1ST_PERSON suggestions looked name caught 1ST_PERSON following 1ST_PERSON 1ST_PERSON again results 1ST_PERSON immediately had document further made clear someone 1ST_PERSON get home can_t you_ll experience 1ST_PERSON did 1ST_PERSON encourage"
		features = vectorizer.transform([test_blog])

		if opts.classifier == 'rf':
			features = features.toarray()

		print("Prediction (1 == correct): ")
		print(classifier.predict(features))
		# Per-label confidence: probabilities, or decision scores for 'svm'.
		if opts.classifier != 'svm':
			print("User predict prob ")
			print(classifier.predict_proba(features))
		else:
			print("use decision ")
			print(classifier.decision_function(features))

	else:
		# Text mode, matching the training read: csv.reader needs str rows,
		# which a file opened with 'rb' does not provide on Python 3.
		with open(opts.test) as f:
			test_data = list(csv.reader(f))

		test_text = [blog[-1] for blog in test_data]
		test_labels = numpy.array([int(blog[0]) for blog in test_data], dtype=int)

		print("ready to vectorize testing data")
		test_features = vectorizer.transform(test_text)
		if opts.classifier == 'rf':
			# Keep the test features dense like the training features and
			# the single-sample features above.
			test_features = test_features.toarray()

		# Test mean accuracy.
		print("Score")
		print(classifier.score(test_features,test_labels))

		predictions = classifier.predict(test_features)

		print("Classification report")
		print(classification_report(test_labels,predictions))
		print("Classifier uses: Confusion!")
		print(confusion_matrix(test_labels,predictions))
		print("It's super effective!")

		if opts.classifier != 'svm':
			print("Predicted Probability")
			test_predicted_proba = classifier.predict_proba(test_features)
			util.plot_roc_curve(test_labels, test_predicted_proba)
		else:
			# For 'svm', plot the raw decision scores against the sample index.
			print("Decision Function")
			decisions = classifier.decision_function(test_features)
			x = numpy.arange(0,len(decisions),1)
			plt.plot(x,decisions)
			plt.show()