import sys

import nle_utils
import feature_extractors as fe  # assumed module name behind the `fe` alias used below

# Ensure Amazon readers are defined
list_of_readers = nle_utils.set_up_readers()

number_of_tests = 30
feature_extractor = fe.simple_feature_extractor_stopwords

# File the results will be written to
fo = open("N:\\Downloads\\NLE\\results_simple_classifier_test.txt", "wb")

for x in xrange(0, number_of_tests):
    sys.stdout.write("TEST_NUMBER:" + str(x) + ":")
    fo.write("TEST_NUMBER:" + str(x) + ":")

    # Split data into ((+ve training), (-ve training), (+ve testing, -ve testing))
    split_data = nle_utils.split_by_classification(list_of_readers, 0.8)
    # List - tuple - tuple - document
    # print type(split_data[0][0][0][0])

    i = 0  # index into the list of Amazon domain categories
    for domain_split in split_data:
        # print "\n for a domain " + nle_utils.list_of_amazon_categories[i] + ":"
        sys.stdout.write("DOMAIN:" + nle_utils.list_of_amazon_categories[i] + ":")
        fo.write("DOMAIN:" + nle_utils.list_of_amazon_categories[i] + ":")

        # Determine the frequency distribution of all words in the positive and
        # negative training data.
        fdists = nle_utils.calculate_training_freq_dists(domain_split, feature_extractor)

        ### DECISION: keep the top x most frequent words OR every word at least as frequent as x
        # wordlist_tuple = nle_utils.pos_neg_wordlist(fdists, nle_utils.words_as_frequent_as_x, 200)  # todo 200, 100
        wordlist_tuple = nle_utils.pos_neg_wordlist(fdists, nle_utils.top_x_most_frequent, 100)
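# --- Hedged sketch, not from the original source: one plausible shape for
# fe.simple_feature_extractor_stopwords, assuming it lowercases a review's
# tokens and drops NLTK's English stopwords before the frequency counting
# above. The signature (list of word tokens in, filtered list out) is an
# assumption, not the project's confirmed API. ---
from nltk.corpus import stopwords

def simple_feature_extractor_stopwords(document_words):
    """Return lowercased alphabetic tokens with English stopwords removed."""
    stop = set(stopwords.words('english'))
    return [w.lower() for w in document_words
            if w.isalpha() and w.lower() not in stop]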
import sys

import nle_utils
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy

feature_extraction = None  # The feature-extraction function to be applied

global cross_domain_testing_data  # Speed up cross-domain testing by not re-extracting it each time
cross_domain_testing_data = None

# Open file to write results out to
fo = open("N:\\Downloads\\NLE\\results_naive_bayes_test.txt", "wb")

if not k_fold:  # preserve the original behaviour for when k-folding isn't used
    # Main loop of the experiment
    for x in xrange(0, number_of_tests):
        sys.stdout.write('TEST_NUMBER:' + str(x) + ':')
        fo.write('TEST_NUMBER:' + str(x) + ':')

        # Split data into ((+ve training), (-ve training), (+ve testing, -ve testing))
        split_data = nle_utils.split_by_classification(list_of_readers, sample_ratio)
        # List - tuple - tuple - document
        # print type(split_data[0][0][0][0])

        i = 0  # index into the list of Amazon domain categories
        for domain_split in split_data:
            # If not cross-domain testing
            if cross_domain == "":
                sys.stdout.write("DOMAIN:" + nle_utils.list_of_amazon_categories[i] + ":")
                fo.write("DOMAIN:" + nle_utils.list_of_amazon_categories[i] + ":")

                train_nb_data, test_nb_data = nle_utils.format_for_naive_bayes(domain_split, feature_extraction)
                # train_nb_data holds 1600 reviews when the training ratio is 80%;
                # to vary the training-data size, slice train_nb_data on the line below
                train_nb_data = train_nb_data

                nb_classifier = NaiveBayesClassifier.train(train_nb_data)
                sys.stdout.write("ACCURACY:" + str(accuracy(nb_classifier, test_nb_data)) + '\n')
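# --- Hedged illustration, not from the original source: NLTK's
# NaiveBayesClassifier.train() and accuracy() expect a list of
# (feature_dict, label) pairs, so format_for_naive_bayes is assumed to return
# train/test data shaped like this. The words and labels below are invented
# toy examples, not project data. ---
example_train_nb_data = [
    ({'great': True, 'battery': True}, 'pos'),
    ({'broken': True, 'refund': True}, 'neg'),
]
example_test_nb_data = [
    ({'great': True, 'camera': True}, 'pos'),
]
example_classifier = NaiveBayesClassifier.train(example_train_nb_data)
print accuracy(example_classifier, example_test_nb_data)  # scores the single toy test review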