def ten_fold_SVM(fold_type, feature_type, if_doc2vec, model_no):
    """Cross-validate the SVM classifier over 10 folds and save the outcome.

    Args:
        fold_type: 'consecutive' selects ``cv.n_fold_cons`` together with
            ``cv.prepare_data_tenfold``; any other value selects
            ``cv.n_fold_RR`` together with ``cv.prepare_data_roundrobin``.
        feature_type: Forwarded unchanged to ``svm.SVM_classifier`` and
            ``svm.save_results_cv``.
        if_doc2vec: Forwarded unchanged to ``svm.SVM_classifier`` and
            ``svm.save_results_cv``.
        model_no: Forwarded unchanged to ``svm.SVM_classifier``.

    Returns:
        None. The per-fold results, their average and their variance are
        handed to ``svm.save_results_cv`` and a confirmation is printed.
    """
    # Load both polarities once so every fold reuses the same data.
    negatives = text.read_data_from_file('neg')
    positives = text.read_data_from_file('pos')

    fold_count = 10
    data_length = 1000

    # The splitting scheme is fixed for the whole run, so choose the range
    # generator and the matching data-preparation routine up front.
    if fold_type == 'consecutive':
        make_ranges, prepare_fold = cv.n_fold_cons, cv.prepare_data_tenfold
    else:
        make_ranges, prepare_fold = cv.n_fold_RR, cv.prepare_data_roundrobin
    test_ranges = make_ranges(fold_count, data_length)

    results = []
    # Positional indexing is kept on purpose: the container type returned by
    # the cv helpers is not visible from here.
    for fold_index in range(len(test_ranges)):
        train_size, test_size, reviews_train, reviews_test = prepare_fold(
            negatives, positives, test_ranges[fold_index])
        fold_result = svm.SVM_classifier(
            feature_type, True, if_doc2vec, model_no, True,
            train_size, test_size, reviews_train, reviews_test)
        results.append(fold_result)

    # The fold results are used directly as the list of accuracies.
    accuracies = results
    mean_accuracy = np.average(accuracies)
    accuracy_variance = np.var(accuracies)

    # Persist everything to file.
    svm.save_results_cv(fold_type, feature_type, if_doc2vec, results,
                        accuracies, mean_accuracy, accuracy_variance)
    print("\ncross validation results written to file")
def ten_fold_NB(fold_type, feature_type):
    """Cross-validate the naive Bayes classifier over 10 folds and save the outcome.

    Args:
        fold_type: 'consecutive' selects ``cv.n_fold_cons`` together with
            ``cv.prepare_data_tenfold``; any other value selects
            ``cv.n_fold_RR`` together with ``cv.prepare_data_roundrobin``.
        feature_type: Forwarded unchanged to ``nb.naive_bayes_classifier``
            and ``nb.save_results_cv``.

    Returns:
        None. The per-fold results, the per-fold accuracies, their average
        and their variance are handed to ``nb.save_results_cv`` and a
        confirmation is printed.
    """
    # Load both polarities once so every fold reuses the same data.
    negatives = text.read_data_from_file('neg')
    positives = text.read_data_from_file('pos')

    fold_count = 10
    data_length = 1000

    # The splitting scheme is fixed for the whole run, so choose the range
    # generator and the matching data-preparation routine up front.
    if fold_type == 'consecutive':
        make_ranges, prepare_fold = cv.n_fold_cons, cv.prepare_data_tenfold
    else:
        make_ranges, prepare_fold = cv.n_fold_RR, cv.prepare_data_roundrobin
    test_ranges = make_ranges(fold_count, data_length)

    results = []
    # Positional indexing is kept on purpose: the container type returned by
    # the cv helpers is not visible from here.
    for fold_index in range(len(test_ranges)):
        train_size, test_size, reviews_train, reviews_test = prepare_fold(
            negatives, positives, test_ranges[fold_index])
        fold_result = nb.naive_bayes_classifier(
            feature_type, 'laplace', True,
            train_size, test_size, reviews_train, reviews_test)
        results.append(fold_result)

    # Each fold result is a sequence; its mean (sum / length) is taken as
    # that fold's accuracy.
    fold_means = []
    for fold_result in results:
        fold_means.append(sum(fold_result) / len(fold_result))
    accuracies = np.array(fold_means)

    mean_accuracy = np.average(accuracies)
    accuracy_variance = np.var(accuracies)

    # Persist everything to file.
    nb.save_results_cv(fold_type, feature_type, results, accuracies,
                       mean_accuracy, accuracy_variance)
    print("\ncross validation results written to file")
def test_freq_cutoff(self):
    """Print the unigram and bigram vocabulary sizes for the full corpus.

    The unigram vocabulary is built with a second argument of 9 and the
    bigram vocabulary with 14. Nothing is asserted; the sizes are only
    printed.
    """
    negatives = text.read_data_from_file('neg')
    positives = text.read_data_from_file('pos')

    # Each helper gets its own freshly concatenated list.
    unigram_vocab = feat.get_vocab(negatives + positives, 9)
    bigram_vocab = feat.get_vocab_bigram(negatives + positives, 14)

    for vocab in (unigram_vocab, bigram_vocab):
        print(len(vocab))
def test_bag_words2vec_bigram_real(self):
    """Check the bigram bag-of-words encoder against its naive counterpart.

    A bigram vocabulary is built from the first 10 negative reviews (with a
    second argument of 14), the same reviews are encoded by
    ``feat.bag_words2vec_bigram`` and ``feat.bag_words2vec_bigram_naive``,
    and the two feature matrices are required to be identical.
    """
    # Only the first 10 reviews are used for this test.
    texts = text.read_data_from_file('neg')[:10]
    sample_vocab = feat.get_vocab_bigram(texts, 14)
    mat_feat = feat.bag_words2vec_bigram(sample_vocab, texts)
    mat_feat_naive = feat.bag_words2vec_bigram_naive(sample_vocab, texts)

    # One row per review, one column per vocabulary entry. Every row is
    # checked (instead of a randomly picked one) so the test is
    # deterministic and cannot miss a malformed row.
    vocab_size = len(sample_vocab)
    assert len(mat_feat) == len(texts)
    assert all(len(row) == vocab_size for row in mat_feat)

    # Compare the matrices element by element. The previous form,
    # `a.all() == b.all()`, reduced each matrix to a single boolean first
    # and therefore passed for any two matrices that both contain a zero.
    assert mat_feat_naive.shape == mat_feat.shape
    assert (mat_feat_naive == mat_feat).all()
def test_visual_data(self):
    """Smoke test: pass the negative reviews to ``visual_data``.

    Nothing is asserted; the test succeeds as long as the calls do not
    raise.
    """
    negative_reviews = read_data_from_file('neg')
    visual_data(negative_reviews)
def test_read_data_from_file(self):
    """Check that both review sets contain ``self.number_reviews`` entries."""
    # Read both sets before asserting anything, negative first.
    loaded = [read_data_from_file(polarity) for polarity in ('neg', 'pos')]

    for reviews in loaded:
        assert len(reviews) == self.number_reviews