Пример #1
0
    def predict(self, X: Union[DataFrame, Series]) -> Union[Series, Any]:
        """
        Predict y value for a set of instances

        :param X: set of instances whose y value is to be predicted (use a pandas Series for an individual
         instance)
        :return: the result of applying the classifier to every row of X when X is a DataFrame; when X is a
         single instance, just its predicted value
        :raises RuntimeError: if the stored training data (``X_`` / ``y_``) is not present on this object
        """
        # Only the lookup of the stored training data is guarded. An AttributeError raised while the
        # training set is being rebuilt is a different problem and must not be reported as "not trained".
        try:
            X_fit, y_fit = self.X_, self.y_
        except AttributeError as err:
            raise RuntimeError("You must train classifier before predicting data!") from err

        # The second element of the returned pair is not used; self.target_attribute is passed on below.
        X_train_aux, _ = _generate_train_set(X_fit, y_fit, self.target_attribute)
        # noinspection PyPep8Naming
        X_train: DataSet = DataSet()
        X_train.load_from_pandas_df(X_train_aux, self.attribute_info, self.attribute_list)

        # X is a set: classify it row by row (isinstance also accepts DataFrame subclasses)
        if isinstance(X, DataFrame):
            return X.apply(lambda row: naive_bayes_classifier(X_train, row, self.target_attribute), axis=1)

        # X is an instance
        return naive_bayes_classifier(X_train, X, self.target_attribute)
Пример #2
0
def cross_validation_n_1(setbuilt, n):
    """Cross-validate a naive Bayes classifier and print the rate of every fold.

    The samples are cut into consecutive folds of ``len(allset) // n`` samples.
    Each fold in turn is used as the validation set while the remaining samples
    form the training set. When the number of samples is not a multiple of
    ``n``, the leftover samples form one extra, shorter fold at the end.

    :param setbuilt: pair ``(allset, classlist)``; ``allset`` is the sequence of
        samples and ``classlist`` is handed unchanged to
        ``nbc.naive_bayes_classifier``
    :param n: number of folds the samples are divided into
    :raises ValueError: if there are fewer samples than folds
    """
    allset = setbuilt[0]
    classlist = setbuilt[1]
    vasize = len(allset) // n
    if vasize == 0:
        # A zero step would otherwise only surface as an obscure range() error.
        raise ValueError('cannot split %d samples into %d folds' % (len(allset), n))
    for i in range(0, len(allset), vasize):
        # Slice ends are exclusive, so i + vasize keeps all vasize samples of
        # the fold (the former "- 1" silently dropped the last one).
        validset = allset[i:i + vasize]
        # Membership filter: samples equal to a validation sample are excluded
        # from training even when they sit at another position.
        trainset = [sam for sam in allset if sam not in validset]
        nbcc = nbc.naive_bayes_classifier(classlist, trainset, validset)
        nbcc.train()
        # Parenthesised single argument: valid under Python 2 and Python 3.
        print('rate of %d patch: %f' % (i // vasize, nbcc.validate()))
Пример #3
0
def cross_validation_n_1(setbuilt, n):
    """Cross-validate a naive Bayes classifier and print the rate of every fold.

    The samples are cut into consecutive folds of ``len(allset) // n`` samples.
    Each fold in turn is used as the validation set while the remaining samples
    form the training set. When the number of samples is not a multiple of
    ``n``, the leftover samples form one extra, shorter fold at the end.

    :param setbuilt: pair ``(allset, classlist)``; ``allset`` is the sequence of
        samples and ``classlist`` is handed unchanged to
        ``nbc.naive_bayes_classifier``
    :param n: number of folds the samples are divided into
    :raises ValueError: if there are fewer samples than folds
    """
    allset = setbuilt[0]
    classlist = setbuilt[1]
    vasize = len(allset) // n
    if vasize == 0:
        # A zero step would otherwise only surface as an obscure range() error.
        raise ValueError('cannot split %d samples into %d folds' % (len(allset), n))
    for i in range(0, len(allset), vasize):
        # Slice ends are exclusive, so i + vasize keeps all vasize samples of
        # the fold (the former "- 1" silently dropped the last one).
        validset = allset[i:i + vasize]
        # Membership filter: samples equal to a validation sample are excluded
        # from training even when they sit at another position.
        trainset = [sam for sam in allset if sam not in validset]
        nbcc = nbc.naive_bayes_classifier(classlist, trainset, validset)
        nbcc.train()
        # Parenthesised single argument: valid under Python 2 and Python 3.
        print('rate of %d patch: %f' % (i // vasize, nbcc.validate()))
Пример #4
0
def fitness(init_population, population_size, num_features, dataset):
    """Score every row of a population and report the best one.

    Each row of ``init_population`` is passed, together with ``dataset`` and
    ``num_features``, to ``nb.naive_bayes_classifier``; the value it returns is
    used as that row's score.

    :param init_population: iterable of rows to score
    :param population_size: not used by this function
    :param num_features: length of the all-zero fallback solution; also
        forwarded to the classifier
    :param dataset: forwarded unchanged to the classifier
    :return: tuple ``(scores, best_score, best_row)``; when no row scores above
        0 the result is ``best_score == 0`` with an all-zero ``best_row``
    """
    top_score = 0
    top_row = [0] * num_features
    scores = []
    for candidate in init_population:
        score = nb.naive_bayes_classifier(dataset, num_features, candidate)
        scores.append(score)
        # Strict comparison: on a tie the earliest best row is kept.
        if score > top_score:
            top_score, top_row = score, candidate
    return scores, top_score, top_row
import naive_bayes_classifier as nb

def _load_pickle(path):
    """Return the object unpickled from *path*, closing the file afterwards."""
    # NOTE(review): unpickling can execute arbitrary code. model_file comes from
    # the command line, so this script must only be pointed at trusted files.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Usage: <script> <model_file> <test_file>
(script_name, model_file, test_file) = argv
prior_word_conditional_prob_hash = _load_pickle(model_file)
feature_rank_hash = _load_pickle("feature_ranks.p")
prior_probabilities = _load_pickle("prior_class_probabilities.p")
# The word lists are the same for every review, so they are read once up front
# instead of once per input row.
negative_words = _load_pickle("negative_words.p")
positive_words = _load_pickle("positive_words.p")

# NOTE(review): this script is Python 2 code (see .keys()[0] below); binary mode
# is what the Python 2 csv module expects for its input file.
with open(test_file, 'rb') as csvfile:
    review_reader = csv.reader(csvfile, delimiter='\t', quotechar='"')

    for count, row in enumerate(review_reader):
        if count == 0:
            # The first input row is not classified; a header line is printed
            # in its place.
            print("Id,Category")
            continue
        if not row:
            # csv.reader yields an empty list for a blank line; there is no
            # review text to classify.
            continue

        review_content = row[0]
        words = preprocessor.pre_process_sentence(review_content)
        class_data = nb.naive_bayes_classifier(
            prior_probabilities, prior_word_conditional_prob_hash, words,
            positive_words, negative_words)
        # One output line per review: first key of the classifier result,
        # a comma, then the review text.
        print(class_data.keys()[0] + "," + review_content)
Пример #6
0
# Translate every label into its class_vector entry: the label value, converted
# to int, is the index into class_vector. Labels are read by position so the
# lookup is the same as an index-driven loop.
class_vector_train.extend(
    class_vector[int(y_train[pos])] for pos in range(len(y_train)))
class_vector_test.extend(
    class_vector[int(y_test[pos])] for pos in range(len(y_test)))

print('class_vector_train: ', class_vector_train)
print('class_vector_test: ', class_vector_test)

print('Proportion of dataset selected to be included in test dataset: ', test_size)

# Run both classifiers on identical inputs: Bayes first, then naive Bayes.
class_label_vector_bayes = bayes_classifier(X_train, class_vector_train, X_test)
class_label_vector_naive = naive_bayes_classifier(X_train, class_vector_train, X_test)

# Score each classifier's output against the expected test classes.
accuracy_Bayes = compare_class_vectors(class_label_vector_bayes, class_vector_test)
accuracy_Naive_Bayes = compare_class_vectors(class_label_vector_naive, class_vector_test)

# Show the expected classes next to both sets of predictions.
print('class_vector_test: ', class_vector_test)
print('class_label_vector Bayes: ', class_label_vector_bayes)
print('class_label_vector Naive Bayes: ', class_label_vector_naive)

print('Accuracy of Bayes is: ', accuracy_Bayes)
print('Accuracy of Naive Bayes is: ', accuracy_Naive_Bayes)

Пример #7
0
import pandas as p
from naive_bayes_classifier import naive_bayes_classifier

# Read the training table first, then the test table, from the working directory.
train, test = (p.read_csv(csv_name) for csv_name in ("trainBruno.csv", "testBruno.csv"))

# Train a fresh classifier on the training table.
machine = naive_bayes_classifier()
machine.train(train)

# Run the trained classifier over the test table and show what it returns.
predict = machine.test(test)
print(predict)
Пример #8
0
            index = random.randint(0, len_chromosome - 1)
            chromosome[index] = (chromosome[index] + 1) % 2


def geneticAlgo(population_size, num_features, dataset):
    """Search for the best-scoring 0/1 chromosome with a genetic algorithm.

    A random population is created once and then evolved for NUM_ITERATIONS
    generations. In each generation the population is scored with ``fitness``,
    ``selection`` picks the indices of the rows to keep, and ``crossover`` and
    ``mutation`` are applied to the copies of those rows.

    Previously a brand-new random population was generated on every iteration,
    so the outcome of selection, crossover and mutation was thrown away, and
    only the last iteration's best was returned.

    :param population_size: number of chromosomes in the initial population
    :param num_features: number of 0/1 genes per chromosome
    :param dataset: forwarded unchanged to ``fitness`` and ``selection``
    :return: tuple ``(best_accuracy, best_solution)`` over all generations;
        ``(0, [0, ..., 0])`` when no generation scores above 0 or when
        NUM_ITERATIONS is 0
    """
    population = [[random.randint(0, 1) for _ in range(num_features)]
                  for _ in range(population_size)]
    # Same fallback as fitness(); also keeps the return value defined when the
    # loop body never runs.
    best_accuracy = 0
    best_solution = [0] * num_features

    for _ in range(NUM_ITERATIONS):
        fitness_list, generation_accuracy, generation_solution = fitness(
            population, population_size, num_features, dataset)
        if generation_accuracy > best_accuracy:
            best_accuracy = generation_accuracy
            # Copy, so later in-place changes to chromosomes cannot alter the
            # recorded best solution.
            best_solution = generation_solution[:]

        selected_indices = selection(fitness_list, dataset, population_size)
        # The next generation starts from copies of the selected rows.
        population = [population[index][:] for index in selected_indices]
        # NOTE(review): the return values of crossover/mutation are not used,
        # so both are assumed to modify the list they receive in place.
        crossover(population)
        mutation(population)

    return best_accuracy, best_solution


if __name__ == '__main__':
    # Population size handed to the genetic algorithm.
    population_size = 30

    print("Enter name of dataset: ")
    dataset_name = input()
    dataset = pandas.read_csv(dataset_name)

    # The last column is the output column, so it is not counted as a feature.
    num_features = len(dataset.columns) - 1
    # One 1 per feature; presumably this selects every feature for the baseline.
    unmodified = [1] * num_features

    baseline_accuracy = nb.naive_bayes_classifier(dataset, num_features, unmodified)
    print("Naive Bayes mean accuracy without feature selection: ",
          baseline_accuracy)

    selection_result = geneticAlgo(population_size, num_features, dataset)
    print("Naive Bayes mean accuracy with feature selection: ",
          selection_result)