Example #1
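 # Train a throwaway classifier on first call and cache the resulting accuracy.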
 def getAccuracyRate(self):
     if self.accuracy == 0:
         naiveBayes = NaiveBayesClassifier()
         naiveBayes.word_features = self.chromosome
         classifier = naiveBayes.NaiveBayes(self.chromosomeManager.sentences)
         self.accuracy = naiveBayes.getAccuracy(classifier, self.chromosomeManager.sentences)
     return self.accuracy
Example #2
def main():
    print("Number of processors: ", mp.cpu_count())
    pool = mp.Pool(mp.cpu_count())  # for parallel processing
    df = pd.DataFrame(
        columns=['comment', 'otherMetadata', 'likeDislikeRatio', 'sentiment'])

    for i in range(751):
        with open("../output/" + str(i) + ".json", encoding='utf-8') as f:
            data = json.load(f)
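            # data[0] holds the video's metadata, sentiment label, and comment list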
            # global commentsText
            commentsText = ""
            # tags = data[0]["tags"]
            sentiment = data[0]['sentiment']
            if 'tags' in data[0].keys():
                tags = str(' '.join(data[0]["tags"]))
                # print("tags" + tags)
                otherMetaData = pr.process(data[0]["title"] + " " +
                                           data[0]["description"] + " " + tags)
            else:
                otherMetaData = pr.process(data[0]["title"] + " " +
                                           data[0]["description"])

            likes = int(data[0]["likeCount"])
            dislikes = int(data[0]["dislikeCount"])
            likeDislikeRatio = str(likes / dislikes)  # note: raises ZeroDivisionError if a video has no dislikes
            results = pool.map(preprocess, data[0]["comments"])
            for result in results:
                if result is not None:
                    commentsText += result[0]
            df.loc[i] = [commentsText, otherMetaData, likeDislikeRatio, sentiment]
            # print(str(i)+" : "+df['posToNegCommentRatio'].loc[i])

        # print(df['otherMetadata'].iloc[0])
        print(i)

    df['sentiment_one_hot'] = df['sentiment'].apply(lambda x: 0 if x == 'N' else 1)

    df['data'] = df['comment'] + ' ' + df['otherMetadata']

    traindf, testdf = train_test_split(df, test_size=0.2)

    x_train, x_test, y_train, y_test = train_test_split(
        df['data'], df['sentiment_one_hot'], test_size=0.2)

    NBModel = NaiveBayesClassifier()
    NBModel.train(x_train, y_train, alpha=1)

    print(y_test)
    # hateVideoComments = df.loc[18]['comment']

    # print(hateVideoComments)
    levelOfHate = NBModel.getHateLevel(x_test)
    print(levelOfHate)
Example #3
File: runNB.py Project: alejmest/CS6375
def main(use_stop_words):
    bc = NaiveBayesClassifier(use_stop_words)
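    # train on the email corpus, then classify and report per-class and overall success ratios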
    spam_total_count, ham_total_count = bc.train()

    spam_success_ratio, ham_success_ratio, total_success_ratio = bc.classify(
        spam_total_count, ham_total_count)
    print('Success Ratio For Spam Emails: %.4f%%' % (spam_success_ratio * 100))
    print('Success Ratio For Ham Emails: %.4f%%' % (ham_success_ratio * 100))
    print('Success Ratio For All Emails: %.4f%%' % (total_success_ratio * 100))
Example #4
    def __init__(self):
        self.NBC = NaiveBayesClassifier()
        self.LRC = LogisticRegression()
        abbreviations = []

        self.abbreviations = abbreviations[1:]
        self.punctuations1 = ['!', '?', ',', '"', '(', ')']  #Always token
        self.punctuations2 = ['.', ':']  #Ambiguous
        self.split_characters = [' ']
Example #5
def main():
    x_train, y_train = load_data('data/train.txt')
    x_validation, y_validation = load_data('data/validation.txt')

    number_of_words = 5000
    classifier = NaiveBayesClassifier(number_of_words)

    print("Training classifier...")
    classifier.train(x_train, y_train)
    print("Testing classifier...")
    print('Accuracy:',
          classifier.get_accuracy(x_validation, y_validation) * 100)
Example #6
    def __init__(self):
        self.NBC = NaiveBayesClassifier()
        self.LRC = LogisticRegression()
        with open(PATH + "/" + 'non_breaking_prefixes_tr.txt',
                  mode='r', encoding='utf-8') as nbp_file:
            nbp = nbp_file.readlines()
        abbreviations = []
        for line in nbp:
            if line.strip() != '' and line.strip()[0] != "#":
                abbreviations.append(line.strip())

        self.abbreviations = abbreviations[1:]
        self.sentence_ending_punctuations = ['.', '!', '?']
        self.quotation_space_starters = ['"', "("]
        self.quotation_space_enders = ['"', ")"]
Example #7
 def testIris(self):
     from NaiveBayesClassifier import NaiveBayesClassifier
     iris = load_iris()
     X_train = iris.data
     y_train = iris.target
     nb = NaiveBayesClassifier(smoothing=True)
     accuracy = np.mean(cross_validation(nb, X_train, y_train))
     print("NaiveBayesClassifier with Laplacian correction: accuracy:",
           accuracy)
Example #8
 def __init__(self, vocabulary, n, delta):
     self.vocabulary = vocabulary
     self.n = n
     self.delta = delta
     # one NaiveBayesClassifier per language, each trained on its own tweet file
     langs = ["ca", "gl", "en", "es", "pt", "eu"]
     self.arrayModel = [
         NaiveBayesClassifier(vocabulary, n, delta,
                              "./training_files/%s_training-tweets.txt" % lang,
                              lang, 18318) for lang in langs
     ]
     (self.model_ca, self.model_gl, self.model_en, self.model_es,
      self.model_pt, self.model_eu) = self.arrayModel
     self.probability = 0
     self.totalTweetCount = 18318
Example #9
    def train(self):
        self._neg_train_data = self.get_training_data(self.data_root + 'train\\neg\\')
        self._pos_train_data = self.get_training_data(self.data_root + 'train\\pos\\')

        print ("Preparing Train Data... ")
        documents = [(list(self._neg_train_data.words(fileid)), "neg")
                     for fileid in self._neg_train_data.fileids()]
        
        documents += [(list(self._pos_train_data.words(fileid)), "pos")
                     for fileid in self._pos_train_data.fileids()]

        print ("Preparing Train FeatureSets... ")
        featuresets = [(self.extract_feature(d), c)  for d, c in documents]
        
        print ("Training... ")
        self.classifier = NaiveBayesClassifier.train(featuresets)
Example #10
    def __init__(self):

        self.NaiveBayesClassifier = NaiveBayesClassifier()

        # Sentence Splitters
        self.RuleBasedSentenceSplitter = RuleBasedSentenceSplitter()
        self.MLBasedSentenceSplitter = MLBasedSentenceSplitter()

        # Tokenizers
        self.RuleBasedTokenizer = RuleBasedTokenizer()
        self.MLBasedTokenizer = MLBasedTokenizer()

        # Normalizer
        self.Normalizer = Normalizer()

        # Stemmer
        self.Stemmer = Stemmer()

        # Stopword Eliminators
        self.StaticStopWordEliminator = StaticStopwordRemover()
        self.DynamicStopWordEliminator = DynamicStopWordEliminator()
Example #11
 def __init__(self):
     # one model per language with fixed hyperparameters (n=3, delta=0.3)
     langs = ["ca", "gl", "en", "es", "pt", "eu"]
     self.arrayModel = [
         NaiveBayesClassifier(0, 3, 0.3,
                              "./training_files/%s_training-tweets.txt" % lang,
                              lang, 18318) for lang in langs
     ]
     (self.model_ca, self.model_gl, self.model_en, self.model_es,
      self.model_pt, self.model_eu) = self.arrayModel
     self.probability = 0
     self.totalTweetCount = 18318
Example #12
def main():

    np.set_printoptions(threshold=sys.maxsize)
    # separate data into training and testing
    training = np.zeros((1, 2), dtype=str)
    testing = np.zeros((1, 2), dtype=str)
    genres = set()

    os.chdir('../Corpus')
    with open('movie_titles_metadata.txt', encoding='utf-8',
              errors='ignore') as file:
        for line in file:
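            # fields are separated by " +++$+++ "; field 0 is the movie id
            # and field 5 holds the genre list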
            line = line.strip().split(" +++$+++ ")
            for x in ast.literal_eval(line[5]):
                genres.add(x)
            arr = [[line[0], line[5]]]
            r = random.random()
            if r < 0.80:
                training = np.concatenate((training, arr), axis=0)
            else:
                testing = np.concatenate((testing, arr), axis=0)

    training = np.delete(training, 0, axis=0)
    testing = np.delete(testing, 0, axis=0)

    # train and test model
    start_time = time.time()
    features = extract_features('movie_lines.txt')
    model = NaiveBayesClassifier()
    print("TRAINING:")
    model.fit(training, features, genres)
    pred_labels, correct_labels, genres = model.predict(training, features)
    accuracy(pred_labels, correct_labels, genres)
    fscore(pred_labels, correct_labels, genres)
    print("TESTING:")
    pred_labels, correct_labels, genres = model.predict(testing, features)
    accuracy(pred_labels, correct_labels, genres)
    fscore(pred_labels, correct_labels, genres)

    print("Time for training and test: %.2f seconds" %
          (time.time() - start_time))
Example #13
# loading data
dataset = pd.read_csv('iris.data.txt', names=names).values
# dataset = pd.read_csv('column_3C.dat.txt', names=names).values
dataframe = pd.read_csv('breast-cancer-wisconsin.data.csv', names=names)
dataframe = dataframe.drop(['id', 'bare_nucleoli'], axis=1)
# dataframe = pd.read_csv('dermatology.csv')
# dataframe = dataframe.drop(['age'], axis=1)
# dataset = dataframe.values

# artificial dataset
# features_1 = np.array([[random.uniform(0.4, 0.6), random.uniform(0.4, 0.6), 0] for _ in range(50)])
# features_2 = np.array([[random.uniform(-0.1, 0.1), random.uniform(0.9, 1.1), 1] for _ in range(50)])
# features_3 = np.array([[random.uniform(0.9, 1.1), random.uniform(0.9, 1.1), 2] for _ in range(50)])
# dataset = np.concatenate([features_1, features_2, features_3], axis=0)

bayesClassifier = NaiveBayesClassifier()
# bayesClassifier = BayesLDAClassifier()
# bayesClassifier = BayesQDAClassifier()
dataset = bayesClassifier.normalize_dataset(dataset)
accuracies = []

for j in range(0, 1):
    print("realization %d" % j)
    train_X, train_y, test_X, test_y = bayesClassifier.train_test_split(dataset)
    bayesClassifier.fit(train_X, train_y)

    predictions = bayesClassifier.predict(test_X)
    accuracies.append(bayesClassifier.evaluate(test_y, predictions))
    print(bayesClassifier.confusion_matrix(test_y, predictions))
    bayesClassifier.plot_decision_boundaries(train_X, train_y, test_X, test_y, j)
Example #14
    test_score = []
    print("LSA created.")

    ###########################
    # LSA
    human_keywords = l.manage_keywords(f.keywords)
    lsa_results = l.train_phrases(human_keywords)
    print("LSA Results computed.")
    for j in range(50):
        sets = Set(lsa_results, f.y, f.x)
        for i in range(len(sets.x_train)):
            ###########################

            ###########################
            # NAIVE BAYES
            naive = NaiveBayesClassifier(alpha=0.01)
            naive.train(numpy.array(sets.x_train[i]), sets.y_train[i])
            test_score.append(
                naive.test_score(numpy.array(sets.x_test[i]),
                                 numpy.array(sets.y_test[i])))
    avg = numpy.round(numpy.average(numpy.array(test_score)), 2)
    y.append(avg)
    min_ = numpy.round(numpy.array(test_score).min(), 2)
    yerrormin.append(numpy.round(avg - min_, 2))
    max_ = numpy.round(numpy.array(test_score).max(), 2)
    yerrormax.append(numpy.round(max_ - avg, 2))
    print("Avg test performance: ", avg)
    print(min_)
    print(max_)
    print('\n' * 3)
Example #15
from NaiveBayesClassifier import NaiveBayesClassifier
import GeneticAlgorithm
import time

naiveBayes = NaiveBayesClassifier()

# read training data
df = naiveBayes.loadCSV('V1.4_Training_new.csv')
contractions = naiveBayes.loadCSV('Contractions.csv', 'contraction',
                                  'text1').to_dict('split')

df['label'] = df.sentence.str.extract(r'((\b\w+)[\.?!\s]*$)')[0]

df['label'] = df.label.map(lambda x: 'suggestion'
                           if x == '1' else 'nonsuggestion')

# preprocessing training data
df = naiveBayes.Preprocessing(df, contractions)

# Each element pairs the sentence's word list with its sentiment label.
# Words shorter than three characters are dropped and everything is lowercased.
allsentences = [(row['sentence'], row['label'])
                for index, row in df.iterrows()]
trainingsentences = [([e.lower() for e in words.split() if len(e) >= 3], sentiment)
                     for (words, sentiment) in allsentences]

GA = GeneticAlgorithm
chromosomeManager = GA.ChromosomeManager()
chromosomeManager.sentences = trainingsentences
word_features = naiveBayes.getWordFeatures(trainingsentences)
Example #16
# Spencer Barton
# 10-601
# Naive Bayes Classifier

from NaiveBayesClassifier import NaiveBayesClassifier
import sys

#===============================================
# Script
#===============================================

TRAIN_FILE_NAME = sys.argv[1]

NB = NaiveBayesClassifier()
NB.train(TRAIN_FILE_NAME)
topWords = NB.getSortedWords()

N = 20
for i in range(0, N):
    pair = topWords['lib'][i]
    print(pair[0], round(pair[1], 4))

print()

for i in range(0, N):
    pair = topWords['con'][i]
    print(pair[0], round(pair[1], 4))
Example #17
File: nb.py Project: sbarton272/10601
# Spencer Barton
# 10-601
# Naive Bayes Classifier

from NaiveBayesClassifier import NaiveBayesClassifier
import sys

#===============================================
# Script
#===============================================

TRAIN_FILE_NAME = sys.argv[1]
TEST_FILE_NAME  = sys.argv[2]

NB = NaiveBayesClassifier()
NB.train(TRAIN_FILE_NAME)
NB.test(TEST_FILE_NAME)
Example #18
 def __init__(self):
     self.bag = BagOfWords()
     self.nbc = NaiveBayesClassifier()
Example #19
        l = LSA(MAX_GRAM, MIN_FREQ, P_EIG, x)
        print("Parameters: Min_freq =", l.min_freq, "NGram_max =", l.ngram_max,
              "P_eig =", l.p_eig * 100)
        print("LSA created.")

        ###########################
        # LSA
        human_keywords = l.manage_keywords(f.keywords)
        lsa_results = l.train_phrases(human_keywords)
        print("LSA Results computed.")
        sets = Set(lsa_results, numpy.array(y), numpy.array(x))
        for i in range(len(sets.x_train)):
            ###########################

            ###########################
            # NAIVE BAYES
            naive = NaiveBayesClassifier(alpha=ALPHA)
            naive.train(numpy.array(sets.x_train[i]), sets.y_train[i])
            test_score.append(naive.test_score(numpy.array(sets.x_test[i]),
                                               numpy.array(sets.y_test[i])))
    if not test_score:
        break
    elements.append(n_elements)
    avg = numpy.round(numpy.average(numpy.array(test_score)), 2)
    classification.append(avg)
    min_ = numpy.round(numpy.array(test_score).min(), 2)
    classificationerrormin.append(numpy.round(avg - min_, 2))
    max_ = numpy.round(numpy.array(test_score).max(), 2)
    classificationerrormax.append(numpy.round(max_ - avg, 2))
    print("Avg test performance: ", avg)
    print(min_)
    print(max_)
    print('\n'*3)
Example #20
# Spencer Barton
# 10-601
# Naive Bayes Classifier

from NaiveBayesClassifier import NaiveBayesClassifier
import sys

#===============================================
# Script
#===============================================

TRAIN_FILE_NAME = sys.argv[1]
TEST_FILE_NAME = sys.argv[2]
N_STOP_WORDS = int(sys.argv[3])

NB = NaiveBayesClassifier(nStopWords=N_STOP_WORDS)
NB.train(TRAIN_FILE_NAME)
NB.test(TEST_FILE_NAME)
Example #21
File: script2.py Project: h-s1996/Thesis
        print("LSA created.")

        ###########################
        # LSA
        human_keywords = lsa.manage_keywords(f.keywords)
        print("Start", datetime.datetime.now())
        aux1 = datetime.datetime.now()
        ex1 = lsa.process_examples(human_keywords, train_set[i])
        ex1.shutdown(wait=True)
        print("LSA Results computed.")
        ###########################

        ###########################
        # NAIVE BAYES

        naive = NaiveBayesClassifier(alpha=0.01)
        ex2 = lsa.process_examples(human_keywords, test_set[i])
        naive.train(numpy.array(train_set[i].get_lsa_results()),
                    train_set[i].get_class_labels())
        ex2.shutdown(wait=True)
        test_score.append(
            naive.test_score(numpy.array(test_set[i].get_lsa_results()),
                             numpy.array(test_set[i].get_class_labels()),
                             "test"))
        naive.test_score(numpy.array(train_set[i].get_lsa_results()),
                         numpy.array(train_set[i].get_class_labels()), "train")
        print("End", datetime.datetime.now())
        aux2 = datetime.datetime.now()
        time_score.append(aux2 - aux1)
        print("Difference", aux2 - aux1)
    print("Avg test performance: ", numpy.average(numpy.array(test_score)))
Example #22
import requests

from NaiveBayesClassifier import NaiveBayesClassifier
from Ploter import Ploter
from Tokenizer import Tokenizer

if __name__ == '__main__':
    nbc = NaiveBayesClassifier()

    with open('Testset/test.txt', 'r') as f:
        urls = f.readlines()
    url = urls[4].split()  # split() also strips the trailing newline
    print("Class of the newspaper article is: " + str(url[0]))
    print("URL: " + str(url[1]))

    # url = input("Enter URL:")

    response = requests.get(url[1])
    if response.status_code == 200:
        t = Tokenizer(response.content)
        clearText = t.clear
        print "Parsed html: " + str(clearText)

        prediction = nbc.predict(clearText)
        percentages = [round(x * 100, 2) for x in prediction[0]]
        result = list(zip(nbc.bbc_train.target_names, percentages))
        print("Prediction probability in percentage: " + str(result))

        labels = nbc.bbc_train.target_names
        pl = Ploter(labels, percentages)
        pl.drawPie()
Example #23
    def NaiveBayesDriver(self):
        naivebayes = NaiveBayesClassifier(self.train_x, self.train_y,
                                          self.test_x, self.test_y)
        # -----= naiveBayes
        naivebayes.train_1()
        naivebayes_labels = naivebayes.predic()
        naivebayes_acc = naivebayes.getAccuracy()
        naivebayes.printResult()
        self.acc['naivebayes-GaussianNB'] = {
            'accuracy': naivebayes_acc,
            'train-time': naivebayes.trainTime(),
            'test-time': naivebayes.testTime(),
        }

        naivebayes.train_2()
        naivebayes_labels_2 = naivebayes.predic()
        naivebayes_acc = naivebayes.getAccuracy()
        naivebayes.printResult()
        self.acc['naivebayes-MultinomialNB'] = {
            'accuracy': naivebayes_acc,
            'train-time': naivebayes.trainTime(),
            'test-time': naivebayes.testTime(),
        }

        naivebayes.train_3()
        naivebayes_labels_3 = naivebayes.predic()
        naivebayes_acc = naivebayes.getAccuracy()
        naivebayes.printResult()
        self.acc['naivebayes-ComplementNB'] = {
            'accuracy': naivebayes_acc,
            'train-time': naivebayes.trainTime(),
            'test-time': naivebayes.testTime(),
        }

Example #24
            contents = list(content)
            contents = map(lambda x: 'space' if x == ' ' else x, contents)
            wekaContent = ' '.join(contents)
            fname = 'weka/%c/%d.txt' % (label, i)
            writeFile(fname, wekaContent)
    print('Done')


labels = ['e', 'j', 's']
features = list(string.ascii_lowercase) + [' ']
df = pd.DataFrame(columns=features + ['label'])
for label in labels:
    for i in range(0, 10):
        fname = 'languageID/%c%d.txt' % (label, i)
        dfRow = getAttributes(fname)
        dfRow['label'] = label
        df = df.append(dfRow, ignore_index=True)
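        # note: DataFrame.append was removed in pandas 2.0; newer code should use pd.concat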
dfX = df.loc[:, features]
dfY = df.loc[:, 'label']
X = dfX.to_numpy()
y = dfY.to_numpy()

classifier = NaiveBayesClassifier(labels)
classifier.fit(X, y)
q1_3()
q4_6()
q7_8()
weka()

print('Done')
Example #25
class CuisineClassification:
    def __init__(self):
        self.bag = BagOfWords()
        self.nbc = NaiveBayesClassifier()

    def classify_check(self,
                       train_json,
                       train_ratio,
                       learn_ratio,
                       randomize=False):
        with Timer('Naive Bayes Classifier, Classify Check',
                   indent=0) as total_t:
            with Timer('Loading Recipes for Training') as t:
                with open(train_json) as train_file:
                    recipes = json.load(train_file)
                    if randomize:
                        random.shuffle(recipes)
                    train_size = int(len(recipes) * train_ratio)
                    test_size = int(len(recipes) * learn_ratio)
                    train_recipes = recipes[:train_size]
                    test_recipes = recipes[train_size:(train_size + test_size)]
                    t.update(
                        'Loaded {}(training) and {}(testing) recipes'.format(
                            len(train_recipes), len(test_recipes)))

            with Timer('Building Bag of Words') as t:
                self.bag.build_vocabulary(train_recipes)
                t.update('Built bag with {} sized vocabulary'.format(
                    self.bag.count))

            with Timer('Building Training Vectors') as t:
                train_vectors = self.bag.build_vectors(train_recipes)
                t.update('Built {} vectors'.format(len(train_vectors)))

            with Timer('Building Testing Vectors') as t:
                test_vectors = self.bag.build_vectors(test_recipes)
                t.update('Built {} vectors'.format(len(test_vectors)))

            with Timer('Training Bayes Classifier') as t:
                self.nbc.train(train_vectors)
                t.update('Trained with {} vectors'.format(
                    self.nbc.cuisine_total))

            with Timer('Making Predictions') as t:
                total = 0
                correct = 0
                for n in range(len(test_vectors)):
                    vector = test_vectors[n]
                    recipe = test_recipes[n]
                    prediction = self.nbc.classify(vector)
                    total += 1
                    if prediction == recipe['cuisine']:
                        correct += 1
                t.update(
                    'Finished {} predictions with accuracy of {:.1%}'.format(
                        total, correct / float(total)))
            total_t.update('Total')

    def classify_test(self, train_json, test_json, prediction_csv, scale):
        with Timer('Naive Bayes Classifier, Classify Test',
                   indent=0) as total_t:
            with Timer('Loading Recipes for Training') as t:
                with open(train_json) as train_file:
                    train_recipes = json.load(train_file)
                    train_recipes = train_recipes[:int(
                        len(train_recipes) * scale)]
                    t.update('Loaded {} training recipes'.format(
                        len(train_recipes)))

            with Timer('Loading Recipes for Testing') as t:
                with open(test_json) as test_file:
                    test_recipes = json.load(test_file)
                    test_recipes = test_recipes[:int(
                        len(test_recipes) * scale)]
                    t.update('Loaded {} testing recipes'.format(
                        len(test_recipes)))

            with Timer('Building Bag of Words') as t:
                self.bag.build_vocabulary(train_recipes)
                t.update('Built bag with {} sized vocabulary'.format(
                    self.bag.count))

            with Timer('Building Training Vectors') as t:
                train_vectors = self.bag.build_vectors(train_recipes)
                t.update('Built {} vectors'.format(len(train_vectors)))

            with Timer('Building Testing Vectors') as t:
                test_vectors = self.bag.build_vectors(test_recipes)
                t.update('Built {} vectors'.format(len(test_vectors)))

            with Timer('Training Bayes Classifier') as t:
                self.nbc.train(train_vectors)
                t.update('Trained with {} vectors'.format(
                    self.nbc.cuisine_total))

            with Timer('Writing Predictions') as t:
                predictions = 0
                with open(prediction_csv, "wt") as prediction_file:
                    writer = csv.writer(prediction_file)
                    writer.writerow(['id', 'cuisine'])

                    for n in range(len(test_vectors)):
                        predictions += 1
                        vector = test_vectors[n]
                        recipe_id = test_recipes[n]['id']
                        prediction = self.nbc.classify(vector)
                        writer.writerow([recipe_id, prediction])
                t.update('Wrote out {} predictions'.format(predictions))
            total_t.update('Total')
Example #26
# let coding = begin

from NaiveBayesClassifier import NaiveBayesClassifier

dataSet = [[182, 81.6, 30, 'masculin'], [180, 86.2, 28, 'masculin'],
           [170, 77.1, 30, 'masculin'], [180, 74.8, 25, 'masculin'],
           [152, 45.4, 15, 'feminin'], [168, 68.0, 20, 'feminin'],
           [165, 59.0, 18, 'feminin'], [165, 59.0, 23, 'feminin']]

test = [[183, 59, 20]]  # one unlabelled sample to classify

nbc = NaiveBayesClassifier(dataSet)
nbc.pridection(test)
Example #27
import re

import numpy as np
import xlrd

from NaiveBayesClassifier import NaiveBayesClassifier

v = re.compile(r'^[-+]?[0-9]+(\.[0-9]+)?$')  # matches an int or float literal

# load data
data = xlrd.open_workbook('../WTMLDataSet_3.0.xlsx')
table = data.sheet_by_name('WTML')

dataset = []
for i in range(table.nrows):
    line = table.row_values(i)
    dataset.append(line)
dataset = np.array(dataset)

xs = dataset[1:, 1:-1]  # feature columns (skip the header row and the id column)
ys = (dataset[1:, -1] == '否').astype(np.int32)  # '否' ("no") -> 1, i.e. a bad melon
isdiscs = np.array([not bool(v.match(val)) for val in xs[0]])  # non-numeric column -> discrete
labels = ['好瓜', '坏瓜']  # 'good melon', 'bad melon'

# build naive Bayes classifier
classifier = NaiveBayesClassifier(xs, isdiscs, ys)

# input
test_x = ['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', 0.697, 0.460]
print("Input: \t%ls" % test_x)

# output
test_y = classifier.classify(np.array(test_x))
print("Output: %s" % labels[test_y])
Example #28
        train_corpus[k] = lda_models[k][tmp_train]
        test_corpus[k] = lda_models[k][tmp_test]
        train_label[k] = labels[k][:int(len(labels[k]) * 0.8)]
        test_label[k] = labels[k][int(len(labels[k]) * 0.8):]

        train_dat[k] = [(format_vector_as_dict(d), l)
                        for (d,
                             l) in zip(train_corpus[k].corpus, train_label[k])]
        test_dat[k] = [(format_vector_as_dict(d), l)
                       for (d, l) in zip(test_corpus[k].corpus, test_label[k])]

    # training phase
    print('Start training Naive Bayes Classifier')
    for k in train_dat:
        classifier = NaiveBayesClassifier.train(train_dat[k])

        # test the accuracy
        print('Testing')
        results = classifier.batch_classify([fs for (fs, l) in test_dat[k]])
        correct = [l == r for ((fs, l), r) in zip(test_dat[k], results)]
        if correct:
            acc = float(sum(correct)) / len(correct)
        else:
            acc = 0

        print(k, acc)

    # translate feature
    # dataset = [(format_vector_as_dict(d), l) for (d, l) in zip(corpus, labels)]
Example #29
class MLBasedTokenizer:
    def __init__(self):
        self.NBC = NaiveBayesClassifier()
        self.LRC = LogisticRegression()
        abbreviations = []

        self.abbreviations = abbreviations[1:]
        self.punctuations1 = ['!', '?', ',', '"', '(', ')']  #Always token
        self.punctuations2 = ['.', ':']  #Ambiguous
        self.split_characters = [' ']

    def create_features(self, string_of_sentences):

        # String of sentences = all tokens made into a string.

        combined_sentences = string_of_sentences

        features = []
        length = len(combined_sentences)

        # For all inputs check for features
        # For all inputs create a feature list
        # features = (input number, feature #)
        # Think of every character as input
        # Check every character (except first and last one)

        for index in range(0, length - 1):
            # Features
            # - Is next character a splitter?
            # - Is next character a punctuation?
            # - Is character a punctuation?

            # - Is previous character a quotation mark or closing parenthesis?
            # - Is next character a quotation mark or opening parenthesis?

            is_next_char_splitter = combined_sentences[index + 1] in self.split_characters

            is_prev_char_punc = combined_sentences[index - 1] in self.punctuations1
            is_next_char_punc = combined_sentences[index + 1] in self.punctuations1

            is_prev_char_punc2 = combined_sentences[index - 1] in self.punctuations2
            is_next_char_punc2 = combined_sentences[index + 1] in self.punctuations2
            is_char_punc2 = combined_sentences[index] in self.punctuations2

            is_prev_char_numeric = combined_sentences[index - 1].isnumeric()
            is_char_numeric = combined_sentences[index].isnumeric()
            is_next_char_numeric = combined_sentences[index + 1].isnumeric()

            if index + 2 < length:
                is_next_next_char_numeric = combined_sentences[index + 2].isnumeric()
            else:
                is_next_next_char_numeric = is_char_numeric

            features.append([
                is_next_char_splitter, is_next_char_punc, is_prev_char_punc,
                is_next_char_punc2, is_prev_char_punc2, is_char_punc2,
                is_prev_char_numeric, is_next_char_numeric
            ])
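            # note: is_char_numeric and is_next_next_char_numeric are computed
            # but not included in the feature vector above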

        #print(np.array(features) * 1)
        return np.array(features) * 1

    def create_labels(self, string_of_sentences):  # list of sentences

        # Create labels for all inputs (characters)
        # Except first and last character.

        length = len(string_of_sentences)
        input_length = length - 1
        y = np.zeros(input_length)

        # Create labels based on whether the character is a split or not.

        split_positions = []
        for index in range(0, length - 1):
            if string_of_sentences[index] == ' ':
                split_positions.append(index - 1)
            if string_of_sentences[index] == ',':
                split_positions.append(index - 1)

            if string_of_sentences[index] == ')':
                split_positions.append(index - 1)
            if string_of_sentences[index] == '(':
                split_positions.append(index + 1)

            if string_of_sentences[index] == '.':
                if not string_of_sentences[index - 1].isnumeric():
                    if not string_of_sentences[index + 1].isnumeric():
                        split_positions.append(index - 1)

        # Insert labels to y:
        for position in split_positions:
            if position < input_length:
                y[position] = 1

        # Labeller check looks okay: for "merhaba." it says split,
        # for "22.12" it says do not split.

        return y

    def fit(self, string_of_sentences, model='NBC'):

        X = self.create_features(string_of_sentences)
        y = self.create_labels(string_of_sentences)

        if model == 'LogisticRegression':
            self.LRC.fit(X, y)
        else:
            self.NBC.fit(X, y)
        return X, y

    def predict(self, string_of_sentences, model='NBC'):
        X = self.create_features(string_of_sentences)
        if model == 'LogisticRegression':
            preds = self.LRC.predict(X)
        else:
            preds = self.NBC.predict(X)
        return [int(i) for i in preds]

    def split_to_tokens(self, string_of_sentences, model='NBC'):

        X = self.create_features(string_of_sentences)
        splitted_sentences = string_of_sentences

        if model == 'LogisticRegression':
            preds = self.LRC.predict(X)
        else:
            preds = self.NBC.predict(X)

        # converting boolean predictions into actual splitted tokens

        length = len(preds)  # Equals to input length
        split_locations = [0]
        tokens = []

        for index in range(length):
            if preds[index] == 1:
                split_locations.append(index + 1)
                splitted_sentences = (splitted_sentences[:index + 1] + '*' +
                                      splitted_sentences[index + 2:])

            if index == length - 1 and preds[index] == 0:
                split_locations.append(index + 2)

        length = len(split_locations)

        for index in range(length - 1):
            token = string_of_sentences[split_locations[index]:split_locations[index + 1]]
            if token == ' ':
                token = string_of_sentences[split_locations[index] + 1:
                                            split_locations[index + 1] + 1]
                split_locations[index + 1] += 1
            tokens.append(token)

        # Check for the last character
        if preds[-1] == 1:
            tokens.append(string_of_sentences[-1])

        return tokens
Example #30
###########################
# LSA
l = LSA(MAX_GRAM, MIN_FREQ, P_EIG, f.x)
print("Parameters: Min_freq =", l.min_freq,"NGram_max =", l.ngram_max, "P_eig =", l.p_eig*100)
human_keywords = l.manage_keywords(f.keywords)
lsa_results = l.train_phrases(human_keywords)
#n_labels = [len(list(group)) for key, group in groupby(f.y)]

print("LSA Results computed.")
sets = Set(lsa_results, f.y, f.x)
for i in range(len(sets.x_train)):
    #error_per_class = numpy.zeros(22)
    #errors = 0
    ###########################
    # NAIVE BAYES
    naive = NaiveBayesClassifier(alpha=ALPHA)
    naive.train(numpy.array(sets.x_train[i]), sets.y_train[i])
    test_score.append(naive.test_score(numpy.array(sets.x_test[i]),
                                       numpy.array(sets.y_test[i])))
    for j in range(len(sets.x_test[i])):
        predicted_class = f.search_for_phrase(naive, sets.x_test[i][j])
        r_class = sets.y_test[i][j]
        if r_class != predicted_class:
            if r_class in (4, 5, 6, 7, 9, 10, 11, 12, 14):
                print(numpy.round(naive.all_classes_result(sets.x_test[i][j]), 2))
                for o in range(len(sets.x_test[i][j])):
                    if sets.x_test[i][j][o] > 0.1:
                        print("Prob of term " + l.features_utterance[o] + " of real class:",
                              math.exp(naive.classifier.feature_log_prob_[r_class][o]))
                        print("Prob of term " + l.features_utterance[o] + " of predicted class:",
                              math.exp(naive.classifier.feature_log_prob_[predicted_class][o]))
                        print(sets.x_test[i][j][o])
                        print('\n')
                #print(l.features_utterance[o])
Example #31
# Spencer Barton
# 10-601
# Naive Bayes Classifier

from NaiveBayesClassifier import NaiveBayesClassifier
import sys

#===============================================
# Script
#===============================================

TRAIN_FILE_NAME = sys.argv[1]

NB = NaiveBayesClassifier()
NB.train(TRAIN_FILE_NAME)
topWords = NB.getSortedWords()

N = 20
for i in range(0, N):
    pair = topWords['lib'][i]
    print(pair[0], round(pair[1], 4))

print()

for i in range(0, N):
    pair = topWords['con'][i]
    print(pair[0], round(pair[1], 4))
Example #32
def cleanText(read_data):
    # remove special characters contained in the text
    text = re.sub('[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》]', '',
                  read_data).replace('\n', '').replace('\t', '')
    return text


def get_test_file():
    path_dir = '/Users/sinsuung/Workspace/Python/unstructured_data_final_project/corpus/test/'
    file_list = os.listdir(path_dir)  # get the list of files in the path
    file_list.sort()  # sort by file name

    for i in file_list:
        with open(path_dir + i) as f:
            test_list.append(f.read())


if __name__ == "__main__":
    model = NaiveBayesClassifier()
    df = pd.read_csv(
        '/Users/sinsuung/Workspace/Python/unstructured_data_final_project/corpus/dev/out/result.csv',
        delimiter=',',
        header=None,
        names=['LABLE', 'CONTENT'],
        encoding='utf-8')
    model.train(df)
    test_list = []
    get_test_file()

    for i in test_list:
        model.category_probability(i)