示例#1
0
    def __init__(self, db_pool=None):
        """Bind a database connection source.

        Uses *db_pool* when one is supplied; otherwise falls back to a
        throw-away direct connection built from the DB_* environment
        variables (raises KeyError if any of them is missing).
        """
        self.pool = None
        self.conn = None

        if db_pool is not None:
            # preferred path: reuse the shared pool
            self.pool = db_pool
            return

        Logger.warn('can not get database connection pool from env')
        Logger.info(
            'create a new temporary database connection for this call')
        self.conn = Connection(
            host=os.environ['DB_HOST'],
            user=os.environ['DB_USER'],
            password=os.environ['DB_PWD'],
            database=os.environ['DB_NAME'],
        )
class NaiveBayers():
    """
    Unigram Naive Bayes sentiment classifier.

    Counts word frequencies from labelled benchmark sentences plus the
    rt-polarity corpora, then classifies sentences as positive/negative
    using Laplace-smoothed conditional probabilities.

    NOTE(review): the class name keeps the original (misspelled) form
    'NaiveBayers' so existing callers keep working.
    """

    def __init__(self, verbose=True, training_cases=2500, testcases=500):
        """
        :param verbose: when True, progress messages are printed by _print.
        :param training_cases: number of labelled sentences used for training.
        :param testcases: number of labelled sentences held out for testing.
        """
        self.verbose = verbose
        self.training_cases = training_cases
        self.testcases = testcases
        self.training = list()
        self.test = list()
        # word -> [count in positive sentences, count in negative sentences]
        self.frequency = dict()
        self.stop_words = self.get_stop_words()
        self.positive_words = 0
        self.negative_words = 0
        self.positive_sentence_count = 0
        self.negative_sentence_count = 0
        self.total_sentences = 0
        self.logger = Logger('NaiveBayers', 'NaiveBayers.log')
        # labelled benchmark files, one "<sentence>\t<label>" per line
        # NOTE(review): Windows-style relative paths — confirm target platform.
        self.filenames = [
            'res\\benchmark\\yelp_labelled.txt',
            'res\\benchmark\\amazon_cells_labelled.txt',
            'res\\benchmark\\imdb_labelled.txt'
        ]

    def _print(self, message):
        """Print *message* only when verbose mode is enabled."""
        if self.verbose:
            print(message)

    def clean(self, sentence):
        """Lower-case *sentence* and blank out punctuation/control characters."""
        ignore_characters = '''\t\n&"`~@#$%^*;+=<>//.,()[]{}:;!?'''
        sentence = self.replace_characters(sentence, ignore_characters)
        return sentence.lower().strip()

    def tokenise(self, sentence):
        """Clean *sentence* and split it into a list of non-empty tokens."""
        sentence = self.clean(sentence)
        tokens = sentence.split(' ')
        # drop the empty strings produced by consecutive separators
        return [token for token in tokens if token.strip()]

    def replace_characters(self, text, characters):
        """Replace every character of *characters* found in *text* with a space."""
        for char in characters:
            text = text.replace(char, ' ')
        return text

    def get_data(self):
        """Load the benchmark files and split them into training/test sets."""
        data = list()
        for filename in self.filenames:
            self._print("Filename : " + filename)
            for datum in tqdm(self.load_data_from_file(filename)):
                sentence, label = datum.split('\t')
                data.append([self.clean(sentence), int(label)])
        self.training = data[:self.training_cases]
        # NOTE(review): training and test slices overlap when
        # len(data) < training_cases + testcases — confirm this is intended.
        self.test = data[-self.testcases:]

    def load_data_from_file(self, filename, encoding="utf8"):
        """Return the list of raw lines of *filename*."""
        with open(filename, encoding=encoding) as file:
            return file.readlines()

    def get_kgrams(self, sentence, k=1):
        """Return the list of k-grams (length-k slices) of *sentence*.

        When k exceeds len(sentence) a single, shorter gram is returned,
        matching the historical behaviour of this method.
        """
        grams = list()
        for i in range(len(sentence)):
            grams.append(sentence[i:i + k])
            if i + k >= len(sentence):
                break
        return grams

    def train(self):
        """Count unigram frequencies from every corpus and derive probabilities."""
        self.find_frequency_unigrams()
        self.train_from_negative_sentences()
        self.train_from_positive_sentences()
        self.find_probablility_unigrams()
        self.logger.info("Training completed")
        self.logger.info("Number of positive sentences : " +
                         str(self.positive_sentence_count))
        self.logger.info("Number of negative sentences : " +
                         str(self.negative_sentence_count))

    def classify(self, sentence):
        """
        Classify *sentence*.

        :returns: ("positive", 1), ("negative", 0), or ("neutral", -1)
                  when both class scores are exactly equal.
        """
        sentence = self.preprocess(sentence)
        # start from the class priors
        positive_probablity = self.positive_sentence_count / self.total_sentences
        negative_probablity = self.negative_sentence_count / self.total_sentences
        self.logger.debug("sentence : " + str(sentence))
        self.logger.debug("words considered : ")
        for word in sentence:
            word = word[0]  # preprocess yields 1-grams: unwrap the token
            word_positive_probability, word_negative_probability = 1, 1
            if word in self.probablility:
                word_positive_probability, word_negative_probability = self.probablility[
                    word]
                self.logger.debug("word : " + word +
                                  " word_positive_probability : " +
                                  str(word_positive_probability) +
                                  " word_negative_probability : " +
                                  str(word_negative_probability))
            positive_probablity *= word_positive_probability
            negative_probablity *= word_negative_probability

        self.logger.debug("positive_probablity : " + str(positive_probablity))
        self.logger.debug("negative_probablity : " + str(negative_probablity))

        if positive_probablity > negative_probablity:
            self.logger.debug("sentence is positive")
            return ("positive", 1)
        if negative_probablity > positive_probablity:
            self.logger.debug("sentence is negative")
            return ("negative", 0)
        # BUG FIX: the original returned None on a tie, which crashed
        # test_classifier when unpacking the result tuple.
        self.logger.debug("sentence is neutral")
        return ("neutral", -1)

    def test_classifier(self):
        """Classify the held-out test set and log accuracy statistics."""
        correct, wrong = 0, 0
        total = len(self.test)
        for sentence, actual_label in self.test:
            verdict, label = self.classify(sentence)
            if label == actual_label:
                correct += 1
            else:
                wrong += 1

        self.logger.info("correct : " + str(correct))
        self.logger.info("wrong : " + str(wrong))
        self.logger.info("total : " + str(total))
        self.logger.info("accuracy : " + str(int((correct / total) * 100)))

    def get_stop_words(self):
        """Load the English stop-word list as a set."""
        data = self.load_data_from_file('res\\eng_stop_words.txt')
        return set([datum.replace('\n', '') for datum in data])

    def remove_stop_words(self, sentence):
        """Return the tokens of *sentence* that are not stop words."""
        return [word for word in sentence if word not in self.stop_words]

    def find_probablility_unigrams(self):
        """
        Compute Laplace-smoothed P(word|positive) and P(word|negative) for
        every known word into self.probablility (original attribute
        spelling kept for backward compatibility).
        """
        self.probablility = dict()
        vocabulary_size = len(self.frequency)  # loop invariant, hoisted
        for word in self.frequency:
            positive_probablity = (self.frequency[word][0] + 1) / (
                self.positive_words + vocabulary_size)
            negative_probablity = (self.frequency[word][1] + 1) / (
                self.negative_words + vocabulary_size)
            self.probablility[word] = [
                positive_probablity, negative_probablity
            ]

    def preprocess(self, sentence):
        """Tokenise *sentence* and wrap its tokens as 1-grams."""
        sentence = self.tokenise(sentence)
        return self.get_kgrams(sentence, k=1)

    def _count_unigrams(self, sentence, label):
        """
        Add every token of a preprocessed *sentence* (list of 1-grams) to
        the frequency table under *label* (1 = positive, 0 = negative).

        Extracted helper: this loop used to be duplicated four times.
        """
        for word in sentence:
            word = word[0]
            if word not in self.frequency:
                self.frequency[word] = [0, 0]
            if label == 1:
                self.frequency[word][0] += 1
                self.positive_words += 1
            else:
                self.frequency[word][1] += 1
                self.negative_words += 1

    def train_from_negative_sentences(self):
        """Count unigrams from the negative rt-polarity corpus."""
        negative_files = ['res\\rt-polaritydata\\rt-polarity-neg.txt']
        # BUG FIX: previously only the LAST file's sentences survived the
        # loop; accumulate all of them (identical for a single file).
        new_sentences = list()
        for filename in negative_files:
            new_sentences.extend(self.load_data_from_file(filename))

        for sentence in new_sentences:
            sentence = self.preprocess(sentence)
            self.negative_sentence_count += 1
            # BUG FIX: total_sentences must grow with every training
            # sentence, otherwise the class priors in classify() exceed 1.
            self.total_sentences += 1
            self._count_unigrams(sentence, label=0)

    def train_from_positive_sentences(self):
        """Count unigrams from the positive rt-polarity corpus."""
        positive_files = ['res\\rt-polaritydata\\rt-polarity-pos.txt']
        new_sentences = list()
        for filename in positive_files:
            new_sentences.extend(self.load_data_from_file(filename))

        for sentence in new_sentences:
            sentence = self.preprocess(sentence)
            self.positive_sentence_count += 1
            self.total_sentences += 1  # keep priors consistent (see above)
            self._count_unigrams(sentence, label=1)

    def find_frequency_unigrams(self):
        """Count unigrams from the labelled benchmark training split."""
        for sentence, label in self.training:
            self.total_sentences += 1
            sentence = self.preprocess(sentence)
            if label == 1:
                # positive sentence
                self.positive_sentence_count += 1
                self._count_unigrams(sentence, label=1)
            elif label == 0:
                # negative sentence
                self.negative_sentence_count += 1
                self._count_unigrams(sentence, label=0)
示例#3
0
class BagOfWordSentiment():
    """
    Lexicon ("bag of words") sentiment classifier.

    Scores sentences by matching k-grams against word lists of positive,
    negative, negation and double-negation phrases loaded from disk.
    """

    def __init__(self, no_of_grams=4, verbose=True, no_of_testcases=1000):
        """
        :param no_of_grams: largest k-gram length considered when scoring.
        :param verbose: forwarded to the logger.
        :param no_of_testcases: size of the benchmark test set.
        """
        self.verbose = verbose
        self.logger = Logger('BagOfWordSentiment',
                             'logs\\bag_of_words.log',
                             is_verbose=self.verbose)

        self.no_of_grams = no_of_grams

        self.double_negations, self.double_negations_collection = set(), set()
        self.negations, self.negation_collection = set(), set()
        self.positive_words, self.positive_word_collection = set(), set()
        self.negative_words, self.negative_word_collection = set(), set()

        self.no_of_testcases = no_of_testcases
        self.positve_test_bag = list()
        self.negative_test_bag = list()

        # lazily-built stop-word cache, see remove_stop_words()
        self._stop_word_set = None

    def ready(self):
        """Load every word list; must be called before classify()."""
        self.logger.info("Bag of words loading")
        self.load_data()
        self.logger.info("Bag of words ready")

    def classify(self, sentence):
        '''
        classifies the sentence to positve or negative or neutral using bag of words method
        '''
        positive_score, negative_score = self.find_score(sentence)

        if positive_score > negative_score:
            self.logger.info("sentence - " + sentence + " - is positive")
            return ("positive", 1, positive_score)

        if positive_score < negative_score:
            self.logger.info("sentence - " + sentence + " - is negative")
            return ("negative", 0, negative_score)

        if positive_score == negative_score:
            self.logger.info("sentence - " + sentence + " - is neutral")
            return ("neutral", -1, positive_score)

    def find_score(self, sentence):
        '''
        finds positive and negative score for a given sentence
        '''
        # NOTE: this method deliberately flips `sentence` between its token
        # list and space-joined string forms; matched phrases are blanked
        # out of the string so they are not counted twice. The statement
        # order is load-bearing — change with care.
        positive_score, negative_score = 0, 0
        self.logger.info("sentence : " + sentence)
        sentence = self.tokenise(sentence)
        self.logger.info("tokenised sentence after cleaning : " +
                         str(sentence))

        # longest k-grams first so multi-word phrases win over their parts
        kgrams = list()
        for k in range(self.no_of_grams, 0, -1):
            kgrams.extend(self.get_kgrams(sentence, k))

        for kgram in kgrams:
            phrase = ' '.join(kgram)
            sentence = ' '.join(sentence)

            if phrase in sentence:
                self.logger.info("considering phrase '" + phrase + "' from '" +
                                 sentence + "'")
                #check this phrase for double negation
                contains_double_negation, remaining_phrase = self.is_double_negation(
                    phrase)
                if contains_double_negation:
                    if self.is_positive(remaining_phrase):
                        positive_score += 1
                        sentence = sentence.replace(phrase, ' ')
                        sentence = self.tokenise(sentence)
                        self.logger.info(
                            "double negation of positive phrase : " + phrase)
                        continue
                    if self.is_negative(remaining_phrase):
                        negative_score += 1
                        sentence = sentence.replace(phrase, ' ')
                        sentence = self.tokenise(sentence)
                        self.logger.info(
                            "double negation of negative phrase : " + phrase)
                        continue

                #check this phrase for negations
                contains_negation, remaining_phrase = self.is_negation(phrase)
                if contains_negation:
                    if self.is_positive(remaining_phrase):
                        negative_score += 1
                        sentence = sentence.replace(phrase, ' ')
                        sentence = self.tokenise(sentence)
                        self.logger.info("negation of positive phrase : " +
                                         phrase)
                        continue
                    if self.is_negative(remaining_phrase):
                        positive_score += 1
                        sentence = sentence.replace(phrase, ' ')
                        sentence = self.tokenise(sentence)
                        self.logger.info("negation of negative phrase : " +
                                         phrase)
                        continue

                #check for positive phrase
                if self.is_positive(phrase):
                    positive_score += 1
                    sentence = sentence.replace(phrase, ' ')
                    sentence = self.tokenise(sentence)
                    self.logger.info("positive phrase : " + phrase)
                    continue

                #check for negative phrase
                if self.is_negative(phrase):
                    negative_score += 1
                    sentence = sentence.replace(phrase, ' ')
                    sentence = self.tokenise(sentence)
                    self.logger.info("negative phrase : " + phrase)
                    continue

                self.logger.info("cannot deduce sentiment from phrase '" +
                                 phrase + "'")
            sentence = self.tokenise(sentence)

        return positive_score, negative_score

    def is_double_negation(self, phrase):
        '''
        checks whether a word is in bag of double negations
        '''
        # exact double negations must be followed by a space (whole-word prefix)
        for double_negation in self.double_negations:
            double_negation = double_negation + " "
            if phrase.startswith(double_negation):
                remaining_phrase = phrase.replace(double_negation, '')
                return True, remaining_phrase

        # collection entries are word-stem prefixes; keep the trailing words
        for double_negation in self.double_negations_collection:
            if phrase.startswith(double_negation):
                phrase_length = len(phrase.split(" "))
                double_negation_length = len(double_negation.split(" "))
                diff = phrase_length - double_negation_length

                if diff <= 0:
                    return False, phrase

                remaining_phrase = ' '.join(phrase.split(" ")[-diff:])
                return True, remaining_phrase
        return False, phrase

    def is_negation(self, phrase):
        '''
        checks whether a word is in bag of negations
        '''
        for negation in self.negations:
            negation = negation + " "
            if phrase.startswith(negation):
                remaining_phrase = phrase.replace(negation, '')
                return True, remaining_phrase

        for negation in self.negation_collection:
            if phrase.startswith(negation):
                phrase_length = len(phrase.split(" "))
                negation_length = len(negation.split(" "))
                diff = phrase_length - negation_length

                if diff <= 0:
                    return False, phrase

                remaining_phrase = ' '.join(phrase.split(" ")[-diff:])
                return True, remaining_phrase
        return False, phrase

    def is_positive(self, word):
        '''
        checks whether a word is in bag of positive words
        '''
        if word in self.positive_words:
            return True
        for positive_word in self.positive_word_collection:
            if word.startswith(positive_word):
                return True
        return False

    def is_negative(self, word):
        '''
        checks whether a word is in bag of negative words
        '''
        if word in self.negative_words:
            return True
        for negative_word in self.negative_word_collection:
            if word.startswith(negative_word):
                return True
        return False

    def get_kgrams(self, sentence, k=1):
        '''
        return list of kgrams from a given sentence
        '''
        grams = list()
        for i in range(len(sentence)):
            grams.append(sentence[i:i + k])
            if i + k >= len(sentence):
                break
        return grams

    def load_data(self):
        '''
        loads the data necessary for analysis
        '''
        double_negation_files = [
            'res\\bag_of_words_dataset\\double_negation.txt'
        ]
        negations_files = ['res\\bag_of_words_dataset\\negation.txt']
        positive_word_files = ['res\\bag_of_words_dataset\\positive_words.txt']
        negative_word_files = ['res\\bag_of_words_dataset\\negative_words.txt']

        self.double_negations, self.double_negations_collection = self.get_words(
            self.load_data_from_files(double_negation_files))
        self.negations, self.negation_collection = self.get_words(
            self.load_data_from_files(negations_files))
        self.positive_words, self.positive_word_collection = self.get_words(
            self.load_data_from_files(positive_word_files))
        self.negative_words, self.negative_word_collection = self.get_words(
            self.load_data_from_files(negative_word_files))

        self.logger.info("words loaded")
        self.logger.info("double negations : " + str(
            len(self.double_negations) +
            len(self.double_negations_collection)))
        self.logger.info(
            "negations : " +
            str(len(self.negations) + len(self.negation_collection)))
        self.logger.info(
            "positive words : " +
            str(len(self.positive_words) + len(self.positive_word_collection)))
        self.logger.info(
            "negative words : " +
            str(len(self.negative_words) + len(self.negative_word_collection)))

    def get_words(self, input_words):
        '''
        cleans the input words and group them into set of words and
        set of mulitple word set(words that have different forms)
        '''
        words = set()
        multiple_words = set()
        for word in input_words:
            word = word.replace('\n', '').replace('(1)', '').replace("'", '')
            word = word.replace('_', ' ').replace('-', ' ').strip().lower()
            if '*' in word:
                # entries marked with '*' are stems matching many word forms
                word = word.replace('*', '')
                multiple_words.add(word.strip())
                continue
            words.add(word)
        return words, multiple_words

    def tokenise(self, sentence):
        '''
        split the sentence into words
        '''
        sentence = self.clean(sentence)
        tokens = sentence.split(' ')
        filtered_tokens = list()
        for token in tokens:
            if len(token.strip()) != 0:
                filtered_tokens.append(token)
        return filtered_tokens

    def clean(self, sentence):
        '''
        clean the sentence by removing ignored characters
        '''
        ignore_characters = '''\t\n&"`~@#$%^*;+=<>//.,()[]{!}?:;_-'''
        sentence = sentence.lower().strip()
        sentence = self.remove_stop_words(sentence)
        sentence = self.replace_characters(sentence, ignore_characters)
        sentence = sentence.replace("'", '')
        return sentence.lower().strip()

    def remove_stop_words(self, sentence):
        """Drop stop words from a space-separated *sentence* string."""
        # PERF FIX: the stop-word file used to be re-read and re-parsed on
        # EVERY call (i.e. once per clean()/tokenise(), many times per
        # classify()). Load it once and cache the resulting set.
        if getattr(self, '_stop_word_set', None) is None:
            stop_words = self.load_data_from_files(
                ['res\\bag_of_words_dataset\\refined_stop_words.txt'])
            self._stop_word_set = set()
            for stop_word in stop_words:
                self._stop_word_set.add(
                    stop_word.replace('\n', '').replace('\t', '').strip())
        sentence = sentence.split(" ")
        new_sentence = list()
        for word in sentence:
            if word not in self._stop_word_set:
                new_sentence.append(word)
        return ' '.join(new_sentence)

    def replace_characters(self, text, characters):
        '''
        replace the specified characters from text to blank spaces
        '''
        for char in characters:
            text = text.replace(char, ' ')
        return text

    def load_data_from_files(self, filenames, encoding="utf8"):
        '''
        load the data as a list from the specified filenames
        '''
        data = list()
        for filename in filenames:
            with open(filename, encoding=encoding) as file:
                data.extend(file.readlines())
        return data

    def find_accuracy(self):
        """Run the benchmark test set and log/return accuracy statistics."""
        self.load_test_cases()
        self.create_test_set()

        correct, wrong = 0, 0
        total = len(self.positve_test_bag) + len(self.negative_test_bag)

        _correct, _wrong = self.test_for_bag(self.positve_test_bag,
                                             actual_result=1)
        correct += _correct
        wrong += _wrong

        _correct, _wrong = self.test_for_bag(self.negative_test_bag,
                                             actual_result=0)
        correct += _correct
        wrong += _wrong

        self.accuracy = (correct / total) * 100
        self.logger.info("total test sentences : " + str(total))
        self.logger.info("correct output : " + str(correct))
        self.logger.info("wrong output : " + str(wrong))
        self.logger.info("accuracy (%) : " + str(int(self.accuracy)))
        return (self.accuracy, total, correct, wrong)

    def test_for_bag(self, bag, actual_result):
        """Classify every sentence of *bag*, expecting *actual_result*."""
        self.logger.is_verbose = False  # silence per-sentence logging
        correct, wrong = 0, 0
        for sentence in bag:
            result = self.classify(sentence=sentence)
            if result[1] == actual_result:
                correct += 1
            else:
                wrong += 1
        self.logger.is_verbose = True
        self.logger.debug("total test sentences in bag : " + str(len(bag)))
        self.logger.debug("correct output : " + str(correct))
        self.logger.debug("wrong output : " + str(wrong))
        self.logger.debug("accuracy (%) : " +
                          str(int((correct / len(bag)) * 100)))
        return correct, wrong

    def create_test_set(self):
        '''
        randomly selects test sentences from positive and negative bags and making a uniform distribution of test sentences
        '''
        from numpy import random as np_random
        count = self.no_of_testcases // 2
        while (count != 0):
            # FIX: np.random.random_integers is deprecated/removed in
            # modern NumPy; randint's upper bound is exclusive, so this is
            # equivalent to random_integers(low=0, high=len(bag) - 1).
            index = np_random.randint(0, len(self.positive_bag))
            self.positve_test_bag.append(self.positive_bag.pop(index))
            index = np_random.randint(0, len(self.negative_bag))
            self.negative_test_bag.append(self.negative_bag.pop(index))
            count -= 1

        self.logger.info("test sentences selected")
        self.logger.info(
            "Total sentences for testing : " +
            str(len(self.positve_test_bag) + len(self.negative_test_bag)))
        self.logger.info("positive sentences for testing : " +
                         str(len(self.positve_test_bag)))
        self.logger.info("negative sentences for testing : " +
                         str(len(self.negative_test_bag)))

    def load_test_cases(self):
        '''
        loads the positive and negative sentences from filenames specified
        '''
        mixed_bag_paths = [
            'res\\dataset\\uci_dataset\\yelp_labelled.txt',
            'res\\dataset\\uci_dataset\\amazon_cells_labelled.txt',
            'res\\dataset\\uci_dataset\\imdb_labelled.txt'
        ]

        #followed training sets contain hard testcases
        positive_bag_paths = [
            'res\\dataset\\polarity_dataset\\rt-polarity-pos.txt'
        ]
        # FIX: backslashes doubled — '\d' and '\p' were invalid escape
        # sequences (SyntaxWarning); the runtime value is unchanged.
        negative_bag_paths = [
            'res\\dataset\\polarity_dataset\\rt-polarity-neg.txt'
        ]
        #uncomment below two lines not to include difficult testcases
        # positive_bag_paths = []
        # negative_bag_paths = []

        self.positive_bag, self.negative_bag = list(), list()
        count_positive, count_negative = 0, 0
        for filename in mixed_bag_paths:
            for mixed_data in self.load_data_from_files([filename]):
                sentence, label = mixed_data.split('\t')
                label = int(label)
                if label == 1:  #if sentence is positive
                    self.positive_bag.append(sentence)
                    count_positive += 1
                else:
                    self.negative_bag.append(sentence)
                    count_negative += 1
        self.logger.debug("sentences from mixed bag imported")
        self.logger.debug("positive sentences : " + str(count_positive))
        self.logger.debug("negative sentences : " + str(count_negative))

        count_positive = 0
        for filename in positive_bag_paths:
            for sentence in self.load_data_from_files([filename]):
                self.positive_bag.append(sentence)
                count_positive += 1
        self.logger.debug("sentences from positive bag imported")
        self.logger.debug("positive sentences : " + str(count_positive))

        count_negative = 0
        for filename in negative_bag_paths:
            for sentence in self.load_data_from_files([filename]):
                self.negative_bag.append(sentence)
                count_negative += 1
        self.logger.debug("sentences from negative bag imported")
        self.logger.debug("negative sentences : " + str(count_negative))

        self.logger.debug("sentences imported")
        self.logger.debug("Total sentences : " +
                          str(len(self.positive_bag) + len(self.negative_bag)))
        self.logger.debug("positive sentences : " +
                          str(len(self.positive_bag)))
        self.logger.debug("negative sentences : " +
                          str(len(self.negative_bag)))
示例#4
0
class Comparer():
    """
    Benchmarks the project's two classifiers (Naive Bayes and bag of
    words) against the textblob library on a shared set of test
    sentences, then writes the outcome to JSON and an HTML report.
    """

    def __init__(self, no_of_testcases=100, verbose=True, nb=None, bw=None):
        """
        :param no_of_testcases: number of test sentences requested.
        :param verbose: forwarded to the logger.
        :param nb: optional pre-built NaiveBayes instance.
        :param bw: optional pre-built BagOfWordSentiment instance.
        """
        self.logger = Logger('Comparer',
                             'logs\\comparer.log',
                             is_verbose=verbose)
        self.load_html_structure()

        self.nb = nb
        if self.nb is None:
            # no classifier supplied: build and prepare one ourselves
            self.nb = NaiveBayes(verbose=False,
                                 test_set_count=no_of_testcases,
                                 no_of_grams=4)
            self.nb.ready()
        else:
            self.nb.logger.is_verbose = False

        self.bw = bw
        if self.bw is None:
            self.bw = BagOfWordSentiment(verbose=False, no_of_grams=4)
            self.bw.ready()
        else:
            self.bw.logger.is_verbose = False

        self.no_of_testcases = no_of_testcases
        self.nb_correct, self.bw_correct, self.tb_correct = 0, 0, 0
        self.nb_wrong, self.bw_wrong, self.tb_wrong = 0, 0, 0
        self.nb_accuracy, self.bw_accuracy, self.tb_accuracy = 0, 0, 0

        self.counter = 0
        self.testcases = dict()

    def ready(self):
        """Fetch the shared test bags from the Naive Bayes classifier."""
        self.positive_test_bag = self.nb.get_positive_test_bag()
        self.negative_test_bag = self.nb.get_negative_test_bag()

    def compare(self):
        '''
        compares sentiment analysis done through Naive Bayes and bag of words method
        with popular text processing library textblob.
        '''
        self.test_for_bag(self.positive_test_bag, 1)
        self.test_for_bag(self.negative_test_bag, 0)

        total = len(self.testcases)
        self.nb_accuracy = (self.nb_correct / total) * 100
        self.bw_accuracy = (self.bw_correct / total) * 100
        self.tb_accuracy = (self.tb_correct / total) * 100

        self.logger.info("Naive Bayes classifier")
        self.logger.info("Correct classification : " + str(self.nb_correct))
        self.logger.info("Wrong classification : " + str(self.nb_wrong))
        self.logger.info("Accuracy classification : " +
                         str(int(self.nb_accuracy)))

        self.logger.info("Bag of Words classifier")
        self.logger.info("Correct : " + str(self.bw_correct))
        self.logger.info("Wrong classification : " + str(self.bw_wrong))
        self.logger.info("Accuracy : " + str(int(self.bw_accuracy)))

        self.logger.info("textblob classifier")
        self.logger.info("Correct : " + str(self.tb_correct))
        self.logger.info("Wrong classification : " + str(self.tb_wrong))
        self.logger.info("Accuracy : " + str(int(self.tb_accuracy)))

        # fill the summary placeholders of the HTML report
        substitutions = {
            "@nb_right": str(self.nb_correct),
            "@bw_right": str(self.bw_correct),
            "@tb_right": str(self.tb_correct),
            "@nb_wrong": str(self.nb_wrong),
            "@bw_wrong": str(self.bw_wrong),
            "@tb_wrong": str(self.tb_wrong),
            "@nb_accuracy": str(int(self.nb_accuracy)),
            "@bw_accuracy": str(int(self.bw_accuracy)),
            "@tb_accuracy": str(int(self.tb_accuracy)),
            "@total_sentences": str(len(self.testcases)),
        }
        for placeholder, value in substitutions.items():
            self.file_html = self.file_html.replace(placeholder, value)

        # append the per-classifier summaries to the result dictionary
        for key, correct, wrong, accuracy in (
                ("nb_results", self.nb_correct, self.nb_wrong,
                 self.nb_accuracy),
                ("bw_results", self.bw_correct, self.bw_wrong,
                 self.bw_accuracy),
                ("tb_results", self.tb_correct, self.tb_wrong,
                 self.tb_accuracy)):
            self.testcases[key] = {
                "correct": correct,
                "wrong": wrong,
                "accuracy": accuracy
            }

        self.store_results()

    def store_results(self):
        """Persist the comparison results as JSON and as an HTML report."""
        with open('output\\comparison_data.json', 'w',
                  encoding="utf-8") as file_pointer:
            json.dump(self.testcases, file_pointer)
        with open('output\\output.html', 'w',
                  encoding="utf-8") as file_pointer:
            file_pointer.write(self.file_html)

    def test_for_bag(self, bag, actual_result):
        """Run every sentence of *bag* through all three classifiers,
        recording each verdict and tallying correct/wrong counts against
        *actual_result*."""
        for sentence in bag:
            sentence = ' '.join(sentence)
            nb_result = self.nb.classify(sentence)
            bw_result = self.bw.classify(sentence)
            tb_result = self.classify_using_textblob(sentence)
            self.counter += 1
            self.testcases[self.counter] = {
                "sentence": sentence,
                "actual": actual_result,
                "nb_result": list(nb_result),
                "bw_result": list(bw_result),
                "tb_result": list(tb_result)
            }

            # build one report row from the template
            row = self.html_structure
            row_fields = {
                "@sentence": str(sentence),
                "@actual_label": str(actual_result),
                "@nb_prediction": str(nb_result[1]),
                "@bw_prediction": str(bw_result[1]),
                "@tb_prediction": str(tb_result[1]),
                "@nb_label": str(nb_result[0]),
                "@bw_label": str(bw_result[0]),
                "@tb_label": str(tb_result[0]),
                "@nb_score": str(nb_result[2]),
                "@bw_score": str(bw_result[2]),
                "@tb_score": str(tb_result[2]),
            }
            for placeholder, value in row_fields.items():
                row = row.replace(placeholder, value)
            self.file_html += row

            if nb_result[1] == actual_result:
                self.nb_correct += 1
            else:
                self.nb_wrong += 1

            if bw_result[1] == actual_result:
                self.bw_correct += 1
            else:
                self.bw_wrong += 1

            if tb_result[1] == actual_result:
                self.tb_correct += 1
            else:
                self.tb_wrong += 1

    def classify_using_textblob(self, sentence):
        '''
        classifies the sentence using textblob library
        '''
        polarity = TextBlob(sentence).sentiment[0]
        if polarity > 0:
            return ("positive", 1, polarity)
        if polarity < 0:
            return ("negative", 0, polarity)
        return ("neutral", -1, polarity)

    def load_html_structure(self):
        """Load the HTML templates: the per-row structure and the report
        header that rows are appended to."""
        with open('res\\table_structure.html', 'r') as template_file:
            self.html_structure = template_file.read()

        with open('res\\table_header.html', 'r') as template_file:
            self.file_html = template_file.read()
示例#5
0
class NaiveBayes():
    '''
    implementation of Naive Bayes classifer

    Multinomial Naive Bayes over word k-grams (k = 1 .. no_of_grams) with
    Laplace (add-one) smoothing.  classify() returns a tuple of
    (label, code, probability) where code is 1 = positive, 0 = negative
    and -1 = neutral.
    '''
    def __init__(self, verbose=True, test_set_count=500, no_of_grams=1):
        self.logger = Logger('NaiveBayes',
                             'logs\\NaiveBayes.log',
                             is_verbose=verbose)
        self.verbose = verbose
        self.counts = dict()
        # training sentences, stored as lists of tokens
        self.positive_bag = []
        self.negative_bag = []

        # NOTE(review): "positve" spelling kept — callers use
        # get_positive_test_bag()/this attribute name already.
        self.positve_test_bag = []
        self.negative_test_bag = []

        self.counts["test set"] = test_set_count
        self.counts["positive phrases"] = 0
        self.counts["negative phrases"] = 0
        self.counts["total sentences"] = 0
        self.counts["positive sentences"] = 0
        self.counts["negative sentences"] = 0

        # largest k-gram size used during training and classification
        self.no_of_grams = no_of_grams

        # phrase -> [occurrences in positive class, occurrences in negative class]
        self.phrase_occurrences = dict()
        # phrase -> [P(phrase|positive), P(phrase|negative)]
        self.phrase_probabilities = dict()

    def ready(self):
        '''
        loads the data, carves out the test set and trains the model
        '''
        self.logger.info("starting Naive Bayers classifier")
        self.load_data()
        self.create_test_set()
        self.fit()
        # self.find_accuracy()
        self.logger.info("Naive Bayers classifier ready.")

    def classify(self, sentence):
        '''
        classifies a given sentence to positive or negative class

        Returns (label, code, probability); code is 1 for positive,
        0 for negative and -1 for neutral.
        '''
        positive_probablity, negative_probablity = self.find_conditional_probability(
            sentence)

        # A probability still exactly 1 means no known phrase contributed
        # evidence for that class; prefer the class that did accumulate some.
        if positive_probablity == 1 and negative_probablity != 1:
            self.logger.info("sentence - " + sentence + " - is negative")
            return ("negative", 0, negative_probablity)

        if positive_probablity != 1 and negative_probablity == 1:
            self.logger.info("sentence - " + sentence + " - is positive")
            return ("positive", 1, positive_probablity)

        if positive_probablity > negative_probablity:
            self.logger.info("sentence - " + sentence + " - is positive")
            return ("positive", 1, positive_probablity)

        if negative_probablity > positive_probablity:
            self.logger.info("sentence - " + sentence + " - is negative")
            return ("negative", 0, negative_probablity)

        # equal probabilities: unable to classify the sentence.  This branch
        # is now unconditional so the method can never fall through to None.
        self.logger.info("sentence - " + sentence + " - is neutral")
        self.logger.info("no sense can be deduced from this sentence")
        return ("neutral", -1, positive_probablity)

    def find_conditional_probability(self, sentence):
        '''
        finds the conditional probablity for a given sentence from phrase_probabilities

        Returns (positive_probability, negative_probability).  Longer
        k-grams are matched first; a matched phrase is blanked out of the
        sentence so shorter grams do not re-count its words.
        '''
        sentence = self.preprocess(sentence)

        sentence_positive_probablity = 1
        sentence_negative_probablity = 1

        # start from the class priors P(positive) and P(negative)
        positive_class_probability = self.counts[
            "positive sentences"] / self.counts["total sentences"]
        negative_class_probability = self.counts[
            "negative sentences"] / self.counts["total sentences"]

        sentence_positive_probablity *= positive_class_probability
        sentence_negative_probablity *= negative_class_probability

        # collect k-grams from the longest (no_of_grams) down to unigrams
        kgrams = list()
        for k in range(self.no_of_grams, 0, -1):
            kgrams.extend(self.get_kgrams(sentence, k))

        for kgram in kgrams:  #this give around 80%
            phrase = ' '.join(kgram)
            sentence = ' '.join(sentence)
            if phrase in sentence and phrase in self.phrase_probabilities:
                phrase_positive_probability, phrase_negative_probability = self.phrase_probabilities[
                    phrase]
                count = sentence.count(phrase)
                self.logger.info(phrase + " " +
                                 str(phrase_positive_probability) + " " +
                                 str(phrase_negative_probability) + " " +
                                 str(count))
                # multiply in P(phrase|class) once per occurrence
                sentence_positive_probablity *= phrase_positive_probability**count
                sentence_negative_probablity *= phrase_negative_probability**count
                # blank the phrase out so shorter grams do not re-count it
                sentence = sentence.replace(phrase, ' ')
            sentence = self.preprocess(sentence)

        return sentence_positive_probablity, sentence_negative_probablity

    def fit(self):
        '''
        trains the model with sentences in positive and negative bags
        '''
        self.logger.info("training started")
        self.logger.info("total sentences : " +
                         str(self.counts["total sentences"]))
        self.logger.info("positive sentences : " +
                         str(self.counts["positive sentences"]))
        self.logger.info("negative sentences : " +
                         str(self.counts["negative sentences"]))

        self.get_occurrences_from_bags()
        self.logger.info("calculated occurrences")
        self.logger.info("unique phrases : " +
                         str(len(self.phrase_occurrences)))
        self.logger.info("phrases in positive class : " +
                         str(self.counts["positive phrases"]))
        self.logger.info("phrases in negative class : " +
                         str(self.counts["negative phrases"]))

        self.get_conditional_probabilities()
        self.logger.info("conditional probality for phrases calculated")
        self.logger.info("training completed")

    def get_conditional_probabilities(self):
        '''
        calculates the conditional probability for phrase|positive class and phrase|negative class

        Uses Laplace (add-one) smoothing: (count + 1) / (class total + vocabulary size).
        '''
        total_unique_phrases = len(self.phrase_occurrences)
        for phrase in self.phrase_occurrences:
            positive_probablity = (self.phrase_occurrences[phrase][0] + 1) / (
                self.counts["positive phrases"] + total_unique_phrases)
            negative_probablity = (self.phrase_occurrences[phrase][1] + 1) / (
                self.counts["negative phrases"] + total_unique_phrases)
            self.phrase_probabilities[phrase] = [
                positive_probablity, negative_probablity
            ]

    def get_occurrences_from_bags(self):
        '''
        calculates the occurrences of the phrases
        '''
        self.get_occurrences_from_positive_bag()
        self.get_occurrences_from_negative_bag()

    def get_occurrences_from_positive_bag(self):
        '''
        calculates the occurrences of unigram, bigram, trigram and quadgram from positive bag
        '''
        self._count_occurrences(self.positive_bag, "positive phrases", 0)

    def get_occurrences_from_negative_bag(self):
        '''
        calculates the occurrences of unigram, bigram, trigram and quadgram from negative bag
        '''
        self._count_occurrences(self.negative_bag, "negative phrases", 1)

    def _count_occurrences(self, bag, counts_key, class_index):
        '''
        tallies every k-gram (k = 1 .. no_of_grams) of every sentence in *bag*
        into phrase_occurrences[phrase][class_index] and counts[counts_key]
        '''
        for sentence in bag:
            kgrams = list()
            for k in range(1, self.no_of_grams + 1):
                kgrams.extend(self.get_kgrams(sentence, k))
            for kgram in kgrams:
                phrase = ' '.join(kgram)
                self.counts[counts_key] += 1
                if phrase not in self.phrase_occurrences:
                    # [occurrence in positive class, occurrence in negative class]
                    self.phrase_occurrences[phrase] = [0, 0]
                self.phrase_occurrences[phrase][class_index] += 1

    def get_kgrams(self, sentence, k=1):
        '''
        return list of kgrams from a given sentence

        A sentence shorter than k yields no k-grams (the previous version
        emitted a single truncated gram in that case).
        '''
        return [sentence[i:i + k] for i in range(len(sentence) - k + 1)]

    def create_test_set(self):
        '''
        randomly selects test sentences from positive and negative bags and making a uniform distribution of test sentences
        '''
        # stdlib random replaces numpy's random_integers, which was
        # deprecated in NumPy 1.11 and removed in NumPy 2.0.
        import random
        count = self.counts["test set"] // 2
        while (count != 0):
            # randrange(n) picks uniformly in [0, n-1], same as the old call
            index = random.randrange(len(self.positive_bag))
            self.positve_test_bag.append(self.positive_bag.pop(index))
            index = random.randrange(len(self.negative_bag))
            self.negative_test_bag.append(self.negative_bag.pop(index))
            count -= 1

        self.logger.info("test sentences selected")
        self.logger.info(
            "Total sentences for testing : " +
            str(len(self.positve_test_bag) + len(self.negative_test_bag)))
        self.logger.info("positive sentences for testing : " +
                         str(len(self.positve_test_bag)))
        self.logger.info("negative sentences for testing : " +
                         str(len(self.negative_test_bag)))

        # refresh the sentence counts now that test sentences were removed
        self.counts["positive sentences"] = len(self.positive_bag)
        self.counts["negative sentences"] = len(self.negative_bag)
        self.counts["total sentences"] = len(self.positive_bag) + len(
            self.negative_bag)

    def load_data(self):
        '''
        loads the positive and negative sentences from filenames specified
        '''
        mixed_bag_paths = [
            'res\\dataset\\uci_dataset\\yelp_labelled.txt',
            'res\\dataset\\uci_dataset\\amazon_cells_labelled.txt',
            'res\\dataset\\uci_dataset\\imdb_labelled.txt'
        ]

        positive_bag_paths = [
            'res\\dataset\\polarity_dataset\\rt-polarity-pos.txt'
        ]
        negative_bag_paths = [
            'res\\dataset\\polarity_dataset\\rt-polarity-neg.txt'
        ]

        # mixed files are "<sentence>\t<label>" with label 1 = positive
        count_positive, count_negative = 0, 0
        for filename in mixed_bag_paths:
            for mixed_data in self.load_data_from_file(filename):
                sentence, label = mixed_data.split('\t')
                label = int(label)
                sentence = self.preprocess(sentence)
                if label == 1:  #if sentence is positive
                    self.positive_bag.append(sentence)
                    count_positive += 1
                else:
                    self.negative_bag.append(sentence)
                    count_negative += 1
        self.logger.debug("sentences from mixed bag imported")
        self.logger.debug("positive sentences : " + str(count_positive))
        self.logger.debug("negative sentences : " + str(count_negative))

        count_positive = 0
        for filename in positive_bag_paths:
            for sentence in self.load_data_from_file(filename):
                sentence = self.preprocess(sentence)
                self.positive_bag.append(sentence)
                count_positive += 1
        self.logger.debug("sentences from positive bag imported")
        self.logger.debug("positive sentences : " + str(count_positive))

        count_negative = 0
        for filename in negative_bag_paths:
            for sentence in self.load_data_from_file(filename):
                sentence = self.preprocess(sentence)
                self.negative_bag.append(sentence)
                count_negative += 1
        self.logger.debug("sentences from negative bag imported")
        self.logger.debug("negative sentences : " + str(count_negative))

        self.counts["positive sentences"] = len(self.positive_bag)
        self.counts["negative sentences"] = len(self.negative_bag)
        self.counts["total sentences"] = len(self.positive_bag) + len(
            self.negative_bag)

        self.logger.info("sentences imported")
        self.logger.info("Total sentences : " +
                         str(self.counts["total sentences"]))
        self.logger.info("positive sentences : " +
                         str(self.counts["positive sentences"]))
        self.logger.info("negative sentences : " +
                         str(self.counts["negative sentences"]))

    def load_data_from_file(self, filename, encoding="utf8"):
        '''
        load the data as a list from the specified filename
        '''
        with open(filename, encoding=encoding) as file:
            return file.readlines()

    def preprocess(self, sentence):
        '''
        preprocess the sentence and return as a list of words
        '''
        sentence = self.tokenise(sentence)
        #sentence = self.remove_stop_words(sentence)
        return sentence

    def tokenise(self, sentence):
        '''
        convert the sentence to list of words
        '''
        sentence = self.clean(sentence)
        return [
            token.strip() for token in sentence.split(' ')
            if len(token.strip()) != 0
        ]

    def clean(self, sentence):
        '''
        clean sentence by removing the ignored characters
        '''
        ignore_characters = '''\t\n&"`~@#$%^*;+=<>//.,()[]{}:;!?'''
        sentence = self.replace_characters(sentence, ignore_characters)
        return sentence.lower().strip()

    def replace_characters(self, text, characters):
        '''
        replaces the specified characters in text with blank space
        '''
        for char in characters:
            text = text.replace(char, ' ')
        return text

    def get_positive_test_bag(self):
        '''returns the held-out positive test sentences'''
        return self.positve_test_bag

    def get_negative_test_bag(self):
        '''returns the held-out negative test sentences'''
        return self.negative_test_bag

    def test_for_fish_guitar(self):
        '''
        sanity-check the model on a tiny hand-made corpus; returns the
        classification of "line guitar jazz jazz"
        '''
        positive_sentences = [
            "fish smoked fish", "fish line", "fish haul smoked"
        ]
        negative_sentences = ["guitar jazz line"]
        self.positive_bag = [
            sentence.split(" ") for sentence in positive_sentences
        ]
        self.negative_bag = [
            sentence.split(" ") for sentence in negative_sentences
        ]
        self.counts["total sentences"] = len(self.positive_bag) + len(
            self.negative_bag)
        self.counts["positive sentences"] = len(self.positive_bag)
        self.counts["negative sentences"] = len(self.negative_bag)

        self.get_occurrences_from_bags()
        self.get_conditional_probabilities()

        test_sentence = "line guitar jazz jazz"
        result = self.classify(sentence=test_sentence)
        self.logger.info(str(result))
        return result

    def find_accuracy(self):
        '''
        evaluates the trained model on both test bags and stores the
        percentage in self.accuracy
        '''
        correct, wrong = 0, 0
        total = len(self.positve_test_bag) + len(self.negative_test_bag)

        _correct, _wrong = self.test_for_bag(self.positve_test_bag,
                                             actual_result=1)
        correct += _correct
        wrong += _wrong

        _correct, _wrong = self.test_for_bag(self.negative_test_bag,
                                             actual_result=0)
        correct += _correct
        wrong += _wrong

        self.accuracy = (correct / total) * 100
        self.logger.info("total test sentences : " + str(total))
        self.logger.info("correct output : " + str(correct))
        self.logger.info("wrong output : " + str(wrong))
        self.logger.info("accuracy (%) : " + str(int(self.accuracy)))

    def test_for_bag(self, bag, actual_result):
        '''
        classifies every sentence in *bag* and returns (correct, wrong)
        against the expected label code *actual_result*
        '''
        # silence per-sentence logging while the whole bag is scored
        self.logger.is_verbose = False
        correct, wrong = 0, 0
        for sentence in bag:
            sentence = ' '.join(sentence)
            result = self.classify(sentence=sentence)
            if result is None:
                self.logger.info("result is none : " + str(sentence))
                wrong += 1
                continue
            if result[1] == actual_result:
                correct += 1
            else:
                wrong += 1
        self.logger.is_verbose = True
        self.logger.debug("total test sentences in bag : " + str(len(bag)))
        self.logger.debug("correct output : " + str(correct))
        self.logger.debug("wrong output : " + str(wrong))
        self.logger.debug("accuracy (%) : " +
                          str(int((correct / len(bag)) * 100)))
        return correct, wrong
示例#6
0
from flask_cors import CORS
from flask import Flask
from flask_restful_swagger_2 import Api

from resources.login.login import Login
from resources.sender.sender import Sender
from resources.register.register import Register
from resources.verify.verify import Verify
from resources.puzzle.puzzle import Puzzle
from resources.balance.balance import Balance
from utilities.logger import Logger

logger = Logger(__name__)

app = Flask(__name__)

CORS(app)
api = Api(app, api_version='0.1')

# Route table: (resource class, URL rule) pairs registered on the API.
_ROUTES = (
    (Login, "/login"),
    (Register, "/register"),
    (Puzzle, "/puzzle"),
    (Verify, "/verify/<string:user>/<int:pin>"),
    (Sender, "/send"),
    (Balance, "/user/balance"),
)
for resource, rule in _ROUTES:
    api.add_resource(resource, rule)

if __name__ == '__main__':
    logger.info('Starting API')
    app.run(host="0.0.0.0", port=5000)